In [120]:
import random
from collections import Counter
from langchain_openai import ChatOpenAI
import pandas as pd 
from fastbm25 import fastbm25

from getpass import getpass
from tqdm.notebook import tqdm

import json 

from api.db import connect_to_database, close_connection
from api.parsing import parse_text_from_url, parse_text_from_url_v2

In [119]:
import importlib
importlib.reload(api.parsing)

<module 'api.parsing' from '/Users/nikitamatveev/1Progs/learn-anything.xyz/api/python/topic_suggestion/app/api/parsing.py'>

In [2]:
def get_top_topics(client, k):
  # extract topic names from all links 
  link_topic_ids = client.query("SELECT GlobalLink {url, mainTopic: {id, prettyName}} filter exists .mainTopic")
  link_topic_ids = [t.mainTopic.prettyName for t in link_topic_ids]
  # get top k topics by frequency
  top_topics = Counter(link_topic_ids).most_common(k)
  top_topics = [t[0] for t in top_topics]
  return top_topics

def get_random_links(client, n, top_topics):
  # extract all links
  link_topic_ids = client.query("SELECT GlobalLink {url, mainTopic: {id, prettyName}} filter exists .mainTopic")
  # leave only links from top topics
  link_topic_ids = [_ for _ in link_topic_ids if _.mainTopic.prettyName in top_topics]
  # return sample of size n from given links
  return random.sample(link_topic_ids, n)

### Collecting top topics and sample for evaluating topic-suggestion

In [3]:
client = connect_to_database()

In [7]:
top_topics = get_top_topics(client, 20)
evaluation_sample = get_random_links(client, 500, top_topics)

In [22]:
evaluation_df = pd.DataFrame([(_.url, _.mainTopic.prettyName) for _ in evaluation_sample], columns = ['url', 'target'])

### Generating topic descriptions

In [5]:
instruction_prompt = """
I want you to act as a description generator. I will give you a topic name and your goal is to write short topic description (no longer then 100 words).
Description must be understandable. Also you have to use specific words for this topic.

Your output is description and nothing more.
Use keywords related to topic.
Description must be no longer then 100 words.
""".strip().replace('\n', ' ')

token = getpass()
model = ChatOpenAI(model="gpt-3.5-turbo-0125", 
                   temperature=0, 
                   api_key=token)

In [None]:
topic_descriptions = {}
for topic_name in tqdm(top_topics):
  messages = [
    ("system", instruction_prompt.strip()),
    ("human", f"Topic_name: {topic_name}"),
  ]
  description = model.invoke(messages).content
  topic_descriptions[topic_name] = description
  
  print(topic_name, 'processed')

In [9]:
with open('./topic_descriptions_test.json', 'w') as json_file:
    json.dump(topic_descriptions, json_file)

### Evaluating topic-suggestion

In [40]:
labels = [key for key in topic_descriptions.keys()]
corpus = [val for val in topic_descriptions.values()]

tokenized_corpus = [doc.lower().split(" ") for doc in corpus]
model = fastbm25(tokenized_corpus)

def predict(x):
  bm_pred = model.top_k_sentence(parse_text_from_url(x), 1)
  if bm_pred:
    return labels[bm_pred[0][1]]
  return ''


In [123]:
def predict(x):
  bm_pred = model.top_k_sentence(parse_text_from_url(x), 1)
  if bm_pred:
    return labels[bm_pred[0][1]]
  return ''

def predict2(x):
  bm_pred = model.top_k_sentence(parse_text_from_url_v2(x), 1)
  if bm_pred:
    return labels[bm_pred[0][1]]
  return ''

In [124]:
evaluation_df_sample = evaluation_df.head(100)

In [127]:
evaluation_df_sample['pred'] = evaluation_df_sample.url.apply(lambda x: predict(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df_sample['pred'] = evaluation_df_sample.url.apply(lambda x: predict(x))


In [129]:
evaluation_df_sample['pred2'] = evaluation_df_sample.url.apply(lambda x: predict2(x))

Error occurred while parsing URL: 400 Client Error: Bad Request for url: https://twitter.com/thomasg_eth/status/1492663192404779013
Error occurred while parsing URL: 404 Client Error: Not Found for url: https://media.defense.gov/2021/Aug/03/2002820425/-1/-1/1/CTR_KUBERNETES%20HARDENING%20GUIDANCE.PDF
Error occurred while parsing URL: 403 Client Error: Forbidden for url: http://superfastpython.com/python-asyncio


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_df_sample['pred2'] = evaluation_df_sample.url.apply(lambda x: predict2(x))


In [136]:
print(f"""
Top1 suggested topic is correct in {sum(evaluation_df_sample['target'] == evaluation_df_sample['pred'])} from 100 cases for baseline solution.
Top1 suggested topic is correct in {sum(evaluation_df_sample['target'] == evaluation_df_sample['pred2'])} from 100 cases for solution that uses parsed text from link.
""")


Top1 suggested topic is correct in 19 from 100 cases for baseline solution.
Top1 suggested topic is correct in 39 from 100 cases for solution that uses parsed text from link.



todo:
- improve accuracy
- calculate accuracy via top3
- create prediction on case when bm25 outputs nothing