In [1]:
import arxiv
import datetime
from langchain.prompts import PromptTemplate
from openai import OpenAI
from assistants import generate_response
from tqdm import tqdm
from helpers import extract_scores
import pandas as pd
import os

In [2]:
# Fetch candidate papers from arXiv and keep the ones touched in the last week.
#
# Produces two module-level lists used by later cells:
#   jresults     - every returned paper whose primary category starts with "cs"
#                  (excluding "cs.SE"), flattened to a plain dict.
#   results_list - the subset of jresults (same dict objects) that was updated
#                  OR published within the last seven days.
query = 'AI ML NLP deeplearning reinforcement learning'

# Timezone-aware "now" so it can be compared against arXiv's aware timestamps.
current_date = datetime.datetime.now(datetime.timezone.utc)
seven_days_ago = current_date - datetime.timedelta(days=7)

client = arxiv.Client(num_retries=20, page_size=500)
search = arxiv.Search(query=query,
                      max_results=5000,
                      sort_by=arxiv.SortCriterion.SubmittedDate)
results = client.results(search)

jresults = []
results_list = []

for result in tqdm(results):
    if result is None:
        continue
    category = result.primary_category
    if not category.startswith("cs") or category == "cs.SE":
        continue

    r = {
        "entry_id": result.entry_id,
        # Timestamps are stored as strings so the record stays plain data.
        "updated": str(result.updated),
        "published": str(result.published),
        "title": result.title,
        "summary": result.summary,
        "primary_category": category,
        "categories": result.categories,
        "links": [str(link) for link in result.links],
        "pdf_url": result.pdf_url,
    }
    jresults.append(r)

    # Compare the datetime objects directly instead of re-parsing the strings
    # above with strptime: str(datetime) emits fractional seconds whenever
    # microseconds are non-zero, which the fixed '%Y-%m-%d %H:%M:%S%z' format
    # would reject with ValueError.
    if result.updated >= seven_days_ago or result.published >= seven_days_ago:
        results_list.append(r)

5000it [01:12, 69.43it/s]


In [5]:
# Ask the OpenAI assistant to rate every recent paper.
#
# Each entry of results_list gains a 'raw_score_data' key holding whatever
# generate_response() returned for that paper's title + abstract.
template = """Title: {title}

Summary: {summary}
"""
prompt = PromptTemplate.from_template(template)

# The environment variable name is kept exactly as spelled; changing it here
# would require renaming it in the environment as well.
OPENAI_API_KEY_Assiatant = os.environ.get('OPENAI_API_KEY_Assiatant')

# NOTE: this rebinds `client`, which the fetch cell used for the arXiv client.
client = OpenAI(api_key=OPENAI_API_KEY_Assiatant)

for entry in tqdm(results_list):
    # One API call per paper makes this the slowest cell in the notebook.
    # Skip papers scored by a previous (possibly interrupted) run so that
    # re-executing the cell resumes instead of repeating every request.
    # The key is only set after generate_response() returns, so its presence
    # means that call completed. Re-run the fetch cell to force a full re-score.
    if 'raw_score_data' in entry:
        continue

    title = entry['title']
    summary = entry['summary']

    message_body = prompt.format(title=title, summary=summary)
    rating = generate_response(client, message_body)

    entry['raw_score_data'] = rating


100%|██████████| 968/968 [1:08:27<00:00,  4.24s/it]  


In [6]:
# Parse the raw assistant replies and flatten each paper into a report record.

# Wrap the list (not the enumerate object) in tqdm so the bar knows the total
# and can show percentage/ETA instead of a bare iteration counter.
for idx, data in enumerate(tqdm(results_list)):
    score_data = data['raw_score_data']
    # extract_scores() must return a mapping, since it is merged with `|`.
    # Presumably it carries the score keys read below; verify against helpers.
    extracted_data = extract_scores(score_data)

    # Replace the entry with a new merged dict; the original dict is untouched.
    results_list[idx] = data | extracted_data

# Score columns copied verbatim into the final records, in output order.
score_fields = ('score', 'innovation', 'newness', 'potential', 'clarity', 'relevance')

final_data = []
for data in results_list:
    record = {
        'title': data['title'],
        'summary': data['summary'],
        # First stringified link of the paper; raises IndexError if a paper
        # has no links at all.
        'link': data['links'][0],
    }
    for field in score_fields:
        record[field] = data[field]

    final_data.append(record)


968it [00:00, 94085.84it/s]


In [25]:
# Rank the scored papers, persist the full table as CSV, and write a Markdown
# digest of the ten highest-scoring papers.
df = pd.DataFrame(final_data)
# NOTE(review): assumes 'score' is numeric; if extract_scores yields strings
# this sort is lexicographic - confirm.
df = df.sort_values(by='score', ascending=False)
# Renumber rows 0..n-1 so the loop index below doubles as the rank.
df = df.reset_index(drop=True)

csv_filename = "./data/data_14_02_2024.csv"
markdown_filename = "./data/markdown_text.md"

# Make sure the output directory exists so a fresh checkout does not fail
# only after the expensive scoring has already been done.
for output_path in (csv_filename, markdown_filename):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

df.to_csv(csv_filename, index=False)

# Collect the pieces in a list and join once rather than growing a string.
markdown_parts = []
for index, row in df.head(10).iterrows():
    markdown_parts.append(f"#### {index+1}.{row['title']}. [Link]({row['link']}) \n")
    markdown_parts.append(f"#### GPT Score: {row['score']}\n")
    markdown_parts.append(
        f"Innovation:{row['innovation']}, Newness:{row['newness']}, Potential:{row['potential']}, Clarity:{row['clarity']}, Relevance:{row['relevance']}\n"
    )
    markdown_parts.append("### Summary\n")
    markdown_parts.append(f"{row['summary']}\n\n")
markdown_text = "".join(markdown_parts)

# Saving the Markdown formatted text to a file.
# Explicit UTF-8: titles and abstracts can contain non-ASCII characters, and
# the platform default encoding may not be able to represent them.
with open(markdown_filename, 'w', encoding='utf-8') as file:
    file.write(markdown_text)