#### Initialization

In [37]:
from openai import OpenAI
import pandas as pd,numpy as np,json,re
from json_repair import repair_json

local_client=OpenAI(base_url='http://localhost:11434/v1/',api_key='12345')

def topic_extraction_msgs(passage):
  
  return [{'role':'system',
          'content':
          """
You are a helpful assistant. Here are the instructions:
1. The user will provide a passage of text after these instructions.
2. Read the text and determine the main research domain or domains
Please limit the response to  domains selected from the following list 
["Deep Learning", "Neural Networks", "Probabilistic methods", "Agents", "Reinforcement Learning",
"Decision Trees", "Clustering", "Visualization", "Kernel methods"]
3. Provide the response in the form of a json object { "main_topic": topic_1, "secondary_topic": topic_2  }.
4. Again, remember to use ONLY topics provided in the list above. If NONE of them fit, you can fill in "N/A"
5. If there is only one matching topic, or if there is really only one main topic, set the "secondary_topic" to "N/A",   
6. ONLY provide the json response, with no additional comments or text. 

          """
          },
         {'role':'user',
          'content':f"""
Here is the passage:
--------------------

{passage}
          """
          }]

def extract_topics(passage,
                  the_model='mistral-nemo:12b',
                  client=local_client,
                  retries=3):

  for count in range(retries):

    try:
      resp=client.chat.completions.create(
        model=the_model,
        messages=topic_extraction_msgs(pf2010.abstract[1])
      )
      
      js=json.loads(repair_json(resp.choices[0].message.content))
      return { 
        'main_topic' : js.get("main_topic","N/A"),
        'secondary_topic' : js.get("secondary_topic","N/A"),
        'model' : the_model,
        'num_retries' : count
      }
    except:
      pass

    # Give up
    return { 'main_topic':'N/A' , 'secondary_topic':'N/A' , 'model':the_model, 'num_retries':-1}
          

# Loading in the data

# Query openalex from the website, set source is Proceedings of the AAAI Conference on Artificial Intelligence, then year is <year>
pf2023=pd.read_csv("./aaai_data/works-2024-09-06T22-53-09_2023.csv")
pf2020=pd.read_csv("./aaai_data/works-2024-09-06T21-59-15_aaai_2020.csv")
pf2010=pd.read_csv("./aaai_data/works-2024-09-06T22-01-39_aaai_2010.csv")

In [41]:
# Sanity check
testf=pf2010.head(2)
testf['topics']=testf['abstract'].map(lambda x:extract_topics(x,the_model='mistral-nemo:12b'))    #'gemma2:9b-instruct-q5_1'))

pd.set_option('display.max_colwidth',None)
testf[['abstract','topics']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testf['topics']=testf['abstract'].map(lambda x:extract_topics(x,the_model='mistral-nemo:12b'))    #'gemma2:9b-instruct-q5_1'))


Unnamed: 0,abstract,topics
0,"We consider here the problem of building a never-ending language learner; that is, an intelligent computer agent that runs forever and that each day must (1) extract, or read, information from the web to populate a growing structured knowledge base, and (2) learn to perform this task better than on the previous day. In particular, we propose an approach and a set of design principles for such an agent, describe a partial implementation of such a system that has already learned to extract a knowledge base containing over 242,000 beliefs with an estimated precision of 74% after running for 67 days, and discuss lessons learned from this preliminary attempt to build a never-ending learning agent.","{'main_topic': 'Reinforcement Learning', 'secondary_topic': 'Neural Networks', 'model': 'mistral-nemo:12b', 'num_retries': 0}"
1,"Policy search is a successful approach to reinforcement learning. However, policy improvements often result in the loss of information. Hence, it has been marred by premature convergence and implausible solutions. As first suggested in the context of covariant policy gradients, many of these problems may be addressed by constraining the information loss. In this paper, we continue this path of reasoning and suggest the Relative Entropy Policy Search (REPS) method. The resulting method differs significantly from previous policy gradient approaches and yields an exact update step. It can be shown to work well on typical reinforcement learning benchmark problems.","{'main_topic': 'Reinforcement Learning', 'secondary_topic': 'Probabilistic methods', 'model': 'mistral-nemo:12b', 'num_retries': 0}"


#### Analysis

In [None]:
topics2023=pf2023.abstract[:60].map(extract_topics)
topics2020=pf2020.abstract[:60].map(extract_topics)
topics2010=pf2010.abstract[:60].map(extract_topics)

In [None]:
print("2010:\n-----")
print(pd.DataFrame(topics2010.tolist()).main_topic.value_counts())

print("\n2020:\n-----")
print(pd.DataFrame(topics2020.tolist()).main_topic.value_counts())