#### Initialization

In [106]:
from openai import OpenAI
import pandas as pd,numpy as np,json,re
from json_repair import repair_json

# OpenAI-compatible client pointed at a server listening on localhost:11434.
# NOTE(review): presumably a local Ollama instance, in which case the api_key
# below is only a placeholder that the server ignores — confirm.
local_client = OpenAI(
  base_url='http://localhost:11434/v1/',
  api_key='12345',
)

def topic_extraction_msgs(passage):
  """Build the chat messages asking a model to classify `passage` by topic.

  The system message carries the instructions: choose a main and a secondary
  research domain from a fixed list and answer with a JSON object holding the
  keys "main_topic" and "secondary_topic" ("N/A" when nothing fits). The user
  message embeds `passage` below a short header.

  Returns a list of two dicts, each with the keys 'role' and 'content':
  the system message first, the user message second.
  """
  # NOTE: the prompt literals are deliberately flush-left; their exact text
  # (including trailing whitespace) is what gets sent to the model.
  system_prompt = """
You are a helpful assistant. Here are the instructions:
1. The user will provide a passage of text after these instructions.
2. Read the text and determine the main research domain or domains
Please limit the response to  domains selected from the following list 
["Neural Networks", "Probabilistic methods", "Decision Trees", "Clustering", "Visualization", "Kernel methods"]
3. Provide the response in the form of a json object { "main_topic": topic_1, "secondary_topic": topic_2  }.
4. Again, remember to use ONLY topics provided in the list above. If NONE of them fit, you can fill in "N/A"
5. If there is only one matching topic, or if there is really only one main topic, set the "secondary_topic" to "N/A",   
6. ONLY provide the json response, with no additional comments or text. 

          """
  user_prompt = f"""
Here is the passage:
--------------------

{passage}
          """

  system_msg = {'role': 'system', 'content': system_prompt}
  user_msg = {'role': 'user', 'content': user_prompt}
  return [system_msg, user_msg]

# Load the AAAI paper metadata, one CSV export per conference year.
#
# Provenance: exported from the OpenAlex website by setting the source to
# "Proceedings of the AAAI Conference on Artificial Intelligence" and then
# filtering on the publication year.
pf2020 = pd.read_csv("./aaai_data/works-2024-09-06T21-59-15_aaai_2020.csv")
pf2010 = pd.read_csv("./aaai_data/works-2024-09-06T22-01-39_aaai_2010.csv")

#### Analysis

In [113]:
def extract_topics(passage,
                  the_model='phi3.5:latest',#'mistral-nemo:12b',
                  client=local_client,
                  retries=3):
  """Ask a chat model for the main and secondary topic of `passage`.

  The prompt is built with topic_extraction_msgs(passage) and sent through
  client.chat.completions.create. The reply text is passed through
  repair_json and json.loads, and the "main_topic" / "secondary_topic"
  fields are read from the resulting object. Any failed attempt (request
  error, reply that cannot be parsed, reply that is not a JSON object) is
  retried, up to `retries` attempts in total.

  Parameters
  ----------
  passage : the text to classify; it is interpolated into the user message.
  the_model : model name forwarded to the completions call.
  client : object exposing chat.completions.create (defaults to local_client).
  retries : maximum number of attempts before giving up.

  Returns
  -------
  dict with the keys 'main_topic', 'secondary_topic' and 'model'. A topic
  missing from the reply is reported as "N/A"; if every attempt fails (or
  retries is 0) both topics are "N/A".
  """
  for _attempt in range(retries):
    try:
      resp = client.chat.completions.create(
        model=the_model,
        # Classify the passage that was actually passed in, not a fixed row.
        messages=topic_extraction_msgs(passage)
      )

      js = json.loads(repair_json(resp.choices[0].message.content))
      return {
        'main_topic': js.get("main_topic", "N/A"),
        'secondary_topic': js.get("secondary_topic", "N/A"),
        'model': the_model
      }
    except Exception:
      # Best effort: a failed request or malformed reply just costs one
      # attempt. Exception (rather than a bare except) keeps Ctrl-C /
      # kernel interrupt working during long .map runs.
      continue

  # Give up — only reached once all attempts have failed.
  return {'main_topic': 'N/A', 'secondary_topic': 'N/A', 'model': the_model}
          

In [None]:
# Pass each of the first 30 abstracts per year through extract_topics;
# every element of the resulting Series is the dict that function returns.
topics2020 = pf2020["abstract"].head(30).map(extract_topics)
topics2010 = pf2010["abstract"].head(30).map(extract_topics)

In [None]:
# Tally how often each main topic was assigned to the 2010 abstracts.
print("2010:\n-----")
print(pd.DataFrame(topics2010.tolist())["main_topic"].value_counts())

# Same tally for the 2020 abstracts.
print("\n2020:\n-----")
print(pd.DataFrame(topics2020.tolist())["main_topic"].value_counts())