# Load functions

In [None]:
import openai
import pandas as pd
import numpy as np
import json
import time

In [None]:
# Lower-case venue abbreviations; a paper is kept when its venue text
# contains at least one of them.
publication_filter = [
    "sigmod", "vldb", "icde", "icdt", "edbt", "pods", "kdd",
    "www", "sdm", "pkdd", "icdm", "cikm", "aaai", "icml",
    "ecml", "colt", "uai", "soda", "focs", "stoc", "stacs",
]

# Join the abbreviations into a single regex alternation ("a|b|c").
# NOTE(review): the alternation is matched as a substring, so short tokens
# such as "stoc" may also hit unrelated venue names (e.g. "stochastic") --
# confirm this is acceptable for the selection.
pattern = "|".join(publication_filter)
print(pattern)

sigmod|vldb|icde|icdt|edbt|pods|kdd|www|sdm|pkdd|icdm|cikm|aaai|icml|ecml|colt|uai|soda|focs|stoc|stacs


In [None]:
def filter_publication(df, column_to_filter, pattern):
    """Keep the rows of ``df`` whose text column matches ``pattern``.

    Args:
        df: DataFrame to filter.
        column_to_filter: Name of a string column; its values are lower-cased
            before matching, so ``pattern`` should be written in lower case.
        pattern: Regular expression (e.g. an ``"a|b|c"`` alternation) searched
            for anywhere inside the lower-cased column values.

    Returns:
        The subset of ``df`` whose column value contains a match. Rows with a
        missing value in the column are dropped.
    """
    lowered = df[column_to_filter].str.lower()
    # na=False turns missing values into "no match" so the mask stays purely
    # boolean and can be used for indexing.
    keep_mask = lowered.str.contains(pattern, na=False)
    return df[keep_mask]

In [None]:
def groupby_count_from_json(list_of_data, col_to_count = "expertise"):
    """Count how often each label occurs across several result datasets.

    Args:
        list_of_data: Iterable of datasets; each dataset is an iterable of
            dict records (one per paper).
        col_to_count: Key of the record field holding the labels to count.
            The field is expected to be a list of strings; a single bare
            string (for example "NA") is treated as one label.

    Returns:
        A DataFrame with the columns ``col_to_count`` (lower-cased label) and
        ``"count"``, sorted by count in descending order with a fresh index.
        Records that do not contain ``col_to_count`` (such as error records)
        are ignored.
    """
    from collections import defaultdict

    label_counts = defaultdict(int)

    for data in list_of_data:
        for entry in data:
            labels = entry.get(col_to_count)
            if labels is None:
                # Record carries no labels (e.g. it only holds an error).
                continue
            if isinstance(labels, str):
                # Iterating a string would count its characters one by one.
                labels = [labels]
            for item in labels:
                label_counts[item.lower()] += 1

    # Convert the accumulated counts into a DataFrame
    df_expertise_count = (
        pd.DataFrame(list(label_counts.items()), columns=[col_to_count, "count"])
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

    return df_expertise_count

In [None]:
def eda(pdf, list_col):
    """Show one histogram per requested column of ``pdf``.

    Args:
        pdf: DataFrame holding the columns to plot.
        list_col: Iterable of column names; each gets its own figure.
    """
    import matplotlib.pyplot as plt

    for column_name in list_col:
        column_values = pdf[column_name]
        column_values.plot.hist(bins=120, alpha=0.5)
        plt.title(f"Distribution of {column_name}")
        plt.show()
        # Reset the current figure so the next histogram starts clean.
        plt.clf()

# Set up the OpenAI API client and system message

In [None]:
# Read the OpenAI API key from a local text file. The context manager closes
# the file handle, and strip() removes surrounding whitespace such as the
# trailing newline most editors append, which is not part of the key.
with open("OPENAI_API_DBLP.txt", "r") as key_file:
    API_KEY = key_file.read().strip()

# Shared client used by the completion helper for all requests.
client = openai.OpenAI(api_key=API_KEY)

In [None]:
# v4, expert and label refined
def get_completion(prompt, model="gpt-4o-mini"):
    """Ask the chat model to label one paper and return its raw reply.

    Args:
        prompt: User message describing the paper to label.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the response. The request
        asks for a JSON object, but the content is returned unparsed.
    """
    system_prompt = """You are a helpful and honest assistant for labeling expertise required from authors to write academic research papers or publications,
as well as labeling research topics of academic research papers or publications.
Expertise means the expert knowledge or skills the authors must have to write the paper or publication.
Research topic means the subjects or issues that the paper or publication researches into.
The paper or publication's title, abstract, keywords provided by authors, field of study labeled by others and publication venue are provided and delimited by triple backticks.
Please provide top ten expertise required from authors to write it, as well as top ten research topics.
Both the expertise and research topics you generate should be in order from generic to specific. All output should be in English.
You must be at least 70% confident about the expertise and research topic. Otherwise, generate "NA".
Provide your answer in JSON structure like this {"expertise": <The expertise you generate>, "topic": <The research topics you generate>}
"""

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
        response_format={"type": "json_object"},
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:

def feed_data_into_openai(
    gpt_model,
    sublist_number,
    subset_df,
    save_interval=10,  # Save results after every 10 rows
):
    """Label every paper in ``subset_df`` and append the results to disk.

    For each row a prompt is built from the paper's title, abstract, keywords,
    fields of study and venues, and sent through ``get_completion``. One JSON
    object per paper is appended, one per line, to
    ``data/label/results_{sublist_number}_{gpt_model}.json``.

    Args:
        gpt_model: Model name passed to ``get_completion``; also part of the
            output file name.
        sublist_number: Identifier of the sublist; part of the output file name.
        subset_df: DataFrame with the columns ``id``, ``title``, ``abstract``,
            ``keywords``, ``list_of_field_of_study`` and ``list_of_venue``.
        save_interval: Number of processed rows between two writes to disk.

    Returns:
        None. A failed request does not stop the run: the paper still gets a
        record, holding an ``"error"`` field instead of the labels.
    """
    import os

    output_file = f"data/label/results_{sublist_number}_{gpt_model}"
    # Make sure the target directory exists before any request is sent, so
    # the first save cannot fail after responses were already obtained.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    total_rows = len(subset_df)
    results = []

    for i, (_, row) in enumerate(subset_df.iterrows()):

        # ==== create a prompt for each paper
        prompt = f"""
        Title: ```{row["title"]}```
        Abstract: ```{row["abstract"]}```
        Keywords provided by authors: ```{row["keywords"]}```
        Field of study labeled by others: ```{row["list_of_field_of_study"]}```
        Publication venue: ```{row["list_of_venue"]}```
        """

        # Reset per row: if a JSONDecodeError is raised before the assignment
        # below completes, the error record must not pick up the previous
        # row's response (or hit an unbound name on the first row).
        response = None

        # ==== get response from openai; on any error, record an error
        # message for this paper instead of aborting the whole run
        try:
            response = get_completion(prompt, model = gpt_model)
            result_json_temp = json.loads(response)
            # keep the paper id next to its labels
            result_json = {
                "id": row["id"],
                "expertise": result_json_temp["expertise"],
                "topic": result_json_temp["topic"],
            }

        except json.JSONDecodeError:
            # Handle cases where the result is not valid JSON
            result_json = {"id": row["id"], "error": "Invalid JSON", "response": response}

        except Exception as e:
            # Deliberately broad: any other failure is logged and recorded.
            print(f"Error: {e}")
            result_json = {"id": row["id"], "error": str(e), "prompt": prompt}

        results.append(result_json)

        # ==== save results after every {save_interval} papers and at the end
        if (i + 1) % save_interval == 0 or (i + 1) == total_rows:
            with open(output_file + ".json", 'a') as f:
                # one JSON object per line
                f.writelines(json.dumps(result) + '\n' for result in results)

            # clear the buffer after saving
            results = []

            # delay to handle rate limits or connection stability
            time.sleep(1)


In [None]:


def _load_filtered_subset(paper, sublist_number, verbose=False):
    """Return the venue-filtered rows of ``paper`` selected by one saved index list."""
    with open(f'data/index_list/sublist_{sublist_number}.json', 'r') as f:
        sublist = json.load(f)

    if verbose:
        print(f"Started to work on sublist {sublist_number}")
    subset_df = paper.loc[sublist]
    if verbose:
        print(f"Shape of sub df: {subset_df.shape}")

    subset_df = filter_publication(df = subset_df
                    , column_to_filter = "list_of_venue"
                    , pattern = pattern)
    if verbose:
        print(f"After filtering, shape of sub df: {subset_df.shape}")
    return subset_df


def _load_result_records(sublist_number, gpt_model):
    """Read one JSON-lines result file, reporting and skipping undecodable lines."""
    loaded_results = []
    with open(f"data/label/results_{sublist_number}_{gpt_model}.json", 'r') as f:
        for line in f:
            try:
                loaded_results.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
    return loaded_results


def check_result(paper, gpt_model = "gpt-4o", num_sublists = 50):
    """Find papers without a saved result and send them to OpenAI again.

    For every sublist the number of venue-filtered papers is compared with the
    number of records in the matching result file. For each sublist where the
    two differ, the ids absent from the result file are collected and passed
    to ``feed_data_into_openai``, which appends to that same result file.

    Args:
        paper: DataFrame of all papers; indexed by the labels stored in the
            sublist files and containing ``id`` and ``list_of_venue`` columns.
        gpt_model: Model name; selects the result files that are checked and
            the model used for the re-run.
        num_sublists: Number of sublists; sublists ``0 .. num_sublists - 1``
            are checked.

    Returns:
        None. Progress and findings are printed.
    """
    # ======== expected number of papers per sublist (after venue filtering)
    dict_count_cs_paper = {}
    for sublist_number in range(num_sublists):
        dict_count_cs_paper[sublist_number] = _load_filtered_subset(paper, sublist_number).shape[0]

    # ======== number of saved result records per sublist
    dict_count_cs_paper_res = {}
    for sublist_number in range(num_sublists):
        dict_count_cs_paper_res[sublist_number] = len(_load_result_records(sublist_number, gpt_model))

    # ======== sublists whose counts disagree
    list_sublist_number_has_missing_id = []
    for sublist_number in range(num_sublists):
        l1 = dict_count_cs_paper[sublist_number]
        l2 = dict_count_cs_paper_res[sublist_number]
        if l1 != l2:
            print(sublist_number, l1, l2)
            list_sublist_number_has_missing_id.append(sublist_number)

    print(list_sublist_number_has_missing_id)
    if not list_sublist_number_has_missing_id:
        print("\nLength of results of all subsets match")
        return

    print("\nStarted to work on missing id")
    # ======== ids of each flagged sublist that did not get a result
    dict_count_cs_paper_missed = {}
    for sublist_number in list_sublist_number_has_missing_id:
        subset_df = _load_filtered_subset(paper, sublist_number, verbose=True)
        all_id = subset_df["id"].to_list()

        # A set keeps the membership test below constant-time per id.
        res_id = {record["id"] for record in _load_result_records(sublist_number, gpt_model)}

        missing_id = [c for c in all_id if c not in res_id]
        print(f"\nMissing id of sub df {sublist_number}: {len(missing_id)}\n")
        dict_count_cs_paper_missed[sublist_number] = missing_id

    print(dict_count_cs_paper_missed)
    # ======== feed the missed ids into OpenAI again for each sublist
    for sublist_number, missing_id in dict_count_cs_paper_missed.items():
        print(f"Started to work on sublist {sublist_number}")
        subset_df = paper[paper["id"].isin(missing_id)]
        print(f"Shape of sub df: {subset_df.shape}")

        # Use the same model that was checked above, so the new records land
        # in the result file this function reads.
        feed_data_into_openai(gpt_model = gpt_model
                              , sublist_number = sublist_number
                              , subset_df = subset_df
                              , save_interval = 10  # Save results after every 10 rows
                              )

# Load in data

In [None]:
# Read the paper table, pinning the dtype of every column explicitly.
paper = pd.read_csv(
    "./output/paper.csv",
    dtype=dict(
        id="string",
        year=int,
        lang="string",
        title="string",
        abstract="string",
        keywords="string",
        abstract_length=int,
        list_of_venue=object,
        list_of_field_of_study=object,
    ),
)

# Sanity checks: table size, missing values per column, first rows and the
# number of papers per language.
print(paper.shape)
print(paper.isna().sum())
display(paper.head())
display(paper.groupby("lang").size())

paper.dtypes

# Inspect how the list-like columns were loaded: print the Python type and
# the raw value of the first row for keywords and venues.
print(type(paper["keywords"][0]))
print(type(paper["list_of_venue"][0]))

print(paper["keywords"][0])
print(paper["list_of_venue"][0])

(3216456, 9)
id                             0
year                           0
lang                      285681
title                          0
abstract                       0
keywords                       0
abstract_length                0
list_of_venue                  0
list_of_field_of_study    750463
dtype: int64


Unnamed: 0,id,year,lang,title,abstract,keywords,abstract_length,list_of_venue,list_of_field_of_study
0,53e99785b7602d9701f43f95,2004,en,Bookshelf,Reviewed: Succeeding with Open Source Quality ...,"['415', '460', ' P.K.', ' Volume 4: Anticipati...",152,"['Software, IEEE']",
1,53e9978ab7602d9701f4a038,2002,en,Neurointerfaces,A neurointerface is a nonlinear filtering syst...,"['Humans', 'Automatic control', 'Nonlinear equ...",835,['Neurointerfaces'],
2,53e9978ab7602d9701f4bc56,2007,en,Lisp,This report covers the activities of the 3rd E...,"['Nick Levine', 'relevant organisational aspec...",335,['European Conference on Object-Oriented Progr...,
3,53e9978db7602d9701f50771,2016,en,interactions,"Given a C ∗ -algebra B , a closed * -subalgebr...","['transfer operator', 'generalized corresponde...",1123,['interactions'],
4,53e99792b7602d9701f547ef,2007,en,Synergetics,"Synergetics, a rather new interdisciplinary fi...","['General Principle', ' Functional Structure',...",632,['Physica B+C'],


lang
de          7
en    2930756
es          1
fr          5
it          1
zh          5
dtype: int64

<class 'str'>
<class 'str'>
['415', '460', ' P.K.', ' Volume 4: Anticipating Change Waltzing with Bears: Managing Risk on Software Projects', '05', '10', '010', 'Bollinger', '106', 'Journal paper', 'Bookshelf', ' T.', 'Janert', 'Barker', '210', ' M.', '416', 'Reviewed: Succeeding with Open Source Quality Software Management']
['Software, IEEE']


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the abstract lengths.
paper["abstract_length"].describe()

count    3.216456e+06
mean     9.755675e+02
std      4.212496e+02
min      1.010000e+02
25%      6.780000e+02
50%      9.410000e+02
75%      1.224000e+03
max      4.999000e+03
Name: abstract_length, dtype: float64

# Load the index lists and feed the data into OpenAI

In [None]:
# Total number of index sublists.
num_sublists = 50

# Only sublists 15 and above are processed here (use range(num_sublists) to
# process all of them) -- presumably the earlier ones were handled in a
# previous run; confirm before re-running.
for sublist_number in range(15, num_sublists):
    # Index labels of the papers that belong to this sublist.
    with open(f'data/index_list/sublist_{sublist_number}.json', 'r') as f:
        sublist = json.load(f)

    print(f"Started to work on sublist {sublist_number}")
    subset_df = paper.loc[sublist]
    print(f"Shape of sub df: {subset_df.shape}")

    # Keep only the papers whose venue matches the publication pattern.
    subset_df = filter_publication(
        df=subset_df,
        column_to_filter="list_of_venue",
        pattern=pattern,
    )
    print(f"After filtering, shape of sub df: {subset_df.shape}")

    # Feed data into OpenAI
    feed_data_into_openai(
        gpt_model="gpt-4o",  # "gpt-4o-mini"
        sublist_number=sublist_number,
        subset_df=subset_df,
        save_interval=10,  # Save results after every 10 rows
    )

Started to work on sublist 15
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1314, 9)
Started to work on sublist 16
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1241, 9)
Started to work on sublist 17
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1298, 9)
Started to work on sublist 18
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1249, 9)
Started to work on sublist 19
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1316, 9)
Started to work on sublist 20
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1296, 9)
Started to work on sublist 21
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1250, 9)
Started to work on sublist 22
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1315, 9)
Started to work on sublist 23
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1295, 9)
Started to work on sublist 24
Shape of sub df: (64329, 9)
After filtering, shape o

# Check results until there are no missing results

In [None]:
# First run: compare the expected and saved result counts per sublist and
# re-submit the papers whose ids are missing from the result files.
check_result(paper = paper, gpt_model = "gpt-4o")

49 1319 560
[49]

Started to work on missing id
Started to work on sublist 49
Shape of sub df: (64329, 9)
After filtering, shape of sub df: (1319, 9)

Missing id of sub df 49: 759

{49: ['53e9b1f8b7602d9703c8ab0d', '53e99a2ab7602d9702283e05', '53e9a263b7602d9702b67cdf', '53e9a64ab7602d9702f77205', '53e9b7eeb7602d970439fe62', '53e9999eb7602d97021e27dc', '53e99f4fb7602d97028208cf', '53e9b7cdb7602d970437813c', '61b022c05244ab9dcb5c55e5', '5736972a6e3b12023e61b6cb', '53e9bbe5b7602d970483ae35', '53e9b239b7602d9703cd6f27', '53e9a88cb7602d97031d7f12', '5550413b45ce0a409eb3980a', '5736980d6e3b12023e6e45f8', '599c7ef5601a182cd28dc744', '5b67b47917c44aac1c8638aa', '53e9a3dbb7602d9702cefb76', '53e9bb80b7602d97047c838e', '53e9b621b7602d970417895d', '5b67b47917c44aac1c86393a', '619b70601c45e57ce9363a48', '53e9a0dfb7602d97029c92f0', '5c20b20fda5629702063b1de', '53e9a806b7602d9703148c5e', '53e9a81fb7602d9703165307', '5e5e192493d709897ce54e3e', '57d063f9ac44367354297a5b', '53e9ae84b7602d97038a2eeb', '

In [None]:
# Second run: verify that no sublist has missing ids any more
# (an empty list is printed when all counts match).
check_result(paper = paper, gpt_model = "gpt-4o")

[]

Length of results of all subsets match


# Count how many papers we fed into OpenAI altogether

In [None]:
dict_count_cs_paper_res = {}
num_sublists = 50
gpt_model = "gpt-4o"
cnt = 0  # running total of result records over all sublists

for sublist_number in range(num_sublists):
    loaded_results = []
    # Each result file holds one JSON object per line.
    with open(f"data/label/results_{sublist_number}_{gpt_model}.json", 'r') as f:
        for line in f:
            try:
                result = json.loads(line)
            except json.JSONDecodeError as e:
                # Report the broken line and leave it out of the count.
                print(f"Error decoding JSON: {e}")
            else:
                loaded_results.append(result)

    cnt += len(loaded_results)

    # sublist number, records in this sublist, cumulative total
    print(sublist_number, len(loaded_results), cnt)

print(f"\nTotal finished: {cnt}")

0 1279 1279
1 1248 2527
2 1287 3814
3 1294 5108
4 1323 6431
5 1256 7687
6 1317 9004
7 1281 10285
8 1289 11574
9 1251 12825
10 1275 14100
11 1236 15336
12 1278 16614
13 1332 17946
14 1269 19215
15 1314 20529
16 1241 21770
17 1298 23068
18 1249 24317
19 1316 25633
20 1296 26929
21 1250 28179
22 1315 29494
23 1295 30789
24 1252 32041
25 1237 33278
26 1283 34561
27 1308 35869
28 1312 37181
29 1301 38482
30 1303 39785
31 1256 41041
32 1289 42330
33 1340 43670
34 1300 44970
35 1301 46271
36 1317 47588
37 1328 48916
38 1309 50225
39 1262 51487
40 1254 52741
41 1302 54043
42 1234 55277
43 1250 56527
44 1244 57771
45 1319 59090
46 1208 60298
47 1226 61524
48 1221 62745
49 1319 64064

Total finished: 64064
