Extract responses from OpenAI and merge OpenAI data with Google Scholar data
---


In [None]:
import pandas as pd
import os
import getpass
import openai
import time
import datetime

In [None]:
# Prompt for the OpenAI API key (the typed input is not echoed) and store it
# in an environment variable of this process so later cells can read it.
os.environ["openai.api_key"] = getpass.getpass()

In [None]:
# Hand the key stored in the environment variable to the openai library.
openai.api_key = os.environ["openai.api_key"]

In [None]:
# API parameters.
# Every (model, temperature) combination listed here is queried once per author.
models = ["text-davinci-003"]
temperatures = [0]

# Upper bound on the number of completion tokens requested per API call.
max_tokens = 3963

Use the output from the GoogleScholarExtractData.ipynb Notebook as input for this notebook.

In [None]:
# Read the Google Scholar author data. The query loop below reads the columns
# 'Name', 'Affiliation', 'Co-Author Count' and 'Scholar ID' from this frame.
df_scolar_authors = pd.read_csv("AuthorsData.csv")

In [None]:
# Build a timestamped output file name so every run writes to its own CSV.
current_datetime = datetime.datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"output_{formatted_datetime}.csv"

# NOTE(review): `prompt_generated` is not defined in any visible cell. It is
# used below as a str.format template taking three positional arguments
# (name, affiliation, co-author count) and must exist before this cell runs.

# One record (dict) per API call, successful or failed.
responses = []
for model in models:
    for temperature in temperatures:
        # Ask the model for the co-authors of every author in the Scholar data.
        for index, row in df_scolar_authors.iterrows():
            name = row['Name']
            affiliation = row['Affiliation']
            num_co_auth = row['Co-Author Count']
            scholar_id = row['Scholar ID']

            # Skip authors without co-authors. Written as `not ... > 0` so a
            # missing (NaN) count is skipped too, exactly as `if count > 0` did.
            if not num_co_auth > 0:
                continue

            prompt = prompt_generated.format(name, affiliation, num_co_auth)

            # Fields shared by the success and the error record.
            record = {
                'name': name,
                'affiliation': affiliation,
                'num_co_auth': num_co_auth,
                'scholar_id': scholar_id,
                'prompt': prompt,
                'model': model,
                'temperature': temperature,
            }

            try:
                start_time = time.time()
                response = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                )
                end_time = time.time()
                response_time = end_time - start_time
            except openai.error.OpenAIError as e:
                # The call failed, so there is no response object for this
                # author. Store None instead of `response`, which would be
                # unbound on the first iteration or left over from a previous
                # author.
                record.update({
                    'response': "",
                    'api_response': None,
                    'response_time': 0,
                    'error': True,
                    'error_msg': str(e),
                })
            else:
                record.update({
                    'response': response.choices[0].text,
                    'api_response': response,
                    'response_time': response_time,
                    'error': False,
                    'error_msg': "",
                })

            responses.append(record)
            # Rewrite the CSV after every call so results gathered so far
            # survive an interrupted run.
            df = pd.DataFrame(responses)
            df.to_csv(file_name, index=False)




In [None]:
# Read back the CSV of API responses written by the query loop.
df_api_responses = pd.read_csv(file_name)

In [None]:
# Collect the rows whose response text starts with "\n\nUnfortunately," (the
# API call did not error, but the model gave no co-author list), then drop
# every row that shares a scholar_id with one of those rows.
df_api_res_unfortunately = df_api_responses[
    df_api_responses['response'].astype(str).str.startswith('\n\nUnfortunately,')
]
df_api_responses = df_api_responses[
    ~df_api_responses['scholar_id'].isin(df_api_res_unfortunately['scholar_id'])
]


In [None]:
# Keep only the rows whose 'error' flag is False, i.e. discard every API call
# that ended in an error.
no_error_mask = df_api_responses['error'].eq(False)
df_api_responses = df_api_responses[no_error_mask]

In [None]:
# Save the filtered OpenAI API responses to an Excel file.
df_api_responses.to_excel("ResultGPT.xlsx")