In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import os
import json

# Set up the OpenAI client shared by every request in this notebook.
# The API key is read from the OPENAI_API_KEY environment variable rather
# than being hard-coded; os.environ.get yields None when it is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Read the data
def read_data(filepath):
    """Load an ideas CSV and select the columns used for evaluation.

    A file counts as labelled only when all four score columns
    ('relevance_problem', 'clarity_problem', 'suitability_solution',
    'clarity_solution') are present. Detection is done by column name, not
    by column count, so extra non-label columns (ids, exported index
    columns, ...) no longer cause a KeyError.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    df_full : pandas.DataFrame
        The CSV exactly as loaded, with every column.
    df : pandas.DataFrame
        'problem' and 'solution', followed by the four score columns when
        the file is labelled.
    contains_labels : bool
        True when all four score columns are present.

    Raises
    ------
    KeyError
        If the 'problem' or 'solution' column is missing.
    """
    text_columns = ['problem', 'solution']
    label_columns = ['relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']

    df_full = pd.read_csv(filepath)

    # Look for the label columns themselves instead of inferring their
    # presence from the number of columns in the file.
    contains_labels = all(column in df_full.columns for column in label_columns)
    if contains_labels:
        print('Data with labels generated by GPT loaded')
        df = df_full[text_columns + label_columns]
    else:
        print('Data loaded')
        df = df_full[text_columns]
    return df_full, df, contains_labels

def process_df_into_dict(df):
    """Convert a DataFrame into a list of row dicts keyed by column name."""
    records = df.to_dict(orient='records')
    return records

def get_response(client, system_content, user_content, finetuned=False):
    """Send one system/user message pair to the chat model and parse its JSON reply.

    Parameters
    ----------
    client
        Object exposing ``chat.completions.create`` (the OpenAI client in
        this notebook).
    system_content : str
        Text of the system message.
    user_content : str
        Text of the user message.
    finetuned : bool, default False
        Selects the fine-tuned model id when truthy, otherwise the base
        "gpt-3.5-turbo-1106" model.

    Returns
    -------
    The value produced by ``json.loads`` on the first choice's message content.
    """
    model = 'ft:gpt-3.5-turbo-1106:personal::8e9YXb9p' if finetuned else "gpt-3.5-turbo-1106"
    conversation = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    # JSON response format with temperature 0 and a fixed seed are requested
    # on every call.
    completion = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=conversation,
        temperature=0,
        seed=0,
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

def get_metrics(df_dict, client, system_content, finetuned):
    """Query the model once per record and collect the replies in a DataFrame.

    Parameters
    ----------
    df_dict : list
        Records to evaluate; each is converted with ``str()`` and sent as
        the user message.
    client
        Client object forwarded to ``get_response``.
    system_content : str
        System prompt shared by every request.
    finetuned : bool
        Forwarded to ``get_response`` to choose the model.

    Returns
    -------
    pandas.DataFrame
        Built from the list of parsed replies, one per record, in input order.
    """
    replies = [
        get_response(client, system_content, str(record), finetuned)
        for record in df_dict
    ]
    return pd.DataFrame(replies)

def get_metrics_for_filtering_ideas(df, client, finetuned=True):
    """Score every idea in ``df`` on the four filtering criteria.

    The frame is converted to records with ``process_df_into_dict`` and each
    record is evaluated through ``get_metrics`` using the prompt below, which
    asks for integer marks from 1 to 3 returned as a JSON dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Ideas to evaluate.
    client
        Client object forwarded to ``get_metrics``.
    finetuned : bool, default True
        Forwarded to ``get_metrics`` to choose the model.

    Returns
    -------
    Whatever ``get_metrics`` returns for the records (a DataFrame of replies).
    """
    # The backslash continuations sit inside the string literal, so the
    # prompt is a single line and each continuation line's leading
    # indentation is part of the text sent to the model. The literal is kept
    # exactly as is so the prompt bytes do not change.
    system_content = "You are a venture capital expert evaluating potential circular economy startup pitches. \
    Mark the startup idea (problem and solution) \
    from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) \
    in each of four criteria: \
    relevance of the problem to the circular economy (relevance_problem), \
    clarity of the problem (clarity_problem), \
    suitability of solution to the problem (suitability_solution) and \
    clarity of the solution (clarity_solution). \
    Return the following fields in a JSON dict: \
    'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."
    idea_records = process_df_into_dict(df)
    return get_metrics(idea_records, client, system_content, finetuned)

In [2]:
# Load the training data, then score only the first three ideas with the base
# (non-fine-tuned) model -- presumably a quick smoke run; confirm before
# scaling up to the full frame.
df_full, df, contains_labels = read_data('../data/train_data.csv')
df = df.head(3)
get_metrics_for_filtering_ideas(df, client, finetuned=False)

Data loaded
