In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import os
import json

# Preprocess df

In [2]:
# Preprocess df
df = pd.read_csv('../data/test_data_labeled.csv')[['problem', 'solution', 'relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']]
df

Unnamed: 0,problem,solution,relevance_problem,clarity_problem,suitability_solution,clarity_solution
0,Plastic waste pollution,recycling single-use plastic waste and convert...,3,1,3,1
1,The problem that the solution of composting wa...,I worked on a project that collected waste veg...,3,3,3,3
2,The solution can solve rising problems of poll...,A circular economy takes a step forward from t...,1,1,1,2
3,"Project Shrimati, an initiative by students of...",Our objective with Project Shrimati is to tack...,3,3,3,3
4,The fashion industry is notably one of the mos...,I propose a 'Fashion Rental and Resell' model ...,3,3,3,3
5,The high energy consumption and environmental ...,"My proposal is to implement a """"Closed-Loop Ma...",3,2,3,2
6,The production and disposal of single-use prod...,"A rent, return, and refill model for product p...",3,2,3,3
7,"Green house emissions, depleting resources",Food waste is a major contributor to greenhous...,2,1,3,2
8,"Every year, the fashion industry produces bill...",We propose a solution in the form of a decen...,3,3,3,3
9,The global textile industry is a significant c...,My solution is a double-pronged approach of '...,3,3,3,3


In [7]:
# Save labeled train data as json
# json_filepath = '../data/json/test_data_labeled.json'
df_dict = df.to_dict(orient='records')
df_dict
# df_dict = json.load(df_json)

df_dict[0]

{'problem': 'Plastic waste pollution',
 'solution': 'recycling single-use plastic waste and converting it into interlocking tiles.',
 'relevance_problem': 3,
 'clarity_problem': 1,
 'suitability_solution': 3,
 'clarity_solution': 1}

In [14]:
# Separate key-value pairs into two lists of dicts: 
# prompt (problem, solution) and response (relevance_problem, clarity_problem, suitability_solution, clarity_solution
df_dict_prompt = [{k: v for k, v in d.items() if k in ['problem', 'solution']} for d in df_dict]
df_dict_response = [{k: v for k, v in d.items() if k in ['relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']} for d in df_dict]

In [15]:
df_dict_prompt[0]

{'problem': 'Plastic waste pollution',
 'solution': 'recycling single-use plastic waste and converting it into interlocking tiles.'}

In [16]:
df_dict_response[0]

{'relevance_problem': 3,
 'clarity_problem': 1,
 'suitability_solution': 3,
 'clarity_solution': 1}

# Format train data into training examples for fine-tuning

In [17]:
# Define system content
system_content = "You are a venture capital expert evaluating potential circular economy startup pitches. \
Mark the startup idea (problem and solution) \
from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) \
in each of four criteria: \
relevance of the problem to the circular economy (relevance_problem), \
clarity of the problem (clarity_problem), \
suitability of solution to the problem (suitability_solution) and \
clarity of the solution (clarity_solution). \
Return the following fields in a JSON dict: \
'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."

In [18]:
def generate_example(system_content, user_content, assistant_content):
    # Generate prompts  
    system_dict = {"role": "system", "content": system_content} # system prompt
    user_dict = {"role": "user", "content": str(user_content)} # user prompt
    assistant_dict = {"role": "assistant", "content": str(assistant_content)} # assistant prompt

    # Combine to generate training/validation example
    example = {"messages": [system_dict, user_dict, assistant_dict]}

    return example

In [19]:
# Generate training/validation examples
examples = [generate_example(system_content, user_content, assistant_content) for user_content, assistant_content in zip(df_dict_prompt, df_dict_response)]

# Save as json
examples_filepath = '../data/json/test_examples.jsonl'
with open(examples_filepath, 'w') as f:
    for example in examples:
        json.dump(example, f)
        f.write('\n')

examples[0]

{'messages': [{'role': 'system',
   'content': "You are a venture capital expert evaluating potential circular economy startup pitches. Mark the startup idea (problem and solution) from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) in each of four criteria: relevance of the problem to the circular economy (relevance_problem), clarity of the problem (clarity_problem), suitability of solution to the problem (suitability_solution) and clarity of the solution (clarity_solution). Return the following fields in a JSON dict: 'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."},
  {'role': 'user',
   'content': "{'problem': 'Plastic waste pollution', 'solution': 'recycling single-use plastic waste and converting it into interlocking tiles.'}"},
  {'role': 'assistant',
   'content': "{'relevance_problem': 3, 'clarity_problem': 1, 'suitability_solution': 3, 'clarity_solution': 1}"}]}