In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import os
import json

# Preprocess df

In [2]:
# Preprocess df
df = pd.read_csv('../data/validation_data_labeled.csv')[['problem', 'solution', 'relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']]
df

Unnamed: 0,problem,solution,relevance_problem,clarity_problem,suitability_solution,clarity_solution
0,I want to turn my work into digital,I want everyone to be efficient,1,1,1,1
1,Traditional fast-fashion businesses operate on...,Refashioning the 'Fast Fashion' industry with ...,3,3,3,3
2,The escalating problem of single-use packaging...,The Loop system is a revolutionary circular ec...,3,3,3,3
3,The significant amount of packaging waste gene...,Our proposal is 'Community Hub for Reusable Pa...,3,3,3,3
4,The modern consumer market suffers from proble...,The development of a QR-Code Recycling Informa...,3,3,2,3
5,The accelerating culture of disposability of e...,Drawing on the spiral pattern recurrent in nat...,3,3,3,3
6,The fast fashion industry generates a large pe...,We propose an upgraded version of âWardrobe ...,3,3,3,3
7,Make it more human-centered. Appeal to the emo...,"Simplify your points, and reinforce the motiva...",1,1,1,1
8,"Everyday business operations, particularly pac...",An improved circular economy idea to address t...,3,3,3,3
9,Plastic pollution is a big problem in the worl...,Clay Pots can be use to drink tea and coffee. ...,2,2,2,2


In [3]:
# Save labeled train data as json
json_filepath = '../data/json/validation_data_labeled.json'
df_json = df.to_json(json_filepath, orient='records', indent=4)

# Load json and show as list of dicts
with open(json_filepath) as f:
    df_dict = json.load(f)

df_dict[0]

{'problem': 'I want to turn my work into digital',
 'solution': 'I want everyone to be efficient',
 'relevance_problem': 1,
 'clarity_problem': 1,
 'suitability_solution': 1,
 'clarity_solution': 1}

In [4]:
# Separate key-value pairs into two lists of dicts: 
# prompt (problem, solution) and response (relevance_problem, clarity_problem, suitability_solution, clarity_solution
df_dict_prompt = [{k: v for k, v in d.items() if k in ['problem', 'solution']} for d in df_dict]
df_dict_response = [{k: v for k, v in d.items() if k in ['relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']} for d in df_dict]

In [5]:
df_dict_prompt[0]

{'problem': 'I want to turn my work into digital',
 'solution': 'I want everyone to be efficient'}

In [6]:
df_dict_response[0]

{'relevance_problem': 1,
 'clarity_problem': 1,
 'suitability_solution': 1,
 'clarity_solution': 1}

# Format train data into training examples for fine-tuning

In [7]:
# Define system content
system_content = "You are a venture capital expert evaluating potential circular economy startup pitches. \
Mark the startup idea (problem and solution) \
from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) \
in each of four criteria: \
relevance of the problem to the circular economy (relevance_problem), \
clarity of the problem (clarity_problem), \
suitability of solution to the problem (suitability_solution) and \
clarity of the solution (clarity_solution). \
Return the following fields in a JSON dict: \
'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."

In [8]:
def generate_example(system_content, user_content, assistant_content):
    # Generate prompts  
    system_dict = {"role": "system", "content": system_content} # system prompt
    user_dict = {"role": "user", "content": str(user_content)} # user prompt
    assistant_dict = {"role": "assistant", "content": str(assistant_content)} # assistant prompt

    # Combine to generate training/validation example
    example = {"messages": [system_dict, user_dict, assistant_dict]}

    return example

In [9]:
# Generate training/validation examples
examples = [generate_example(system_content, user_content, assistant_content) for user_content, assistant_content in zip(df_dict_prompt, df_dict_response)]

# Save as json
examples_filepath = '../data/json/validation_examples.jsonl'
with open(examples_filepath, 'w') as f:
    for example in examples:
        json.dump(example, f)
        f.write('\n')

examples[0]

{'messages': [{'role': 'system',
   'content': "You are a venture capital expert evaluating potential circular economy startup pitches. Mark the startup idea (problem and solution) from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) in each of four criteria: relevance of the problem to the circular economy (relevance_problem), clarity of the problem (clarity_problem), suitability of solution to the problem (suitability_solution) and clarity of the solution (clarity_solution). Return the following fields in a JSON dict: 'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."},
  {'role': 'user',
   'content': "{'problem': 'I want to turn my work into digital', 'solution': 'I want everyone to be efficient'}"},
  {'role': 'assistant',
   'content': "{'relevance_problem': 1, 'clarity_problem': 1, 'suitability_solution': 1, 'clarity_solution': 1}"}]}