In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import os
import json

# Preprocess df

In [2]:
# Preprocess df
df = pd.read_csv('../data/train_data_labeled.csv')[['problem', 'solution', 'relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']]
df = df.iloc[:100, :]
df

Unnamed: 0,problem,solution,relevance_problem,clarity_problem,suitability_solution,clarity_solution
0,The production and disposal of electronic devi...,A Circular Economy solution to this problem in...,3.0,3.0,3.0,3.0
1,Single-use plastic packaging is one of the lar...,My solution is to replace single-use plastic p...,3.0,3.0,3.0,3.0
2,Single-Use Plastic Waste â Single-use plasti...,Implement Up-Cycling Plastic Programs - Rather...,3.0,3.0,3.0,3.0
3,The problem I am addressing is the massive amo...,"My solution is to promote a """"Device as a Serv...",3.0,3.0,3.0,3.0
4,The vast amount of waste continuously produced...,"The enhanced solution presents an integrated, ...",3.0,2.0,3.0,3.0
...,...,...,...,...,...,...
95,Single-use plastics are causing significant ha...,I propose an upgrade to the rental/lease model...,3.0,3.0,2.0,3.0
96,The high energy consumption and environmental ...,The introduction of an Unmanned Aerostat with ...,2.0,3.0,2.0,3.0
97,The amount of food waste produced by restauran...,A real-time digital platform that allows netwo...,3.0,3.0,3.0,3.0
98,The fashion industry contributes massively to ...,"The """"Sustainable Fashion Blockchain Model"""" c...",3.0,2.0,3.0,3.0


In [3]:
# Save labeled train data as json
json_filepath = '../data/json/train_data_labeled.json'
df_json = df.to_json(json_filepath, orient='records', indent=4)

# Load json and show as list of dicts
with open(json_filepath) as f:
    df_dict = json.load(f)

df_dict[0]

{'problem': 'The production and disposal of electronic devices or e-waste pose severe environmental consequences due to the release of harmful substances and the exhaustion of natural resources. Furthermore, the current model of electronic device consumption is linear â\x80\x93 where the consumers buy, use, and discard - creating a massive burden of e-waste that is harmful to the environment and also represents a missed business opportunity.  ',
 'solution': "A Circular Economy solution to this problem involves developing a cloud-based service model for electronic devices, similar to the concept of cloud computing. Companies can provide the processing power, storage, software, and other functionalities of electronic devices as a cloud-based service. Users would lease these services on a subscription basis for a limited period, which eliminates the need for physical ownership of devices.  The infrastructure hosting these services would be built with energy-efficient and sustainable tech

In [4]:
# Separate key-value pairs into two lists of dicts: 
# prompt (problem, solution) and response (relevance_problem, clarity_problem, suitability_solution, clarity_solution
df_dict_prompt = [{k: v for k, v in d.items() if k in ['problem', 'solution']} for d in df_dict]
df_dict_response = [{k: v for k, v in d.items() if k in ['relevance_problem', 'clarity_problem', 'suitability_solution', 'clarity_solution']} for d in df_dict]

In [5]:
df_dict_prompt[0]

{'problem': 'The production and disposal of electronic devices or e-waste pose severe environmental consequences due to the release of harmful substances and the exhaustion of natural resources. Furthermore, the current model of electronic device consumption is linear â\x80\x93 where the consumers buy, use, and discard - creating a massive burden of e-waste that is harmful to the environment and also represents a missed business opportunity.  ',
 'solution': "A Circular Economy solution to this problem involves developing a cloud-based service model for electronic devices, similar to the concept of cloud computing. Companies can provide the processing power, storage, software, and other functionalities of electronic devices as a cloud-based service. Users would lease these services on a subscription basis for a limited period, which eliminates the need for physical ownership of devices.  The infrastructure hosting these services would be built with energy-efficient and sustainable tech

In [6]:
df_dict_response[0]

{'relevance_problem': 3.0,
 'clarity_problem': 3.0,
 'suitability_solution': 3.0,
 'clarity_solution': 3.0}

# Format train data into training examples for fine-tuning

In [7]:
# Define system content
system_content = "You are a venture capital expert evaluating potential circular economy startup pitches. \
Mark the startup idea (problem and solution) \
from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) \
in each of four criteria: \
relevance of the problem to the circular economy (relevance_problem), \
clarity of the problem (clarity_problem), \
suitability of solution to the problem (suitability_solution) and \
clarity of the solution (clarity_solution). \
Return the following fields in a JSON dict: \
'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."

In [8]:
def generate_example(system_content, user_content, assistant_content):
    # Generate prompts  
    system_dict = {"role": "system", "content": system_content} # system prompt
    user_dict = {"role": "user", "content": str(user_content)} # user prompt
    assistant_dict = {"role": "assistant", "content": str(assistant_content)} # assistant prompt

    # Combine to generate training/validation example
    example = {"messages": [system_dict, user_dict, assistant_dict]}

    return example

In [9]:
# Generate training/validation examples
examples = [generate_example(system_content, user_content, assistant_content) for user_content, assistant_content in zip(df_dict_prompt, df_dict_response)]

# Save as json
examples_filepath = '../data/json/train_examples.jsonl'
with open(examples_filepath, 'w') as f:
    for example in examples:
        json.dump(example, f)
        f.write('\n')

examples[0]

{'messages': [{'role': 'system',
   'content': "You are a venture capital expert evaluating potential circular economy startup pitches. Mark the startup idea (problem and solution) from 1 to 3 in integer numbers (where 1 is bad, 2 is okay, and 3 is good) in each of four criteria: relevance of the problem to the circular economy (relevance_problem), clarity of the problem (clarity_problem), suitability of solution to the problem (suitability_solution) and clarity of the solution (clarity_solution). Return the following fields in a JSON dict: 'relevance_problem', 'clarity_problem', 'suitability_solution' and 'clarity_solution'."},
  {'role': 'user',
   'content': '{\'problem\': \'The production and disposal of electronic devices or e-waste pose severe environmental consequences due to the release of harmful substances and the exhaustion of natural resources. Furthermore, the current model of electronic device consumption is linear â\\x80\\x93 where the consumers buy, use, and discard - c