In [2]:
import os
import json
import pandas as pd


In [3]:
# Load the scraped answers; the export uses ';' as its field separator
answers_csv = 'reddit_answers_big.csv'

df_answers = pd.read_csv(filepath_or_buffer=answers_csv, sep=';')
df_answers.head(n=3)

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0


In [5]:
# Grabbing top answers
# For each question id, idxmax yields the row label of the answer with the most votes.
# Answers without a vote count are excluded first, so a question whose answers all
# lack votes cannot contribute a missing label to the .loc lookup below.
df_top_votes = (
    df_answers.dropna(subset=['votes'])
    .groupby('q_id')['votes']
    .idxmax()
)

# A single non-mutating rename replaces three in-place renames on the selected rows;
# the new names are the ones the later join and training-data cells rely on.
df_top_answers = df_answers.loc[df_top_votes].rename(
    columns={'text': 'answer', 'q_id': 'id', 'votes': 'answer_votes'}
)

df_top_answers.head(3)

Unnamed: 0.1,Unnamed: 0,id,answer,answer_votes
1817014,1875645,1001ag,Tell him to go to a hospital. I can't stress t...,30.0
1591462,1643710,10029x,NOTE: Detail may not sum to totals because of ...,3.0
96052,99426,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0


In [6]:
# Reading in the questions (semicolon-separated) and giving the generic
# 'text' / 'votes' columns question-specific names in one non-mutating step.
# Only the last expression of a cell is displayed, so the preview is done once,
# after the rename.
df_questions = pd.read_csv('reddit_questions.csv', sep=';').rename(
    columns={'text': 'question', 'votes': 'question_votes'}
)

df_questions.head(3)

Unnamed: 0,id,question,question_votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC


In [8]:
# Pair every question with its top answer; only ids present in both frames survive
merged_df = df_questions.merge(right=df_top_answers, how='inner', on='id')
merged_df.head(n=3)

Unnamed: 0.1,id,question,question_votes,timestamp,datetime,Unnamed: 0,answer,answer_votes
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC,1254710,Breed and die.,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC,4217572,The secret to quitting smoking is to tell your...,4.0
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC,5464942,No. My last name sounds badass.,4.0


In [9]:
# Clean this data up a bit
# Selecting the wanted columns (in display order) both discards the leftover
# columns and reorders the rest. Unlike an in-place drop, this cell can be re-run
# safely, and a missing column raises KeyError instead of being silently
# recreated as an all-NaN column.
kept_columns = ['id', 'question', 'answer', 'question_votes', 'answer_votes']
merged_df = merged_df[kept_columns]
merged_df.head(3)

Unnamed: 0,id,question,answer,question_votes,answer_votes
0,izucgz,What's the purpose of life?,Breed and die.,8,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",The secret to quitting smoking is to tell your...,11,4.0
2,iylxwl,"For those who have a slave master last name, w...",No. My last name sounds badass.,0,4.0


In [10]:
# Summarize the merged frame (dtypes and non-null counts per column) to spot missing values
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181311 entries, 0 to 181310
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              181311 non-null  object 
 1   question        181311 non-null  object 
 2   answer          181296 non-null  object 
 3   question_votes  181311 non-null  int64  
 4   answer_votes    181311 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 6.9+ MB


In [11]:
# Number of question/answer pairs to keep for fine-tuning
top_n = 1000

# Rank all pairs by how many votes the answer received, highest first
merged_df = merged_df.sort_values(by='answer_votes', ascending=False)

# Rows with a missing question or answer cannot become a chat example, so they are
# removed before taking the top slice (the info() summary shows some null answers).
merged_df_1k = merged_df.dropna(subset=['question', 'answer'])[:top_n]

In [12]:
# Pull out the two text columns that become the user / assistant turns
questions = merged_df_1k['question']
answers = merged_df_1k['answer']

In [15]:
# Format the training data
def _as_chat_example(question_text, answer_text):
    """Wrap one question/answer pair as a three-turn chat example (system, user, assistant)."""
    system_turn = {'role': 'system', 'content': 'Marv is a factual chatbot and reddit expert who likes to answer with bullets.'}
    user_turn = {'role': 'user', 'content': question_text}
    assistant_turn = {'role': 'assistant', 'content': answer_text}
    return {'messages': [system_turn, user_turn, assistant_turn]}


# map() walks both columns in lockstep and stops at the shorter one
qa_openai_format = list(map(_as_chat_example, questions, answers))

qa_openai_format[:2]

[{'messages': [{'role': 'system',
    'content': 'Marv is a factual chatbot and reddit expert who likes to answer with bullets.'},
   {'role': 'user', 'content': 'What is something that has aged well?'},
   {'role': 'assistant', 'content': 'The word cool'}]},
 {'messages': [{'role': 'system',
    'content': 'Marv is a factual chatbot and reddit expert who likes to answer with bullets.'},
   {'role': 'user',
    'content': "What's the most amazing thing about the universe?"},
   {'role': 'assistant',
    'content': "It must be true that either  It didn't exist, then it did  or  It has always existed"}]}]

In [16]:
# Write the training data to a JSONL file: one JSON object per line.
# The encoding is explicit so the file does not depend on the platform default
# and matches the UTF-8 encoding used when the file is read back for validation.
with open('training_data.jsonl', 'w', encoding='utf-8') as f:
    for entry in qa_openai_format:
        f.write(json.dumps(entry) + '\n')
                

In [17]:
# Search for errors
from collections import defaultdict

_KNOWN_MESSAGE_KEYS = ("role", "content", "name", "function_call", "weight")
_KNOWN_ROLES = ("system", "user", "assistant", "function")


def _tally_format_errors(examples):
    """Count structural problems, per category, across chat-format training examples.

    Returns a defaultdict mapping an error label to the number of times it occurred;
    the mapping is empty when every example passes all checks.
    """
    tally = defaultdict(int)

    for example in examples:
        # Each JSONL record must be an object
        if not isinstance(example, dict):
            tally["data_type"] += 1
            continue

        # ... holding a non-empty "messages" list
        chat = example.get("messages", None)
        if not chat:
            tally["missing_messages_list"] += 1
            continue

        for turn in chat:
            if not ("role" in turn and "content" in turn):
                tally["message_missing_key"] += 1

            if not all(key in _KNOWN_MESSAGE_KEYS for key in turn):
                tally["message_unrecognized_key"] += 1

            if turn.get("role", None) not in _KNOWN_ROLES:
                tally["unrecognized_role"] += 1

            text = turn.get("content", None)
            call = turn.get("function_call", None)

            # A turn needs some payload, and its content has to be a string
            has_payload = bool(text) or bool(call)
            if not has_payload or not isinstance(text, str):
                tally["missing_content"] += 1

        # Every example needs at least one assistant turn
        if all(turn.get("role", None) != "assistant" for turn in chat):
            tally["example_missing_assistant_message"] += 1

    return tally


data_path = "training_data.jsonl"

# Parse the file back, one JSON document per line
dataset = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        dataset.append(json.loads(line))

print(f"Number of examples: {len(dataset)}")

# Format error checks
format_errors = _tally_format_errors(dataset)

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

Number of examples: 1000
No errors found


In [3]:
import getpass

import openai
from openai import OpenAI

# SECURITY: credentials must never be written into the notebook source. The key is
# taken from the OPENAI_API_KEY environment variable; when it is not set, it is
# requested interactively (input is not echoed and is not saved in the notebook).
# The key that was previously embedded in this cell should be treated as
# compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.getenv("OPENAI_API_KEY")

# OpenAI() is constructed without arguments, so the key must be in the environment
client = OpenAI()

In [3]:
# Time to finally start fine tuning (sending the data to OpenAI)
# The context manager closes the local file as soon as the upload call returns.
# The returned file object is kept so its id can be read from `uploaded_file.id`
# instead of being copied by hand from the printed output.
with open('training_data.jsonl', 'rb') as training_fh:
    uploaded_file = client.files.create(
        file=training_fh,
        purpose='fine-tune',
    )

uploaded_file

FileObject(id='file-uOgFIEt02cmfVdZKey0iyKbP', bytes=490462, created_at=1721516352, filename='training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [20]:
# Start a fine-tuning job on the base model using an already-uploaded training file.
# NOTE(review): the training file id is hard-coded and differs from the id shown in
# the upload cell's recorded output; confirm it refers to the intended upload.
fine_tune_request = {
    'training_file': 'file-SHfScQwnFnIjzwLqzXwFmVri',
    'model': 'gpt-3.5-turbo',
}
client.fine_tuning.jobs.create(**fine_tune_request)

FineTuningJob(id='ftjob-kIbBxAVKH9ojBZAqU9ohDIWx', created_at=1721493556, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vKgpXsH9oLuBJRfzRJiIXO2J', result_files=[], seed=323145236, status='validating_files', trained_tokens=None, training_file='file-SHfScQwnFnIjzwLqzXwFmVri', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [17]:
# Look up the fine-tuning job by its id and print its current status
job_id = 'ftjob-kIbBxAVKH9ojBZAqU9ohDIWx'  # Replace with your actual job ID
job_details = client.fine_tuning.jobs.retrieve(job_id)
print(job_details.status)

succeeded


In [16]:
# The system prompt is kept character-for-character identical to the system message
# used in the training examples (including the trailing period), so inference uses
# the same prompt the model was fine-tuned with.
system_prompt = "Marv is a factual chatbot and reddit expert who likes to answer with bullets."
user_question = "best food you've ever had?"

In [17]:
# Send the system persona and the sample question to the fine-tuned model
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_question},
]

response = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::9n8PSYHk",
    messages=chat_messages,
)

# Show the text of the first returned choice
first_choice = response.choices[0]
print(first_choice.message.content)

/*order 12 donuts from the Amish because why not? Try one, pretty freaking good. But everyone who says Amish donuts are the best donuts are whack because nothing beats Big-Butts from the heart of NC. What in the world? Why are they so soft and sweet? Try another just to be sure. Then try another to double check. Then not stopping until there all gone including giving 2 of my family members the middle finger when asked if looks like itll be a while before the next opportunity for a trip to the Amish. Im sorry. I have sinned. Yours truly, me. Holy crap they are freaking good. Only problem now is working it out with the Amish that they have 2 shipments leaving daily going south on I-77 and can meet me at the Gilead road exit. So yeah, Amish donuts. No going back now. Holy Crap amazing. /*
