In [None]:
# Useful imports
import os
import pandas as pd
import numpy as np
import collections
import gpt_wrapper
from gpt_wrapper.chat import Chat
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
# Create the "../data" and "../data/test" output folders if they do not exist.
# `os.makedirs` creates intermediate directories, so a single call covers both,
# and `exist_ok=True` makes the cell safe to re-run without a separate
# check-then-create step.
os.makedirs("../data/test", exist_ok=True)

# Pre-processing

### Collaborative dataset

In [None]:
# Load the collaborative preference data from a JSON file in the working directory
preferences_df = pd.read_json('M1_preference_data_15052024.json')

In [None]:
# Get some insights about the dataset: preview the first rows and print the row count
display(preferences_df.head())
print("Length of the dataset: ", len(preferences_df))

Unnamed: 0,question_id,question_complete,course_id,preference
0,0,Question: Consider the following contains func...,15000,[{'A': 'The asymptotic depth of the contains f...
1,3,Question: What is the asymptotic work of <code...,15000,"[{'A': '...', 'B': '...', 'overall': 'A', 'cri..."
2,4,Question: We have a collection of rectangles i...,15000,[{'A': 'Facts: - Rectangles in the plane have ...
3,5,Question: Which of the following scheduler pol...,15005,[{'A': 'Preemptive scheduling policies allow t...
4,7,"Question: In this week's lecture, you have bee...",15000,"[{'A': 'For the computation g(g(1, x1), g(x2, ..."


Length of the dataset:  1522


In [None]:
# Turn every individual preference annotation into its own row, then derive the
# (prompt, chosen, rejected) triplet from it:
#   - prompt   : the complete question text
#   - chosen   : the answer named by the annotation's "overall" verdict
#   - rejected : the other answer
# Only these three columns are kept.
preferences_df = (
    preferences_df
    .explode('preference')
    .assign(
        prompt=lambda df: df['question_complete'],
        chosen=lambda df: df["preference"].apply(lambda pref: pref["A"] if pref["overall"] == "A" else pref["B"]),
        rejected=lambda df: df["preference"].apply(lambda pref: pref["B"] if pref["overall"] == "A" else pref["A"]),
    )
    [['prompt', 'chosen', 'rejected']]
)



In [None]:
# Verify the dataset: preview the first rows and print the row count after the explode
display(preferences_df.head())
print("Length of the dataset: ", len(preferences_df))

Unnamed: 0,prompt,chosen,rejected
0,Question: Consider the following contains func...,"When `contains` is called on a List, the `drop...",The asymptotic depth of the contains function ...
0,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The asymptotic depth of the contains function ...
0,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The asymptotic depth of the `contains` functio...
0,Question: Consider the following contains func...,To determine the asymptotic depth of the `cont...,The contains function is a recursive function ...
0,Question: Consider the following contains func...,The asymptotic depth of the contains function ...,When the contains function is called on a List...


Length of the dataset:  26738


In [None]:
# Save the dataset as JSON Lines (one record per line)
preferences_df.to_json('../data/collaborative_dataset.jsonl', orient='records', lines=True)

### Argilla dataset

In [None]:
# Load the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/argilla/ultrafeedback-binarized-preferences
argilla_dataset = load_dataset("argilla/ultrafeedback-binarized-preferences")

In [None]:
# Create a pandas dataframe from the 'train' split of the dataset
argilla_df = argilla_dataset['train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(argilla_df.head())
print("Length of the dataset: ", len(argilla_df))

Unnamed: 0,source,instruction,chosen_response,rejected_response,chosen_avg_rating,rejected_avg_rating,chosen_model
0,evol_instruct,Can you write a C++ program that prompts the u...,Here's a C++ program that prompts the user to ...,"Sure, here is the program using the C++11 algo...",5.0,1.25,starchat
1,evol_instruct,Suppose you are a content creator and want to ...,To use GPT for generating compelling titles an...,Sure! Although GPT cannot generate text on its...,4.75,3.25,gpt-4
2,evol_instruct,"Identify the interrelated economic, political,...","The stock market crash of 1929, which marked t...",The stock market crash of 1929 was a result of...,4.5,4.25,vicuna-33b
3,evol_instruct,How can I convert the decimal number 31 to bin...,"Yes, I can provide the JavaScript code for con...",The following JavaScript code can be used to c...,5.0,3.75,mpt-30b-chat
4,evol_instruct,Can you modify the C++ code provided below to ...,"Unfortunately, I cannot modify the given C++ c...","Sure, I can help you with that. Here's how you...",4.0,2.0,ultralm-13b


Length of the dataset:  63619


In [None]:
# Align the column names with the (prompt, chosen, rejected) template and drop
# every other column
argilla_df = (
    argilla_df
    .rename(columns={'instruction': 'prompt', 'chosen_response': 'chosen', 'rejected_response': 'rejected'})
    [['prompt', 'chosen', 'rejected']]
)

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(argilla_df.head())
print("Length of the dataset: ", len(argilla_df))

Unnamed: 0,prompt,chosen,rejected
0,Can you write a C++ program that prompts the u...,Here's a C++ program that prompts the user to ...,"Sure, here is the program using the C++11 algo..."
1,Suppose you are a content creator and want to ...,To use GPT for generating compelling titles an...,Sure! Although GPT cannot generate text on its...
2,"Identify the interrelated economic, political,...","The stock market crash of 1929, which marked t...",The stock market crash of 1929 was a result of...
3,How can I convert the decimal number 31 to bin...,"Yes, I can provide the JavaScript code for con...",The following JavaScript code can be used to c...
4,Can you modify the C++ code provided below to ...,"Unfortunately, I cannot modify the given C++ c...","Sure, I can help you with that. Here's how you..."


Length of the dataset:  63619


In [None]:
# Save the dataset as JSON Lines (one record per line)
argilla_df.to_json('../data/argilla_dataset.jsonl', orient='records', lines=True)

### Intel dataset

In [None]:
# Load the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/Intel/orca_dpo_pairs
intel_dataset = load_dataset("Intel/orca_dpo_pairs")

In [None]:
# Create a pandas dataframe from the 'train' split of the dataset
intel_df = intel_dataset['train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(intel_df.head())
print("Length of the dataset: ", len(intel_df))

Unnamed: 0,system,question,chosen,rejected
0,,You will be given a definition of a task first...,"[\n [""AFC Ajax (amateurs)"", ""has ground"", ""Sp...","Sure, I'd be happy to help! Here are the RDF ..."
1,You are an AI assistant. You will be given a t...,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,Sure! Here's a sentence that describes all th...
2,"You are a helpful assistant, who always provid...",What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...,"Ooh, let me think! *giggle* Okay, I know what..."
3,You are an AI assistant. You will be given a t...,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti...",Certainly! Here's a detailed and long answer ...
4,You are an AI assistant that helps people find...,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e...","Sure, I'd be happy to help! To calculate how ..."


Length of the dataset:  12859


In [None]:
# Use the question as the prompt and keep only the (prompt, chosen, rejected)
# columns; the 'system' column is dropped by the selection
intel_df = (
    intel_df
    .rename(columns={'question': 'prompt'})
    [['prompt', 'chosen', 'rejected']]
)

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(intel_df.head())
print("Length of the dataset: ", len(intel_df))

Unnamed: 0,prompt,chosen,rejected
0,You will be given a definition of a task first...,"[\n [""AFC Ajax (amateurs)"", ""has ground"", ""Sp...","Sure, I'd be happy to help! Here are the RDF ..."
1,Generate an approximately fifteen-word sentenc...,Midsummer House is a moderately priced Chinese...,Sure! Here's a sentence that describes all th...
2,What happens next in this paragraph?\n\nShe th...,C. She then dips the needle in ink and using t...,"Ooh, let me think! *giggle* Okay, I know what..."
3,Please answer the following question: I want t...,"Based on the passage, discuss the primary moti...",Certainly! Here's a detailed and long answer ...
4,James runs a TV show and there are 5 main char...,"James pays the minor characters $15,000 each e...","Sure, I'd be happy to help! To calculate how ..."


Length of the dataset:  12859


In [None]:
# Save the dataset as JSON Lines (one record per line)
intel_df.to_json('../data/intel_dataset.jsonl', orient='records', lines=True)

### Nectar dataset

In [None]:
# Load the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/berkeley-nest/Nectar
nectar_dataset = load_dataset("berkeley-nest/Nectar")

In [None]:
# Create a pandas dataframe from the 'train' split of the dataset
nectar_df = nectar_dataset['train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(nectar_df.head())
print("Length of the dataset: ", len(nectar_df))

Unnamed: 0,prompt,answers,turns,num_responses,source,good_natured
0,\n\nHuman: 0.002 = 1000 \n1 = x?\n\nAssistant:,"[{'answer': 'To find the value of x, we can se...",1,7,[sharegpt],True
1,\n\nHuman: 0:00\nwhat's going on guys it's NAM...,[{'answer': 'Hello! It seems like you're shari...,1,7,[lmsys-chat-1m],True
2,\n\nHuman: 01011001 01001111 01010101 00100000...,[{'answer': 'The binary code you provided tran...,1,7,[anthropic-hh],True
3,"\n\nHuman: ""012345"", ""001122"", ""ee44aa"", ""abcd...","[{'answer': 'Sure, I can help you write a func...",1,7,[lmsys-chat-1m],True
4,\n\nHuman: #01 You are an assistant that helps...,"[{'answer': '{  ""thoughts"": ""Based on the c...",1,7,[lmsys-chat-1m],True


Length of the dataset:  182954


In [None]:
# Build the preference pair from each row's list of answers: the first entry
# becomes "chosen" and the last entry becomes "rejected", then keep only the
# template columns.
# NOTE(review): this assumes the answers are ordered from best to worst —
# confirm against the dataset's ranking field.
nectar_df = (
    nectar_df
    .assign(
        chosen=lambda df: df['answers'].apply(lambda ranked: ranked[0]['answer']),
        rejected=lambda df: df['answers'].apply(lambda ranked: ranked[-1]['answer']),
    )
    [['prompt', 'chosen', 'rejected']]
)

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(nectar_df.head())
print("Length of the dataset: ", len(nectar_df))

Unnamed: 0,prompt,chosen,rejected
0,\n\nHuman: 0.002 = 1000 \n1 = x?\n\nAssistant:,"To find the value of x, we can set up a propor...",It seems like you are asking for the value of ...
1,\n\nHuman: 0:00\nwhat's going on guys it's NAM...,Hello! It seems like you're sharing a transcri...,how much time?
2,\n\nHuman: 01011001 01001111 01010101 00100000...,"The binary code you provided translates to ""YO...",Hello! How can I assist you today?
3,"\n\nHuman: ""012345"", ""001122"", ""ee44aa"", ""abcd...","Sure, I can help you write a function in Go th...","After some research and development, I have cr..."
4,\n\nHuman: #01 You are an assistant that helps...,"{\n ""thoughts"": ""Based on the current traff...","{\n""thoughts"": ""I understand, I will follow th..."


Length of the dataset:  182954


In [None]:
# Take only half of the nectar_df data because it is too large.
# The rows are drawn at random; the fixed random_state keeps the draw reproducible.
half_nectar_df = nectar_df.sample(frac=0.5, random_state=42)

# Split the sampled DataFrame into train (70%) and test (30%) sets, again with a fixed seed
train_nectar_df, test_nectar_df = train_test_split(half_nectar_df, test_size=0.3, random_state=42)

In [None]:
# Row counts of the sampled half, its train split and its test split (one per line)
print(len(half_nectar_df), len(train_nectar_df), len(test_nectar_df), sep="\n")

91477
64033
27444


In [None]:
# Save the train and test datasets as JSON Lines (one record per line);
# the test split goes to the separate "../data/test" folder
train_nectar_df.to_json('../data/nectar_train_dataset.jsonl', orient='records', lines=True)
test_nectar_df.to_json('../data/test/nectar_test_dataset.jsonl', orient='records', lines=True)

### MMLU MCQ dataset

In [None]:
# Load the 'all' configuration of the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/cais/mmlu
mmlu_dataset = load_dataset("cais/mmlu", 'all')

Using the latest cached version of the dataset since cais/mmlu couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'all' at /Users/iliasmerigh/.cache/huggingface/datasets/cais___mmlu/all/0.0.0/c30699e8356da336a370243923dbaf21066bb9fe (last modified on Thu May 23 18:23:50 2024).


In [None]:
# Create a pandas dataframe from the 'auxiliary_train' split of the dataset
mmlu_df = mmlu_dataset['auxiliary_train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(mmlu_df.head())
print("Length of the dataset: ", len(mmlu_df))

Unnamed: 0,question,subject,choices,answer
0,Davis decided to kill Adams. He set out for Ad...,,"[Adams only., Brooks only., Case only., Adams ...",1
1,A state statute requires any person licensed t...,,"[guilty, because this is a public welfare offe...",3
2,"Lender met Borrower on the street, demanded th...",,"[Yes, because Mann threatened to use deadly fo...",2
3,Peter sued Don for breach of contract. The cou...,,[must permit Don to answer if he had objected ...,1
4,Ames had painted Bell's house under a contract...,,[partial breach of contract only if Ames had p...,2


Length of the dataset:  99842


In [None]:
# Map the integer answer indices (0-4) to the corresponding option letters (A-E);
# `replace` leaves any value that is not a key of the mapping unchanged
mmlu_df['answer'] = mmlu_df['answer'].replace({0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'})

In [None]:
# Sanitize the options column
def concatenate_with_separators(strings, separators=("\nA:", "\nB:", "\nC:", "\nD:", "\nE:")):
    """
    Prefix each string with its separator and concatenate the results.

    The i-th string is preceded by the i-th separator, so with the default
    separators ["x", "y"] becomes "\\nA:x\\nB:y". Surplus separators are
    ignored and an empty `strings` yields an empty string.

    Args:
    strings: sequence of strings
    separators: sequence of strings, at least as long as `strings`

    Returns:
    result: string

    Raises:
    ValueError: if there are fewer separators than strings
    """
    # Each string is preceded by its own separator, so at least as many
    # separators as strings are required
    if len(separators) < len(strings):
        raise ValueError("The number of separators must be at least the number of strings")

    # zip stops at the shorter sequence, i.e. right after the last string
    return "".join(separator + string for separator, string in zip(separators, strings))

In [None]:
# Bring the frame into the template format:
#   1. render the list of choices as "\nA:...\nB:..." text,
#   2. wrap the question as "Question: ...\n\nOptions:...\nAnswer:",
#   3. keep only the template columns.
mmlu_df = (
    mmlu_df
    .assign(options=lambda df: df['choices'].apply(concatenate_with_separators))
    .assign(question=lambda df: "Question: " + df['question'] + "\n\nOptions:" + df['options'] + "\nAnswer:")
    [['subject', 'question', 'answer']]
)

In [None]:
# Try to fill the subject column with the gpt wrapper given in the Milestone 1.
# The API key is a credential, so it is read from the environment instead of
# being stored in the notebook; the endpoint can be overridden the same way.
# NOTE: a key was previously committed in this cell — treat it as leaked and
# have it rotated.
gpt_wrapper.api_base = os.environ.get("GPT_WRAPPER_API_BASE", "http://mnlp-backend-938795011.eu-central-1.elb.amazonaws.com")
if "GPT_WRAPPER_API_KEY" not in os.environ:
    raise RuntimeError("Set the GPT_WRAPPER_API_KEY environment variable before running this cell")
gpt_wrapper.api_key = os.environ["GPT_WRAPPER_API_KEY"]

In [None]:
# Subject categories listed in the documentation of the dataset
categories = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']

# The instruction is identical for every question, so build it once
category_instruction = "Which category best describes the subject of the question? Choose one of the following:" + ", ".join(categories)

# Ask for a subject on a sample of 20 rows only, to see if the results are
# coherent; each row gets its own chat session named after its index
for i, row in mmlu_df.head(20).iterrows():
    chat = Chat.create(f"mmlu{i}")
    mmlu_df.at[i, 'subject'] = chat.ask(row['question'], instruction=category_instruction)

display(mmlu_df.head(20))

Unnamed: 0,subject,question,answer
0,jurisprudence,Question: Davis decided to kill Adams. He set ...,B
1,professional_law,Question: A state statute requires any person ...,D
2,professional_law,"Question: Lender met Borrower on the street, d...",C
3,jurisprudence,Question: Peter sued Don for breach of contrac...,B
4,professional_law,Question: Ames had painted Bell's house under ...,C
5,professional_law,Question: Ames had painted Bell's house under ...,C
6,professional_law,Question: Ames had painted Bell's house under ...,A
7,professional_law,Question: The State of Aurora requires license...,A
8,professional_law,Question: The State of Aurora requires license...,D
9,professional_law,Question: The State of Aurora requires license...,D


The results are coherent, but the process is too time-consuming and resource-intensive. Therefore, we will ignore the "subject" feature for this dataset.

In [None]:
# Reset the subject column to an empty string for every row, discarding the
# values written for the 20 sampled rows
mmlu_df['subject'] = ''

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(mmlu_df.head())
print("Length of the dataset: ", len(mmlu_df))

Unnamed: 0,subject,question,answer
0,,Question: Davis decided to kill Adams. He set ...,B
1,,Question: A state statute requires any person ...,D
2,,"Question: Lender met Borrower on the street, d...",C
3,,Question: Peter sued Don for breach of contrac...,B
4,,Question: Ames had painted Bell's house under ...,C


Length of the dataset:  99842


The output format is similar to the template.

In [None]:
# Split the DataFrame into train (85%) and test (15%) sets; the fixed
# random_state keeps the split reproducible
train_mmlu_df, test_mmlu_df = train_test_split(mmlu_df, test_size=0.15, random_state=42)

In [None]:
# Save the train dataset as JSON Lines (one record per line)
train_mmlu_df.to_json('../data/mcq_mmlu_dataset.jsonl', orient='records', lines=True)

### MathQA MCQ dataset

In [None]:
# Load the dataset from: https://huggingface.co/datasets/allenai/math_qa
# This dataset is loaded through a script hosted with it; `datasets` warns that
# `trust_remote_code=True` will become mandatory for it, so opt in explicitly.
mathqa_dataset = load_dataset("math_qa", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Create a pandas dataframe from the 'train' split of the dataset
mathqa_df = mathqa_dataset['train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(mathqa_df.head())
print("Length of the dataset: ", len(mathqa_df))

Unnamed: 0,Problem,Rationale,options,correct,annotated_formula,linear_formula,category
0,the banker ' s gain of a certain sum due 3 yea...,"""explanation : t = 3 years r = 10 % td = ( bg ...","a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d...",a,"divide(multiply(const_100, divide(multiply(36,...","multiply(n2,const_100)|multiply(n0,n1)|divide(...",gain
1,average age of students of an adult school is ...,"""explanation : let the original no . of studen...","a ) 1200 , b ) 120 , c ) 360 , d ) 240 , e ) n...",d,"multiply(divide(subtract(multiply(add(32, 4), ...","add(n2,n3)|multiply(n1,n2)|multiply(n1,#0)|sub...",general
2,sophia finished 2 / 3 of a book . she calculat...,let xx be the total number of pages in the boo...,"a ) 229 , b ) 270 , c ) 877 , d ) 266 , e ) 281",b,"divide(90, subtract(const_1, divide(2, 3)))","divide(n0,n1)|subtract(const_1,#0)|divide(n2,#1)",general
3,120 is what percent of 50 ?,"""50 * x = 120 - - > x = 2.4 - - > 2.4 expresse...","a ) 5 % , b ) 240 % , c ) 50 % , d ) 2 % , e )...",b,"multiply(divide(120, 50), const_100)","divide(n0,n1)|multiply(#0,const_100)|",gain
4,there are 10 girls and 20 boys in a classroom ...,"if girls is 10 and boys is 20 , then 10 / 20 ....","a ) 1 / 2 , b ) 1 / 3 , c ) 1 / 5 , d ) 10 / 3...",a,"divide(10, 20)","divide(n0,n1)",other


Length of the dataset:  29837


In [None]:
# Upper case the answer column to fit the template
mathqa_df['answer'] = mathqa_df['correct'].str.upper()

# Sanitize the options column to fit the answer column.
# The raw text looks like "a ) x , b ) y , c ) z , d ) u , e ) v".
# The "a ) " marker has no leading " , " to anchor it, so only its first
# occurrence (the leading label) is rewritten: replacing every occurrence would
# also alter option text that happens to contain "a ) " (e.g. "( 2 a ) / 3").
mathqa_df['options'] = mathqa_df['options'].str.replace("a ) ", "\nA:", n=1, regex=False)

# The remaining labels are matched together with their " , " separator
for letter in "bcde":
    mathqa_df['options'] = mathqa_df['options'].str.replace(f" , {letter} ) ", f"\n{letter.upper()}:", regex=False)

In [None]:
# Get insights about the categories: number of rows per category value
print(mathqa_df['category'].value_counts())

general        13273
physics         7063
gain            5120
geometry        2117
other           1814
probability      450
Name: category, dtype: int64


In [None]:
# Map the categories into more meaningful names.
# Only three categories are renamed. `replace` leaves the others ('physics',
# 'geometry', 'probability') unchanged, whereas `map` with a partial dict would
# turn them into NaN and lose the subject for those rows.
category_mapping = {'gain': 'mathematics_gain', 'general': 'mathematics_general', 'other': 'mathematics_other'}
mathqa_df['category'] = mathqa_df['category'].replace(category_mapping)

In [None]:
# Bring the frame into the template format: wrap the problem statement and its
# options as "Question: ...\n\nOptions:...\nAnswer:", expose the category as
# the subject, and keep only the template columns
mathqa_df = (
    mathqa_df
    .assign(
        question=lambda df: "Question: " + df['Problem'] + "\n\nOptions:" + df['options'] + "\nAnswer:",
        subject=lambda df: df['category'],
    )
    [['subject', 'question', 'answer']]
)

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(mathqa_df.head())
print("Length of the dataset: ", len(mathqa_df))

Unnamed: 0,subject,question,answer
0,mathematics_gain,Question: the banker ' s gain of a certain sum...,A
1,mathematics_general,Question: average age of students of an adult ...,D
2,mathematics_general,Question: sophia finished 2 / 3 of a book . sh...,B
3,mathematics_gain,Question: 120 is what percent of 50 ?\n\nOptio...,B
4,mathematics_other,Question: there are 10 girls and 20 boys in a ...,A


Length of the dataset:  29837


The output format is similar to the template.

In [None]:
# Split the DataFrame into train (85%) and test (15%) sets; the fixed
# random_state keeps the split reproducible
train_mathqa_df, test_mathqa_df = train_test_split(mathqa_df, test_size=0.15, random_state=42)

In [None]:
# Save the train dataset as JSON Lines (one record per line)
train_mathqa_df.to_json('../data/mcq_mathqa_dataset.jsonl', orient='records', lines=True)

### Aquarat MCQ dataset

In [None]:
# Load the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/deepmind/aqua_rat
aquarat_dataset = load_dataset("deepmind/aqua_rat")

In [None]:
# Create a pandas dataframe from the 'train' split of the dataset
aquarat_df = aquarat_dataset['train'].to_pandas()

# Get some insights about the dataset: preview the first rows and print the row count
display(aquarat_df.head())
print("Length of the dataset: ", len(aquarat_df))

Unnamed: 0,question,options,rationale,correct
0,"Two friends plan to walk along a 43-km trail, ...","[A)21, B)21.5, C)22, D)22.5, E)23]","If Q complete x kilometers, then P completes 1...",E
1,"In the coordinate plane, points (x, 1) and (5,...","[A)4 and 1, B)1 and 5, C)5 and 1, D)3 and 5, E...",Line k passes through the origin and has slope...,C
2,"For all numbers p and q, the operation @ is de...","[A)II, B)I and II, C)I and III, D)II and III, ...",p@q = p^2 - pq=p(p-q).... so p@q will be zero ...,B
3,Carl is facing very difficult financial times ...,"[A)$1600, B)$2000, C)$2150, D)$2500, E)$12000]","Usually, you are given the annual rate of inte...",A
4,The speed at which a man can row a boat in sti...,"[A)18 seconds, B)27 seconds, C)26 seconds, D)1...",Speed of the boat downstream = 25 +11\n= 36 km...,E


Length of the dataset:  97467


In [None]:
# Sanitize the options column.
# Each row holds a sequence of option strings such as "A)21", "B)21.5", ...
# Only the leading "<letter>)" label of every option is rewritten as
# "\n<letter>:" before the options are concatenated. Working per option, rather
# than on the joined string, leaves the text of an option untouched even when
# it contains "A)".."E)" itself.
def _format_aquarat_option(option):
    """Return `option` with a leading "A)".."E)" label rewritten as "\\nA:".."\\nE:"."""
    if len(option) >= 2 and option[0] in "ABCDE" and option[1] == ")":
        return "\n" + option[0] + ":" + option[2:]
    # Options without a recognised label are passed through unchanged
    return option

aquarat_df["options"] = aquarat_df["options"].apply(lambda x: ''.join(_format_aquarat_option(option) for option in x))

In [None]:
# Bring the frame into the template format: wrap the question and its options
# as "Question: ...\n\nOptions:...\nAnswer:", expose 'correct' as 'answer',
# label every row with the 'algebra' subject (the dataset is all about
# algebra), and keep only the template columns
aquarat_df = (
    aquarat_df
    .assign(question=lambda df: "Question: " + df['question'] + "\n\nOptions:" + df['options'] + "\nAnswer:")
    .rename(columns={'correct': 'answer'})
    .assign(subject='algebra')
    [['subject', 'question', 'answer']]
)

In [None]:
# Verify the dataset: preview the first rows and print the row count
display(aquarat_df.head())
print("Length of the dataset: ", len(aquarat_df))

Unnamed: 0,subject,question,answer
0,algebra,Question: Two friends plan to walk along a 43-...,E
1,algebra,"Question: In the coordinate plane, points (x, ...",C
2,algebra,"Question: For all numbers p and q, the operati...",B
3,algebra,Question: Carl is facing very difficult financ...,A
4,algebra,Question: The speed at which a man can row a b...,E


Length of the dataset:  97467


The output format is similar to the template.

In [None]:
# Split the DataFrame into train (85%) and test (15%) sets; the fixed
# random_state keeps the split reproducible
train_aquarat_df, test_aquarat_df = train_test_split(aquarat_df, test_size=0.15, random_state=42)

In [None]:
# Save the train dataset as JSON Lines (one record per line)
train_aquarat_df.to_json('../data/mcq_aquarat_dataset.jsonl', orient='records', lines=True)

### Test MCQ dataset

In [None]:
# Stack the three held-out MCQ splits into a single test set; `ignore_index`
# gives the result a fresh 0..n-1 index in the same step
test_mcq_df = pd.concat([test_mmlu_df, test_mathqa_df, test_aquarat_df], ignore_index=True)

In [None]:
# Save the combined MCQ test set as JSON Lines (one record per line)
test_mcq_df.to_json('../data/test/mcq_test_dataset.jsonl', orient='records', lines=True)

# Post-processing


In [None]:
def check_condition(condition):
    """
    Translate the truth value of a condition into a status emoji.

    Args:
    condition: boolean (any value is evaluated for truthiness)

    Returns:
    emoji: string, "✅" when the condition holds and "❌" otherwise
    """
    return "✅" if condition else "❌"

### DPO datasets checks

In [None]:
# Load the generated JSONL files back from disk, so the checks below run on
# what was actually written
collaborative_df = pd.read_json('../data/collaborative_dataset.jsonl', lines=True)
argilla_df = pd.read_json('../data/argilla_dataset.jsonl', lines=True)
intel_df = pd.read_json('../data/intel_dataset.jsonl', lines=True)
nectar_df = pd.read_json('../data/nectar_train_dataset.jsonl', lines=True)

# Held-out DPO test set (the Nectar test split)
test_df = pd.read_json('../data/test/nectar_test_dataset.jsonl', lines=True)

In [None]:
# Calculate the lengths
collaborative_length = len(collaborative_df)
argilla_length = len(argilla_df)
intel_length = len(intel_df)
nectar_length = len(nectar_df)
test_length = len(test_df)

# Calculate the sum of lengths (training files only)
training_length = collaborative_length + argilla_length + intel_length + nectar_length

# Print the lengths and the sum; each label names the file that was loaded
print("Length of collaborative_dataset.jsonl:", collaborative_length)
print("Length of argilla_dataset.jsonl:", argilla_length)
print("Length of intel_dataset.jsonl:", intel_length)
print("Length of nectar_train_dataset.jsonl:", nectar_length)
print()
print("Total training length:", training_length)
print("Total test length:", test_length)
# Share of the test set among all (training + test) rows
print("Ratio of test data:", "{:.2%}".format(test_length / (training_length + test_length)))

Length of collaborative_dataset.jsonl: 26738
Length of argilla_dataset.jsonl: 63619
Length of intel_dataset.jsonl: 12859
Length of nectar_dataset.jsonl: 64033

Total training length: 167249
Total test length: 27444
Ratio of test data: 14.10%


In [None]:
# Every DPO file must expose exactly the three columns of the preference template.
# The test set is checked on its own frame (`test_df`).
expected_dpo_columns = {"prompt", "chosen", "rejected"}
dpo_frames = {
    "Collaborative": collaborative_df,
    "Argilla": argilla_df,
    "Intel": intel_df,
    "Nectar": nectar_df,
    "Test": test_df,
}

for label, frame in dpo_frames.items():
    # The set comparison checks the names; the length check rules out duplicated columns
    schema_ok = set(frame.columns) == expected_dpo_columns and len(frame.columns) == 3
    print(f"{label} dataset columns schema is correct: " + check_condition(schema_ok))

Collaborative dataset columns schema is correct: ✅
Argilla dataset ccolumns schema is correct: ✅
Intel dataset columns schema is correct: ✅
Nectar dataset columns schema is correct: ✅
Test dataset columns schema is correct: ✅


### MCQ datasets checks

In [None]:
# Load the generated JSONL files back from disk, so the checks below run on
# what was actually written
mmlu_df = pd.read_json('../data/mcq_mmlu_dataset.jsonl', lines=True)
mathqa_df = pd.read_json('../data/mcq_mathqa_dataset.jsonl', lines=True)
aquarat_df = pd.read_json('../data/mcq_aquarat_dataset.jsonl', lines=True)

# Combined held-out MCQ test set
test_mcq_df = pd.read_json('../data/test/mcq_test_dataset.jsonl', lines=True)

In [None]:
# Row counts of the three training files and of the combined test file
mmlu_length, mathqa_length, aquarat_length = len(mmlu_df), len(mathqa_df), len(aquarat_df)
test_mcq_length = len(test_mcq_df)

# Total number of training rows
trainning_mcq_length = sum((mmlu_length, mathqa_length, aquarat_length))

# Report the per-file counts, then the totals and the share of test rows
for file_name, row_count in (
    ("mcq_mmlu_dataset.jsonl", mmlu_length),
    ("mcq_mathqa_dataset.jsonl", mathqa_length),
    ("mcq_aquarat_dataset.jsonl", aquarat_length),
):
    print(f"Length of {file_name}:", row_count)
print()
print("Total training length:", trainning_mcq_length)
print("Total test length:", test_mcq_length)
print("Ratio of test data:", f"{test_mcq_length / (trainning_mcq_length + test_mcq_length):.2%}")

Length of mcq_mmlu_dataset.jsonl: 84865
Length of mcq_mathqa_dataset.jsonl: 25361
Length of mcq_aquarat_dataset.jsonl: 82846

Total training length: 193072
Total test length: 34074
Ratio of test data: 15.00%


In [None]:
# Every MCQ file must expose exactly the three template columns and only
# contain answers among the letters A-E. Each frame is checked against its own
# columns.
expected_mcq_columns = {"subject", "question", "answer"}
valid_answers = {"A", "B", "C", "D", "E"}
mcq_frames = {
    "MMLU": mmlu_df,
    "MathQA": mathqa_df,
    "Aquarat": aquarat_df,
    "Test": test_mcq_df,
}

for label, frame in mcq_frames.items():
    # The set comparison checks the names; the length check rules out duplicated columns
    schema_ok = set(frame.columns) == expected_mcq_columns and len(frame.columns) == 3
    print(f"{label} dataset columns schema is correct: " + check_condition(schema_ok))
print()
for label, frame in mcq_frames.items():
    answers_ok = set(frame['answer']).issubset(valid_answers)
    print(f"{label} dataset answers content is correct: " + check_condition(answers_ok))

MMLU dataset columns schema is correct: ✅
MathQA dataset columns schema is correct: ✅
Aquarat dataset columns schema is correct: ✅
Test dataset columns schema is correct: ✅

MMLU dataset answers content is correct: ✅
MathQA dataset answers content is correct: ✅
Aquarat dataset answers content is correct: ✅
Test dataset answers content is correct: ✅


In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
# TODO: Fill in the `ROOT_PATH` with where you download the Assignment folder
# NOTE(review): this is an absolute path inside one user's Drive; adapt it before running
ROOT_PATH = "/content/drive/MyDrive/project-m3-2024-jim/model"  # Replace with your own project directory
print(os.listdir(ROOT_PATH)) # Check the content of the path
os.chdir(ROOT_PATH) # cd into directory, so the relative paths used by later cells resolve from it
print(os.listdir(".")) # Check the content of current folder

['utils.py', '.DS_Store', 'requirements.txt', 'documents', 'models', 'checkpoints', 'datasets', 'main_config.yaml', '__pycache__', 'evaluator.py', 'Copie de data_preprocessing.ipynb']
['utils.py', '.DS_Store', 'requirements.txt', 'documents', 'models', 'checkpoints', 'datasets', 'main_config.yaml', '__pycache__', 'evaluator.py', 'Copie de data_preprocessing.ipynb']


In [4]:
!pip install -r requirements.txt

Collecting datasets (from -r requirements.txt (line 2))
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate (from -r requirements.txt (line 3))
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting wandb (from -r requirements.txt (line 6))
  Downloading wandb-0.17.1-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum (from -r requirements.txt (line 7))
  Downloading optimum-1.20.0-py3-none-any.whl (418 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m418.4/418.4 kB[0m [31m43.3 MB/s[0m eta [36m

In [5]:
import torch

In [6]:
# Report whether a CUDA device is visible to PyTorch, with a hint on how to
# enable one in Colab otherwise
print(
    "GPU is available! Good to go."
    if torch.cuda.is_available()
    else "If you are using Colab, please set your runtime type to a GPU via {Edit -> Notebook Settings}."
)

GPU is available! Good to go.


In [7]:
# List the current working directory (shell command)
! ls

 checkpoints			      datasets	  evaluator.py	     models	   requirements.txt
'Copie de data_preprocessing.ipynb'   documents   main_config.yaml   __pycache__   utils.py


In [13]:
# Run evaluator.py from the current working directory (shell command)
! python3 evaluator.py

2024-06-13 16:51:45.015145: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-13 16:51:45.067713: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 16:51:45.067765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 16:51:45.069558: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-13 16:51:45.077705: I tensorflow/core/platform/cpu_feature_guar