In [None]:
import openai
import os
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 2000)
import re
from datetime import datetime
from time import sleep
import matplotlib.pyplot as plt
import seaborn as sns
import random
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np
from copy import deepcopy
import math
from statsmodels.stats.multicomp import MultiComparison
import warnings
warnings.filterwarnings('ignore')
import llm
from sklearn.model_selection import train_test_split
import json
from utils import *
from utils_persona import *
import ast
# Root folder of the project data. Defaults to the original cluster location but can be
# redirected by setting the PSYCHOMETRIC_ALIGNMENT_DIR environment variable, so the
# notebook is not tied to one user's scratch space.
# os.path.join(..., '') guarantees the trailing separator that the plain string
# concatenations further down (DATA_DIR + DATA_PATH + ...) depend on.
DATA_DIR = os.path.join(
    os.environ.get('PSYCHOMETRIC_ALIGNMENT_DIR', '/weka/scratch/djiang21/jo/psychometric-alignment/'),
    '',
)

In [None]:
# Dataset selector: folder (relative to DATA_DIR) of the dataset to process.
# Later cells branch on whether this string contains 'duolingo' or 'wordbank'.
# Alternatives: 'data/wordbank/' or 'data/duolingo/'.
DATA_PATH = 'data/eedi/'

# Load every recorded response of the selected dataset.
df = pd.read_csv(f'{DATA_DIR}{DATA_PATH}all_data.csv')

# Prepare the train/validation split

In [None]:
# Held-out test responses: their users and their questions are removed from df
# so neither can appear in the train/validation data built below.
results = pd.read_csv(f'{DATA_DIR}{DATA_PATH}test_data.csv')
test_user_ids = list(results['UserId'].unique())
test_question_ids = list(results['QuestionId'].unique())

# (id column, label used in the log lines, ids to exclude) - users first, then questions.
held_out = (
    ('UserId', 'users', test_user_ids),
    ('QuestionId', 'q', test_question_ids),
)
for id_column, label, test_ids in held_out:
    print(f'before dropping test {label}:', len(df))
    print(id_column, df[id_column].nunique())
    df = df[~df[id_column].isin(test_ids)]
    print(f'after dropping test {label}:', len(df))
    print(id_column, df[id_column].nunique())

# Order each user's responses chronologically.
df = df.sort_values(by=['UserId', 'DateAnswered'])

# Country codes are only present in some datasets.
if 'countries' in df.columns:
    df['countries_full'] = df['countries'].apply(map_country_codes)

# Choose the persona builder and the column holding the student's response for
# the active dataset; anything that is not duolingo/wordbank takes the last branch.
if 'duolingo' in DATA_PATH:
    persona_builder = create_persona_duolingo_first_pov
    input_response_column = 'Correctness'
elif 'wordbank' in DATA_PATH:
    persona_builder = create_persona_wordbank_first_pov
    input_response_column = 'Correctness'
else:
    persona_builder = create_persona_3basic
    input_response_column = 'answer_choice'
df['persona'] = df.apply(persona_builder, axis=1)

# Letter-coded answers exist only when the numeric answer columns are available.
if {'AnswerValue', 'CorrectAnswer'}.issubset(df.columns):
    df['answer_choice'] = df['AnswerValue'].apply(number_to_letter)
    df['correct_answer_choice'] = df['CorrectAnswer'].apply(number_to_letter)
df['Correctness'] = df['IsCorrect'].apply(map_binary_to_correctness)

# Split users (not rows) 90/10 into train and validation with a fixed seed.
random.seed(0)
all_ids = list(df['UserId'].unique())
val_df_ids = random.sample(all_ids, int(0.1*len(all_ids)))

# Keep the training ids in their order of appearance in df. Building this list
# from a set difference would leave the order to set hashing (which varies
# between runs for string ids), and sample_ids() draws its seeded subsets from
# this list, so the order has to be stable for those subsets to be repeatable.
val_id_set = set(val_df_ids)
train_df_ids = [user_id for user_id in all_ids if user_id not in val_id_set]
assert len(train_df_ids) + len(val_id_set) == len(all_ids), 'train/val ids must partition all users'

train_df = df[df['UserId'].isin(train_df_ids)]
val_df = df[df['UserId'].isin(val_df_ids)]
print(len(train_df))
print(train_df['UserId'].nunique())
print(len(val_df))
print(val_df['UserId'].nunique())

# Take nested, successively halved subsets of the training users

In [None]:
def sample_ids(train_df_ids):
    """Build a chain of nested random subsets of ``train_df_ids``.

    The first entry of the returned list is ``train_df_ids`` itself; each of the
    following eight entries is a random sample containing half (rounded down) of
    the entry before it. The global ``random`` generator is re-seeded with 0 on
    every call, so the same input always yields the same chain.

    Args:
        train_df_ids: list of ids to draw from (must support slicing).

    Returns:
        A list of nine id lists, from the full list down to the smallest subset.
    """
    random.seed(0)
    nested = [train_df_ids]
    pool = train_df_ids[:]
    halvings_left = 8
    while halvings_left > 0:
        pool = random.sample(pool, int(len(pool) * 0.5))
        nested.append(pool)
        halvings_left -= 1
    return nested
# Build the nested training subsets and report how many users each one holds.
train_samples = sample_ids(train_df_ids)
for subset_ids in train_samples:
    print(len(subset_ids))

# save answer_id files for val and each subset of training data

In [None]:
# Keep only the last two entries of train_samples (sample_ids appends successively
# halved samples, so these are the two smallest subsets). Every later cell sees
# this shortened list.
train_samples = train_samples[-2:]

In [None]:
# Tag embedded in the names of every file this notebook writes and later looks up.
DATE = '20240612'

# Sampling settings for the active dataset. They are selected from DATA_PATH with the
# same substring checks the persona-building cell uses (anything that is not
# duolingo/wordbank is treated as eedi), so the cells below no longer depend on a
# block being un-commented by hand.
#   MIN_NUM / MAX_NUM                 -> min_num_example / num_example of generate_random_data_points
#   NUM_REPEAT_VAL / NUM_REPEAT_TRAIN -> num_iter for the validation / training frames
#   define_sequence                   -> groupby columns that delimit one answer sequence
if 'duolingo' in DATA_PATH:
    MIN_NUM = 2
    MAX_NUM = 49
    NUM_REPEAT_VAL = 10
    NUM_REPEAT_TRAIN = 200
    define_sequence = ['UserId']
elif 'wordbank' in DATA_PATH:
    MIN_NUM = 2
    MAX_NUM = 49
    NUM_REPEAT_VAL = 10
    NUM_REPEAT_TRAIN = 100
    define_sequence = ['UserId']
else:  # eedi
    MIN_NUM = 3
    MAX_NUM = 10
    NUM_REPEAT_VAL = 1
    NUM_REPEAT_TRAIN = 20
    define_sequence = ['UserId', 'QuizId']

def generate_random_data_points(df, num_iter, num_example, min_num_example, define_sequence=['UserId', 'QuizId'], random_state=None):
    """Sample (context answers, held-out answer) pairs from every answer sequence.

    ``df`` is grouped by ``define_sequence``. In each of ``num_iter`` passes, every
    group contributes at most one data point:

    * more than ``num_example`` rows: ``num_example`` rows are sampled as context
      and one of the remaining rows is sampled as the held-out answer;
    * more than ``min_num_example`` rows (but not more than ``num_example``): all
      rows but one are sampled as context and the left-over row is held out;
    * otherwise the group is skipped.

    Args:
        df: frame with at least the ``define_sequence`` columns plus ``AnswerId``
            and ``IsCorrect``.
        num_iter: number of passes over all groups.
        num_example: maximum number of context answers per data point.
        min_num_example: a group needs more rows than this to be used.
        define_sequence: groupby columns delimiting one sequence (never mutated).
        random_state: ``None`` (default) keeps the previous behaviour of drawing
            from NumPy's global RNG; an int seed or a ``numpy.random.RandomState``
            makes the whole call repeatable.

    Returns:
        Three parallel lists: the context ``AnswerId`` lists, the held-out
        ``AnswerId`` values, and the ``IsCorrect`` values of context followed by
        the held-out answer.
    """
    # The reported bounds include the held-out answer, hence the "+ 1".
    print('min_num_example', min_num_example + 1)
    print('max_num_example', num_example + 1)
    print(datetime.now())

    # One generator shared by every draw: successive samples differ, yet the
    # call as a whole is determined by the seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    input_answer_id_list = []
    output_answer_id_list = []
    is_correct_list = []

    # Group once; the groupby object can be iterated again on every pass.
    grouped = df.groupby(define_sequence)
    for _ in range(num_iter):
        for _, group_df in grouped:
            total_problems = len(group_df)
            if total_problems > num_example:
                input_df = group_df.sample(n=num_example, random_state=rng)
                remaining_df = group_df.loc[~group_df.index.isin(input_df.index)]
                output_df = remaining_df.sample(n=1, random_state=rng)
            elif total_problems > min_num_example:
                input_df = group_df.sample(n=total_problems - 1, random_state=rng)
                output_df = group_df.loc[~group_df.index.isin(input_df.index)]
            else:
                continue
            input_answer_id_list.append(list(input_df['AnswerId']))
            output_answer_id_list.append(output_df['AnswerId'].iloc[0])
            is_correct_list.append(list(input_df['IsCorrect']) + [output_df['IsCorrect'].iloc[0]])
    return input_answer_id_list, output_answer_id_list, is_correct_list

# pandas' .sample() draws from NumPy's global RNG when no random_state is given,
# and nothing earlier in the notebook seeds NumPy (only the `random` module is
# seeded). Re-seeding before each frame is built makes every saved file repeatable
# on its own, whichever other frames are generated in the same run.
SAMPLING_SEED = 0


def _build_answer_id_frame(source_df, num_repeat):
    """Sample data points from ``source_df`` and return them as a shuffled frame.

    The frame has one row per data point with the columns ``input_answer_id``,
    ``output_answer_id`` and ``is_correct_list``; rows are shuffled with a fixed
    ``random_state`` and re-indexed from 0.
    """
    np.random.seed(SAMPLING_SEED)
    input_ids, output_ids, correctness = generate_random_data_points(
        source_df, num_repeat, MAX_NUM, MIN_NUM, define_sequence=define_sequence)
    frame = pd.DataFrame()
    frame['input_answer_id'] = input_ids
    frame['output_answer_id'] = output_ids
    frame['is_correct_list'] = correctness
    return frame.sample(frac=1, random_state=0).reset_index(drop=True)


# Validation data points; the file name carries the number of validation users.
val_data = _build_answer_id_frame(val_df, NUM_REPEAT_VAL)
print('len(val_data) after', len(val_data))
val_data.to_csv(DATA_DIR + DATA_PATH + 'val_answer_id_' + DATE + '_' + str(val_df['UserId'].nunique()) + '.csv', index=False)

# One file per training subset, named after the number of users it contains.
for s in train_samples:
    train_df = df[df['UserId'].isin(s)]
    print('len(train_df)', len(train_df))
    print('unique users', train_df['UserId'].nunique())
    train_data = _build_answer_id_frame(train_df, NUM_REPEAT_TRAIN)
    print('len(train_data) after', len(train_data))
    train_data.to_csv(DATA_DIR + DATA_PATH + 'train_answer_id_' + DATE + '_' + str(train_df['UserId'].nunique()) + '.csv', index=False)

# save jsonl files

In [None]:
# Folder that is scanned for answer-id CSVs and receives the generated JSONL files.
TRAIN_DATA_DIR = DATA_DIR + DATA_PATH
# Selects the label save_data_jsonl puts before each student answer:
# 'first_pov' -> 'Your answer', 'third_pov' -> 'Student answer'.
PERSONA_TEMPLATE = 'first_pov'

def get_answer_id_list(x):
    """Combine a row's context answer ids and its held-out answer id into one list.

    Args:
        x: a two-element row (pandas Series, list or tuple) holding, in order, the
            context answer ids and the held-out answer id. The context ids may be
            the string form of a list (as read back from CSV) or an actual list.

    Returns:
        A new list: the context ids followed by the held-out id.
    """
    # Read the fields by position via iteration. Integer keys on a string-labelled
    # Series (x[0], x[1]) rely on pandas' deprecated positional fallback.
    values = list(x)
    input_id, output_id = values[0], values[1]
    parsed = ast.literal_eval(input_id) if isinstance(input_id, str) else input_id
    result = list(parsed)  # copy, so a list passed in by the caller is not mutated
    result.append(output_id)
    return result

def df_to_jsonl(df, filename):
    """Write ``df`` to ``filename`` as JSON Lines.

    Each row becomes one line holding a JSON object with the row's
    ``instruction`` and ``input`` values; any other columns are ignored.
    An existing file is overwritten.
    """
    with open(filename, 'w') as sink:
        sink.writelines(
            json.dumps({"instruction": record["instruction"], "input": record["input"]}) + '\n'
            for _, record in df.iterrows()
        )

def save_data_jsonl(answer_id_df, sample_df, input_response_column, tmp_data_path):
    """Render every sampled answer sequence as a prompt and save them as JSONL.

    For each row of ``answer_id_df`` the answers listed in its ``answer_id_list``
    are looked up in ``sample_df`` and laid out, in list order, as
    "Question / <student answer label> [/ True answer]" blocks. The persona of the
    first answer becomes the record's ``instruction`` and the concatenated blocks
    its ``input``. Sequences none of whose ids occur in ``sample_df`` are skipped.

    Args:
        answer_id_df: frame with an ``answer_id_list`` column of AnswerId lists.
        sample_df: frame with ``AnswerId``, ``problem``, ``persona``, the
            ``input_response_column`` column and, unless that column is
            ``'Correctness'``, ``correct_answer_choice``. It is not modified.
        input_response_column: name of the column holding the student's response.
        tmp_data_path: output path without extension; ``'.jsonl'`` is appended.

    Returns:
        The path of the JSONL file that was written.

    Raises:
        ValueError: if the module-level ``PERSONA_TEMPLATE`` is not
            ``'first_pov'`` or ``'third_pov'``.
    """
    indicator_by_template = {'first_pov': 'Your answer', 'third_pov': 'Student answer'}
    if PERSONA_TEMPLATE not in indicator_by_template:
        # Fail up front with a clear message instead of an UnboundLocalError later on.
        raise ValueError(f'Unsupported PERSONA_TEMPLATE: {PERSONA_TEMPLATE!r}')
    student_answer_indicator = indicator_by_template[PERSONA_TEMPLATE]

    # Index a private copy by AnswerId once. Each sequence then costs a lookup of
    # its own few ids instead of a map + sort over the whole frame, and the
    # caller's frame no longer receives a temporary 'sort_order' column.
    indexed_df = sample_df.set_index('AnswerId', drop=False).rename_axis(None)
    known_ids = indexed_df.index

    instruction_list = []
    input_list = []
    for order_list in answer_id_df['answer_id_list']:
        # Position of each id within the sequence (the last position wins for a repeated id).
        order_dict = {answer_id: position for position, answer_id in enumerate(order_list)}
        ordered_ids = [answer_id for answer_id in sorted(order_dict, key=order_dict.get)
                       if answer_id in known_ids]
        if not ordered_ids:
            continue
        input_df = indexed_df.loc[ordered_ids]

        answer_blocks = input_df['problem'] + '\n' + student_answer_indicator + ':\n' + input_df[input_response_column]
        if input_response_column != 'Correctness':
            # Letter-coded answers are shown together with the correct choice.
            answer_blocks = answer_blocks + '\nTrue answer:\n' + input_df['correct_answer_choice']
        concatenated_string = 'Question:\n' + '\nQuestion:\n'.join(answer_blocks)

        instruction_list.append(input_df['persona'].iloc[0])
        input_list.append(concatenated_string)

    json_data = pd.DataFrame()
    json_data['instruction'] = instruction_list
    json_data['input'] = input_list
    json_file_path = tmp_data_path + '.jsonl'
    df_to_jsonl(json_data, json_file_path)
    return json_file_path

# Answer-id CSVs written for the current DATE. Matching on the '.csv' suffix (not on
# 'csv' anywhere in the name) skips stray files such as backups, and sorting makes
# the processing order independent of the order os.listdir happens to return.
all_answer_id_files = sorted(
    f for f in os.listdir(TRAIN_DATA_DIR)
    if 'answer_id' in f and f.endswith('.csv') and DATE in f
)
all_answer_id_files

In [None]:
# Turn every answer-id CSV into a JSONL prompt file stored next to it.
for answer_id_filename in all_answer_id_files:
    print(datetime.now())
    answer_id_df = pd.read_csv(TRAIN_DATA_DIR + answer_id_filename)
    # Full ordered id list of each data point: context ids followed by the held-out id.
    id_columns = answer_id_df[['input_answer_id', 'output_answer_id']]
    answer_id_df['answer_id_list'] = id_columns.apply(get_answer_id_list, axis=1)
    # Output name: DATE + '_data_with_' + the CSV's name without its extension.
    csv_stem = answer_id_filename.split('.csv')[0]
    tmp_data_path = TRAIN_DATA_DIR + DATE + '_data_with_' + csv_stem
    json_file_path = save_data_jsonl(answer_id_df, df, input_response_column, tmp_data_path)
    print(json_file_path)

In [None]:
# Load back the validation JSONL written above. The path is assembled from the same
# pieces used when the file was saved (DATA_DIR, DATA_PATH, DATE and the number of
# validation users), so it follows the active dataset instead of pointing at one
# hard-coded absolute eedi file.
# NOTE(review): load_dataset is not imported in the cells shown here - presumably it
# arrives through one of the star imports; confirm.
val_jsonl_path = (DATA_DIR + DATA_PATH + DATE + '_data_with_val_answer_id_' + DATE
                  + '_' + str(val_df['UserId'].nunique()) + '.jsonl')
data = load_dataset('json', data_files=val_jsonl_path, split='train')


In [None]:
def formatting_func(row):
    """Return the record's ``instruction`` and ``input`` joined by a single newline."""
    instruction, prompt_input = row["instruction"], row["input"]
    return f"{instruction}\n{prompt_input}"

# Print the first record's rendered prompt as a quick visual check.
print(formatting_func(data[0]))