# Mergeall: combine all per-model outputs into one dataset


In [1]:
import sys
sys.path.append('src/')
from dollarparser import parse_dollar_amount
from detailed_dunns import detailed_dunns, better_posthoc_dunns

import os
import json
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import glob
import matplotlib.pyplot as plt
import numpy as np
import datetime

import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including pandas and library deprecation warnings. Consider narrowing the filter
# to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Plot styling and display limit: show up to 100 rows before pandas truncates a frame.
sns.set_theme(context='notebook', style='whitegrid')
pd.set_option("display.max_rows", 100)

In [2]:
# Timestamp taken at the start of the run; the last cell subtracts it from the
# current time to report the elapsed duration.
start_run = datetime.datetime.now()

# Data Cleaning

In [8]:
# Gather every zipped JSONL output file; sorting gives a stable processing order
# because glob makes no ordering guarantee.
fns = sorted(glob.glob("output_data/*.jsonl.zip"))
print(fns)

['output_data/emp_name_major_Llama-3-8B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Llama-3.1-8B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Llama-3.2-1B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Llama-3.2-3B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Mistral-7b-instruct-v0.1_output.jsonl.zip', 'output_data/emp_name_major_Mistral-7b-instruct-v0.3_output.jsonl.zip', 'output_data/emp_name_major_Phi-3-mini-4k-instruct_output.jsonl.zip', 'output_data/emp_name_major_Qwen2.5-0.5B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Qwen2.5-1.5B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Qwen2.5-3B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_Qwen2.5-7B-Instruct_output.jsonl.zip', 'output_data/emp_name_major_gemma-2-2b-it_output.jsonl.zip', 'output_data/emp_name_major_gemma-2-9b-it_output.jsonl.zip', 'output_data/emp_name_major_gemma-2b-it_output.jsonl.zip', 'output_data/emp_name_major_gemma-7b-it_output.jsonl.zip',

In [9]:
# Bounds passed to parse_dollar_amount. Per the original note on dollarparser.py,
# amounts below the minimum or above the maximum come back as NaN.
MIN_SALARY = 30000
MAX_SALARY = 300000

# The seed table is identical for every output file, so read it and derive its
# join key once instead of on every loop iteration.
df_seed = pd.read_csv("input_data/name_major_seed.csv")
df_seed['custom_id'] = 'task-' + df_seed['run_id'].astype(str)

dfs = []

for file in tqdm(fns):
    df = pd.read_json(file, lines=True)

    # Flatten to one row per entry of response.body.choices, carrying custom_id
    # and response.body.model along as metadata columns.
    df = pd.json_normalize(
        df.to_dict(orient='records'),
        record_path=['response', 'body', 'choices'],
        meta=[['custom_id'], ['response', 'body', 'model']],
        record_prefix='choices.'
    )

    # Keep only the three fields needed downstream, under short names.
    df_final = df[['custom_id', 'response.body.model', 'choices.message.content']].rename(
        columns={
            'response.body.model': 'model',
            'choices.message.content': 'content',
        }
    )

    # Inner join: rows without a matching custom_id on either side are dropped.
    df_merged = pd.merge(df_final, df_seed, on='custom_id', how='inner')

    # Parse the free-text model reply into a numeric dollar amount.
    df_merged['query_response'] = df_merged['content'].apply(
        parse_dollar_amount, args=(MIN_SALARY, MAX_SALARY)
    )

    dfs.append(df_merged)

# The only possible source of a "query_response_raw" column at this point is the
# seed table, so drop it and let the model's raw reply ("content") take that name.
# NOTE: the index is intentionally left as-is, so labels repeat once per input file.
df_all = (
    pd.concat(dfs)
    .drop(columns="query_response_raw")
    .rename(columns={"content": "query_response_raw"})
)
len(df_all)

100%|██████████| 20/20 [01:00<00:00,  3.03s/it]


986000

In [5]:
# Preview the combined frame; it is far longer than display.max_rows, so pandas
# renders only the leading and trailing rows.
df_all

Unnamed: 0,custom_id,model,query_response_raw,run_id,name,gender,race,major,query_response
0,task-0,meta-llama/Meta-Llama-3-8B-Instruct,"$120,000",0,Osama Mubbaarak,Man,Arabic,Business,120000.0
1,task-1,meta-llama/Meta-Llama-3-8B-Instruct,"$105,000",1,Osama Mubbaarak,Man,Arabic,Business,105000.0
2,task-2,meta-llama/Meta-Llama-3-8B-Instruct,"$85,000",2,Osama Mubbaarak,Man,Arabic,Business,85000.0
3,task-3,meta-llama/Meta-Llama-3-8B-Instruct,"$125,000",3,Osama Mubbaarak,Man,Arabic,Business,125000.0
4,task-4,meta-llama/Meta-Llama-3-8B-Instruct,"$135,000",4,Osama Mubbaarak,Man,Arabic,Business,135000.0
...,...,...,...,...,...,...,...,...,...
49295,task-49295,gpt-4o-mini-2024-07-18,"$100,000",49295,None-Control,None-Control,None-Control,None-Control,100000.0
49296,task-49296,gpt-4o-mini-2024-07-18,"$105,000",49296,None-Control,None-Control,None-Control,None-Control,105000.0
49297,task-49297,gpt-4o-mini-2024-07-18,"$120,000",49297,None-Control,None-Control,None-Control,None-Control,120000.0
49298,task-49298,gpt-4o-mini-2024-07-18,"$105,000",49298,None-Control,None-Control,None-Control,None-Control,105000.0


In [10]:
# Columns written to the processed dataset; helper columns such as run_id are left out.
keepcols = ['custom_id', 'model', 'name', 'gender', 'race', 'major', 'query_response_raw', 'query_response']

# to_csv does not create missing parent directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs("processed_data", exist_ok=True)
df_all[keepcols].to_csv("processed_data/emp_name_major_allmodels.csv.zip", index=False, compression='zip')

In [7]:
# Report how long the notebook took, measured from the start_run timestamp.
elapsed = datetime.datetime.now() - start_run
print("Elapsed time:", elapsed)

Elapsed time: 0:01:01.197407
