In [156]:
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
from datasets import load_dataset
from openai import OpenAI

In [330]:
# Hugging Face Hub repositories of the four benchmark datasets, listed in the
# same order as the names they are bound to below.
_BENCHMARK_REPOS = (
    "kz-transformers/mmlu-translated-kk",
    "kz-transformers/kazakh-constitution-mc",
    "kz-transformers/kazakh-dastur-mc",
    "kz-transformers/kazakh-unified-national-testing-mc",
)
mmlu, const, dastur, ent = map(load_dataset, _BENCHMARK_REPOS)

In [333]:
# Read the API key from the environment so it is never stored in the notebook;
# this raises KeyError if OPENAI_API_KEY is not set.
openai_key = os.environ['OPENAI_API_KEY']

client = OpenAI(api_key=openai_key)

In [61]:
# ID of the remote file that holds the model responses.
# NOTE(review): hard-coded and tied to one specific upload/run — confirm it is
# still the intended file before re-running.
RESPONSES_FILE_ID = 'file-FQdij62Y18TyFwjwMVbXyN'
file_response = client.files.content(RESPONSES_FILE_ID)

In [62]:
# Text body of the downloaded file; it is written to disk and parsed as
# JSON Lines in the following cells.
response = file_response.text

In [67]:
# Cache the downloaded responses on disk.
# The file is written as UTF-8 explicitly: it is read back with pd.read_json,
# which decodes as UTF-8 by default, whereas open() without `encoding` falls
# back to the platform locale. Plain 'w' is enough because the handle is never
# read from.
with open('response.jsonl', 'w', encoding='utf-8') as f:
    f.write(response)

In [68]:
# Parse the cached file as JSON Lines (one JSON record per line).
x = pd.read_json('response.jsonl', lines=True)

In [83]:
# Pull the message text of the first choice out of every response record, in
# file order. Iterating the column directly avoids DataFrame.iterrows(), which
# builds a throwaway Series for each row.
predicts = [
    record['body']['choices'][0]['message']['content']
    for record in x['response']
]

In [360]:
# Peek at the first five extracted answers.
predicts[:5]

['B', 'A', 'A', 'C', 'C']

In [289]:
# Re-read the cached responses (same call as the earlier load), rebinding `x`
# to a frame built straight from the file.
x = pd.read_json('response.jsonl', lines=True)

In [138]:
# Gold answers, gathered in a fixed order: the `mmlu` validation split, then
# the `const` and `dastur` test splits, then every split of `ent`.
labels = [entry['Correct Answer'] for entry in mmlu["validation"]]

for benchmark in (const, dastur):
    labels.extend(entry['Correct Answer'] for entry in benchmark["test"])

# `ent` is walked split by split and stores its answer under a differently
# spelled key ('correct_answer').
for subject_split in ent.values():
    labels.extend(entry['correct_answer'] for entry in subject_split)

In [231]:
# Labels and predictions are paired purely by position further down, so the
# two sequences must have the same length; fail loudly here if they do not.
assert len(labels) == len(predicts), "label/prediction count mismatch"
len(labels), len(predicts)

(17800, 17800)

In [140]:
from sklearn.metrics import classification_report

In [149]:
# Normalise the gold labels in place: the look-alike letter 'А' (a different
# character from Latin 'A' — presumably Cyrillic U+0410, TODO confirm) is
# replaced by Latin 'A'; every other label is left untouched.
labels[:] = ['A' if answer == 'А' else answer for answer in labels]

In [186]:
# Per-class precision / recall / F1 of the predictions against the gold labels.
# NOTE(review): `final_predicts` is not assigned anywhere in the visible cells
# (only `predicts` is) — presumably a cleaned copy of `predicts` produced by a
# cell that has since been removed. Restore that step, otherwise this raises
# NameError on a fresh kernel.
print(classification_report(labels, final_predicts))

              precision    recall  f1-score   support

           A       0.60      0.63      0.61      4707
           B       0.54      0.70      0.61      3288
           C       0.58      0.61      0.60      3919
           D       0.63      0.56      0.59      3059
           E       0.71      0.46      0.56      2827

    accuracy                           0.60     17800
   macro avg       0.61      0.59      0.60     17800
weighted avg       0.61      0.60      0.60     17800



In [290]:
# Attach predictions and gold labels to the response rows; both are matched to
# the rows purely by position.
# NOTE(review): `final_predicts` is not defined in the visible cells — confirm
# where it comes from.
# NOTE(review): `labels` follows dataset iteration order while the rows follow
# the line order of the response file. If that file is an OpenAI Batch output,
# its line order is not guaranteed to match the input order — verify, or join
# on `custom_id` instead of relying on position.
x['predicts'] = final_predicts
x['labels']   = labels

In [291]:
# The category is the second-to-last '-'-separated token of each custom_id.
x['category'] = x['custom_id'].apply(lambda custom_id: custom_id.split('-')[-2])

In [208]:
from sklearn.metrics import accuracy_score

In [292]:
# Short category names (as parsed from custom_id) -> names used when reporting
# results. One lookup table replaces ten copy-pasted .apply passes; a name
# with no entry is kept unchanged, exactly as before.
# NOTE(review): 'kazakh_and_literature_unt_mc' is the only target without the
# 'kk_' prefix — kept as-is; confirm it matches the expected task name.
CATEGORY_DISPLAY_NAMES = {
    'mmlu': 'mmlu_translated_kk',
    'const': 'kk_constitution_mc',
    'biology': 'kk_biology_unt_mc',
    'dastur': 'kk_dastur_mc',
    'english': 'kk_english_unt_mc',
    'kazakh_and_literature': 'kazakh_and_literature_unt_mc',
    'geography': 'kk_geography_unt_mc',
    'history_of_kazakhstan': 'kk_history_of_kazakhstan_unt_mc',
    'human_society_rights': 'kk_human_society_rights_unt_mc',
    'world_history': 'kk_world_history_unt_mc',
}
x['category'] = x['category'].map(
    lambda name: CATEGORY_DISPLAY_NAMES.get(name, name)
)

In [294]:
# Distinct category names found in the frame.
categories = x['category'].unique()

In [325]:
# Result record: fixed metadata fields plus one accuracy entry per category.
state = {
    "model_dtype": "torch.float16",
    "model": "gpt-4o-mini",
    "ppl": 0,
}

for category in categories:
    subset = x[x['category'] == category]
    # Score each category once and reuse the value for printing and storing.
    score = accuracy_score(subset['labels'], subset['predicts'])

    print(category)
    print(score)
    print()

    state[category] = score

mmlu_translated_kk
0.5623775310254735

kk_constitution_mc
0.9565217391304348

kk_dastur_mc
0.9383084577114428

kazakh_and_literature_unt_mc
0.4953071672354949

kk_geography_unt_mc
0.5675203725261933

kk_world_history_unt_mc
0.6091205211726385

kk_history_of_kazakhstan_unt_mc
0.47883435582822087

kk_english_unt_mc
0.6763768775603095

kk_biology_unt_mc
0.607421875

kk_human_society_rights_unt_mc
0.7309417040358744



In [358]:
# Persist the result record as a one-element JSON list.
# `json` must be imported at the top of the notebook (it was previously used
# here without an import); the encoding is stated explicitly so the file does
# not depend on the platform locale.
with open('submit_resul_gpt4o-mini.json', 'w', encoding='utf-8') as f:
    json.dump([state], f)

# compute dummy random baseline

In [322]:
# Random-guess baseline: for every category, draw one answer per row uniformly
# from the distinct gold labels that occur in that category and score it.
# A dedicated, seeded generator makes the numbers reproducible across runs
# (the previous version used the unseeded global NumPy state).
BASELINE_SEED = 42
baseline_rng = np.random.default_rng(BASELINE_SEED)

responses = dict()
state = {"model_dtype": "torch.float16",
        "model": "dummy-random-baseline",
        "ppl": 0}

for category in categories:
    buffer = x[x.category == category]

    # The candidate answers are the same for every row of the category, so
    # compute them once and draw all guesses in a single vectorised call.
    answer_options = buffer.labels.unique()
    random_preds = baseline_rng.choice(answer_options, size=len(buffer)).tolist()
    baseline_acc = accuracy_score(buffer.labels, random_preds)

    print()
    print(category)
    print(baseline_acc)

    responses[category] = {
        'acc,none': baseline_acc,
        # NOTE(review): fixed placeholder value, not a computed standard error.
        'acc_stderr,none': 0.01,
        'alias': category
    }

    state[category] = baseline_acc


mmlu_translated_kk
0.22991508817766165

kk_constitution_mc
0.25120772946859904

kk_dastur_mc
0.24477611940298508

kazakh_and_literature_unt_mc
0.2090443686006826

kk_geography_unt_mc
0.2019790454016298

kk_world_history_unt_mc
0.1986970684039088

kk_history_of_kazakhstan_unt_mc
0.19417177914110428

kk_english_unt_mc
0.189804278561675

kk_biology_unt_mc
0.22330729166666666

kk_human_society_rights_unt_mc
0.242152466367713


In [323]:
# Persist the baseline result record as a one-element JSON list.
# `json` must be imported at the top of the notebook (it was previously used
# here without an import); the encoding is stated explicitly so the file does
# not depend on the platform locale.
with open('submit_resul_dummy_baseline.json', 'w', encoding='utf-8') as f:
    json.dump([state], f)

'[{"model_dtype": "torch.float16", "model": "dummy-random-baseline", "ppl": 0, "mmlu_translated_kk": 0.22991508817766165, "kk_constitution_mc": 0.25120772946859904, "kk_dastur_mc": 0.24477611940298508, "kazakh_and_literature_unt_mc": 0.2090443686006826, "kk_geography_unt_mc": 0.2019790454016298, "kk_world_history_unt_mc": 0.1986970684039088, "kk_history_of_kazakhstan_unt_mc": 0.19417177914110428, "kk_english_unt_mc": 0.189804278561675, "kk_biology_unt_mc": 0.22330729166666666, "kk_human_society_rights_unt_mc": 0.242152466367713}]'