In [1]:
import time
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from sentence_transformers.util import cos_sim
from IPython.display import clear_output

import pickle
def save_obj(obj:object,name:str):
    ext = '.pickle'
    with open(name + ext, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name:str)->object:
    ext = '.pickle'
    with open(name + ext, 'rb') as handle:
        return pickle.load(handle)

In [2]:
def match_keywords(
    keywords_emb:list[np.ndarray],
    candidates_emb : list[np.ndarray],
    thershold: float)\
        -> float:
    """
    match keywords with candidates in a document

    Args:
        keywords_emb (List[np.ndarray]): list of keywords embeddings
        candidates_emb (List[np.ndarray]): list of document's candidates embeddings
        thershold (float): threshold

    Returns:
        float: score
    
    example:
        >>> match_keywords(keywords_emb, candidates_emb, thershold=0.5)
        >>> 0.8
    """
    similarities = list(map(lambda cand:
                    cos_sim(np.array(keywords_emb), cand.reshape(cand.shape[0],cand.shape[1])).__array__().max(axis=1).round(6).clip(-1, 1),
                    candidates_emb))

    return similarities

def grading(keywords_embeddings_list,students_candidates_emb_list,thershold=0.5):
    """
    Args:
        keywords_embeddings_list: list of list of list of embeddings
        students_candidates_emb_list: list of list of list of embeddings
        thershold: thershold for the similarity
    Returns:
        a list of list of list of grades
    """
    grades = []
    for i in range(len(keywords_embeddings_list)):
        # for j in range(len(keywords_embeddings_list[i])):
        #     grades.append(key_words.match_keywords(keywords_embeddings_list[i][j],
        #                                     students_candidates_emb_list[i][j],
        #                                     thershold))
        
        # map(lambda j: grades.append(key_words.match_keywords(keywords_embeddings_list[i][j],)))
        grades.append(np.array(list(map(lambda st_cand:
                match_keywords(keywords_embeddings_list[i], st_cand,
                thershold=thershold),
                students_candidates_emb_list[i]
                ))))
    grades = np.array(list(map(lambda sim: (sim.__array__().max(axis=1) >thershold).sum(axis=1)/float(sim.shape[-1]) , grades)))
    return grades

In [3]:
train_path = "data/train_phase1.tsv"
df = pd.read_csv(train_path, sep="\t")

In [4]:
df_dict ={}

for i in range(1,11):
    df_dict['ess_'+str(i)+'_keywords'] = pd.DataFrame(df[df['EssaySet'] == i]['score_gn_1'])
    # remove index
    df_dict['ess_'+str(i)+'_keywords'].reset_index(drop=True, inplace=True)

In [5]:
for essay in range(1,11):
# for essay in range(1,11)[:2]:
# for essay in range(3,4):
    print("Loading essay",essay,"...")
    kwrds = load_obj(f'data/results/keywords_res_essay_{essay}')
    # grid search for the best threshold
    thresholds = np.arange(0.2, 0.95, 0.03).round(3)
    for threshold in thresholds:
        print(f"Grading essay {essay} threshold {threshold} ...")
        x = grading(kwrds['keywords_embeddings_list'],kwrds['students_candidates_emb_list'],threshold)
        df_dict[f'ess_{essay}_keywords'][f'keys_score_{threshold}'] = np.median(x ,axis=0)
        clear_output()
print("Done!")

Done!


In [100]:
df_dict[f'ess_{essay}_keywords'].head()

Unnamed: 0,score_gn_1,x,keys_score_0.2,keys_score_0.23,keys_score_0.26,keys_score_0.29,keys_score_0.32,keys_score_0.35,keys_score_0.38,keys_score_0.41,...,keys_score_0.65,keys_score_0.68,keys_score_0.71,keys_score_0.74,keys_score_0.77,keys_score_0.8,keys_score_0.83,keys_score_0.86,keys_score_0.89,keys_score_0.92
0,0.282827,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.571429,0.571429,0.428571,0.428571,0.285714,0.142857,0.142857,0.0,0.0,0.0
1,0.367338,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.857143,0.857143,...,0.428571,0.285714,0.285714,0.285714,0.242857,0.142857,0.142857,0.0,0.0,0.0
2,0.321319,0.0,1.0,1.0,1.0,1.0,1.0,0.857143,0.714286,0.571429,...,0.285714,0.285714,0.214286,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0
3,0.013847,0.0,1.0,0.857143,0.535714,0.428571,0.285714,0.142857,0.071429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.690441,0.0,1.0,1.0,1.0,1.0,1.0,0.857143,0.857143,0.732143,...,0.285714,0.267857,0.183333,0.183333,0.154762,0.142857,0.142857,0.142857,0.0,0.0


In [151]:
# mean square error between the grades and the score_gn_1
mse = []
thresholds_best = []
for essay in range(1,2):
    thresholds = np.arange(0.2, 0.95, 0.03).round(3)
    for threshold in thresholds:
        mse.append(np.max((df_dict[f'ess_{essay}_keywords'][f'keys_score_{threshold}'] - df_dict[f'ess_{essay}_keywords']['score_gn_1'])**2))
    # select the best 3 thresholds
    thresholds_best = thresholds[np.argsort(mse)[:3]]
thresholds_best

array([0.56, 0.44, 0.47])

In [132]:
save_obj(df_dict[f'ess_{essay}_keywords']['keys_score_0.53'], 'data/results/keys_score_0.53_essay')