In [1]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt


def provide_confusion_matrix(GoldLabel, PredictLabel, label_set):
    """
    Build, print and plot the confusion matrix of the model predictions.

    Uses `sklearn.metrics.confusion_matrix` to compute the matrix and
    `sklearn.metrics.ConfusionMatrixDisplay` to draw it.

    Parameters
    ----------
    GoldLabel : list
        list of all Gold labels
    PredictLabel : list
        list of all Prediction labels
    label_set : list
        list of all classes; it fixes the row/column order of the matrix
        and is also used for the tick labels of the plot

    Returns
    -------
        Confusion matrix
    """
    labels = list(label_set) if label_set is not None else None

    # Pass the classes explicitly: otherwise the matrix is built from whatever
    # labels occur in gold/predictions, which may not match `label_set` in
    # size or order and would make the tick labels below wrong.
    cf_matrix = confusion_matrix(GoldLabel, PredictLabel, labels=labels)
    print(cf_matrix)  # plain-text view of the matrix

    display = ConfusionMatrixDisplay(confusion_matrix=cf_matrix, display_labels=labels)
    _, ax = plt.subplots(figsize=(15, 15))
    # Vertical x tick labels keep long class names from overlapping.
    display.plot(ax=ax, xticks_rotation=90)
    plt.show()
    return cf_matrix

def calculate_precision_recall_f1score(GoldLabel, PredictLabel, label_set):
    """
    Print and return the `sklearn.metrics.classification_report` of the predictions.

    Parameters
    ----------
    GoldLabel : list
        list of all Gold labels
    PredictLabel : list
        list of all Prediction labels
    label_set : list
        list of all classes to report on; also used (as strings) for the row names

    Returns
    -------
        Classification report
    """
    if label_set is None:
        labels = None
        names = None
    else:
        labels = list(label_set)
        # Row names must be strings for the report formatting.
        names = [str(label) for label in labels]

    # `labels` is passed together with `target_names` so the names are tied to
    # the classes explicitly. With only `target_names`, sklearn raises a
    # ValueError as soon as the predictions contain a class missing from
    # `label_set` (number of classes != number of names).
    report = classification_report(
        GoldLabel,
        PredictLabel,
        labels=labels,
        target_names=names,
        digits=3,
    )
    print(report)
    return report

def evaluation_model(GoldLabel, PredictLabel):
    """
    Evaluate predictions by calling `calculate_precision_recall_f1score`
    and `provide_confusion_matrix`.

    Parameters
    ----------
    GoldLabel : list
        list of all Gold labels
    PredictLabel : list
        list of all Prediction labels

    Returns
    -------
        Classification report and Confusion matrix
    """
    # Use every class that occurs in either the gold or the predicted labels.
    # Taking gold only breaks the report / matrix labelling whenever the model
    # predicts a class that never appears in the gold data.
    label_set = sorted(set(GoldLabel) | set(PredictLabel))
    print(label_set)

    print('precision_recall_f1-score')
    report = calculate_precision_recall_f1score(GoldLabel, PredictLabel, label_set)

    print('Confusion matrix')
    cf_matrix = provide_confusion_matrix(GoldLabel, PredictLabel, label_set)

    return report, cf_matrix


In [5]:
import pickle
def load_data(file_path):
    """
    Load the preprocessed pickle that belongs to a `.conllu` file.

    Parameters
    ----------
    file_path : str
        path of the `.conllu` file; the trailing `.conllu` extension is
        swapped for `.preprocessed.pkl` to find the pickle. A path without
        that extension is opened unchanged.

    Returns
    -------
        The unpickled object
    """
    path = str(file_path)
    suffix = '.conllu'
    # Only rewrite the extension at the end of the path, so a directory whose
    # name happens to contain '.conllu' is left untouched.
    if path.endswith(suffix):
        path = path[:-len(suffix)] + '.preprocessed.pkl'

    # NOTE: pickle.load can execute arbitrary code; only load files you created
    # yourself or otherwise trust.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


In [9]:
# Path of the training .conllu file; load_data derives the matching
# '.preprocessed.pkl' file name from it and unpickles that file.
train_file_path = 'en_ewt-up-train.conllu'
train_data = load_data(train_file_path)


In [10]:
# Flatten the nested train_data (sentences -> tokens) into one list of token dicts.
word_list = []
for sentence in train_data:
    for token in sentence:
        # Copy every entry of the nested 'features' dict to the top level of
        # the token dict (this mutates the dicts held by train_data).
        token.update(token['features'])
        word_list.append(token)


In [11]:
# Sanity check: total number of token dicts collected in word_list.
print(len(word_list))

1028137


In [15]:
import pandas as pd 

# One row per token dict; keys missing from a token become NaN in its row.
train_df = pd.DataFrame(word_list)
train_df.head()

Unnamed: 0,id,form,lemma,upos,xpos,features,head,dependency_relation,dependency_graph,miscellaneous,...,Case,Person,NumType,Voice,Gender,Poss,Reflex,Typo,Foreign,Abbr
0,1,Al,Al,PROPN,NNP,"{'Number': 'Sing', 'embedding': [0.0, 0.0, 0.0...",0,root,0:root,SpaceAfter=No,...,,,,,,,,,,
1,2,-,-,PUNCT,HYPH,"{'embedding': [-0.52482, -0.31963, -0.11898, -...",1,punct,1:punct,SpaceAfter=No,...,,,,,,,,,,
2,3,Zaman,Zaman,PROPN,NNP,"{'Number': 'Sing', 'embedding': [0.0, 0.0, 0.0...",1,flat,1:flat,_,...,,,,,,,,,,
3,4,:,:,PUNCT,:,"{'embedding': [0.43607, 1.5253, -0.11532, 0.33...",1,punct,1:punct,_,...,,,,,,,,,,
4,5,American,american,ADJ,JJ,"{'Degree': 'Pos', 'embedding': [0.15796, 0.012...",6,amod,6:amod,_,...,,,,,,,,,,


In [16]:
# Remove every column that is not used as a model input; what remains are the
# feature columns plus 'argument' (split off as the label in a later cell).
unused_train_columns = [
    'id', 'features', 'Definite', 'PronType', 'Number', 'Mood', 'Person',
    'Tense', 'VerbForm', 'head', 'dependency_graph', 'miscellaneous',
    'head_pp_feature', 'prev_token_morph_features',
    'next_token_morph_features', 'embedding_head', 'punct_extracted',
    'NumType', 'Degree', 'Case', 'Gender', 'Poss', 'Voice', 'Foreign',
    'Reflex', 'Typo', 'num_of_children', 'Abbr', 'propbank_arg',
]
train_df = train_df.drop(columns=unused_train_columns)

In [17]:
# Preview the columns that are left after the drop.
train_df.head()

Unnamed: 0,form,lemma,upos,xpos,dependency_relation,predicate,argument,embedding,pos_extracted,position_rel2pred,head_pos,dep_path,cosine_similarity_w_predicate,pos_misc_feature,ner
0,Al,Al,PROPN,NNP,root,kill.01,_,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",PROPN,Before,root,_,[[0.0]],PROPN_no_space,PERSON
1,-,-,PUNCT,HYPH,punct,kill.01,_,"[-0.52482, -0.31963, -0.11898, -0.62672, 0.043...",PUNCT,Before,PROPN,[HYPH],[[0.2082173]],PUNCT_no_space,_
2,Zaman,Zaman,PROPN,NNP,flat,kill.01,_,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",PROPN,Before,PROPN,[NNP],[[0.0]],PROPN_space,_
3,:,:,PUNCT,:,punct,kill.01,_,"[0.43607, 1.5253, -0.11532, 0.33558, 0.36617, ...",PUNCT,Before,PROPN,[:],[[0.3154642]],PUNCT_space,_
4,American,american,ADJ,JJ,amod,kill.01,_,"[0.15796, 0.012358, 0.1681, -0.81207, 0.34308,...",ADJ,Before,NOUN,"[JJ, NNS, VBD]",[[0.27067295]],ADJ_space,NORP


In [18]:
# Split the label column off: 'argument' becomes the training target and is
# removed from the feature frame so it cannot be used as an input.
train_golds = train_df.loc[:, 'argument'].copy()
train_df = train_df.drop('argument', axis='columns')

In [19]:
# Preview the feature frame now that 'argument' has been removed.
train_df.head()

Unnamed: 0,form,lemma,upos,xpos,dependency_relation,predicate,embedding,pos_extracted,position_rel2pred,head_pos,dep_path,cosine_similarity_w_predicate,pos_misc_feature,ner
0,Al,Al,PROPN,NNP,root,kill.01,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",PROPN,Before,root,_,[[0.0]],PROPN_no_space,PERSON
1,-,-,PUNCT,HYPH,punct,kill.01,"[-0.52482, -0.31963, -0.11898, -0.62672, 0.043...",PUNCT,Before,PROPN,[HYPH],[[0.2082173]],PUNCT_no_space,_
2,Zaman,Zaman,PROPN,NNP,flat,kill.01,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",PROPN,Before,PROPN,[NNP],[[0.0]],PROPN_space,_
3,:,:,PUNCT,:,punct,kill.01,"[0.43607, 1.5253, -0.11532, 0.33558, 0.36617, ...",PUNCT,Before,PROPN,[:],[[0.3154642]],PUNCT_space,_
4,American,american,ADJ,JJ,amod,kill.01,"[0.15796, 0.012358, 0.1681, -0.81207, 0.34308,...",ADJ,Before,NOUN,"[JJ, NNS, VBD]",[[0.27067295]],ADJ_space,NORP


In [21]:
# One plain dict per row, the input format expected by DictVectorizer.
train_data_dict = train_df.to_dict(orient='records')

# The list-valued columns are turned into strings so each record only holds
# scalar values. DictVectorizer one-hot encodes string values, so every
# distinct string becomes its own indicator feature.
# NOTE(review): this drops the numeric structure of the embedding and the
# cosine similarity - confirm that is intended.
for record in train_data_dict:
    for key in ('embedding', 'cosine_similarity_w_predicate'):
        record[key] = str(record[key])


In [None]:
# Count missing values per remaining column.
train_df.isna().sum()

In [None]:
# logistic_regression model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import numpy as np

In [None]:
# Vectorize the feature dicts; `vec` is kept so the same fitted mapping can be
# applied to other data later.
vec = DictVectorizer()
X_transformed = vec.fit_transform(train_data_dict)

# Logistic regression with default settings, trained on the 'argument' labels.
# `fit` returns the estimator itself, so `model` and `logreg` are the same object.
logreg = LogisticRegression()
model = logreg.fit(X_transformed, train_golds)


# Test

In [None]:
# Path of the test .conllu file; load_data derives the matching
# '.preprocessed.pkl' file name from it and unpickles that file.
test_file_path = 'en_ewt-up-test.conllu'
test_data = load_data(test_file_path)


In [None]:
# Flatten the nested test_data (sentences -> tokens) into one list of token dicts.
word_list_test = []
for sentence in test_data:
    for token in sentence:
        # Copy every entry of the nested 'features' dict to the top level of
        # the token dict (this mutates the dicts held by test_data).
        token.update(token['features'])
        word_list_test.append(token)


In [None]:
# Sanity check: total number of token dicts collected in word_list_test.
print(len(word_list_test))

In [None]:
# Build the evaluation frame from the flattened *test* tokens. Using
# `word_list` here would silently evaluate the model on its own training data.
df_test = pd.DataFrame(word_list_test)
df_test.head()

In [None]:
# Remove the same unused columns as for the training frame, so df_test ends up
# with the same feature columns plus 'argument'.
unused_test_columns = [
    'id', 'features', 'Definite', 'PronType', 'Number', 'Mood', 'Person',
    'Tense', 'VerbForm', 'head', 'dependency_graph', 'miscellaneous',
    'head_pp_feature', 'prev_token_morph_features',
    'next_token_morph_features', 'embedding_head', 'punct_extracted',
    'NumType', 'Degree', 'Case', 'Gender', 'Poss', 'Voice', 'Foreign',
    'Reflex', 'Typo', 'num_of_children', 'Abbr', 'propbank_arg',
]
df_test = df_test.drop(columns=unused_test_columns)

In [None]:
# Preview the columns of df_test that are left after the drop.
df_test.head()

In [None]:
# Split the label column off df_test: 'argument' is kept as the gold labels
# for evaluation and removed from the feature frame.
golds_test = df_test.loc[:, 'argument'].copy()
df_test = df_test.drop('argument', axis='columns')

In [None]:
# Preview df_test now that 'argument' has been removed.
df_test.head()

In [None]:
# One plain dict per row of df_test, the input format expected by DictVectorizer.
data_dict_test = df_test.to_dict(orient='records')

# Same conversion as for the training records: the list-valued columns are
# stringified so their values match the string features seen during fitting.
for record in data_dict_test:
    for key in ('embedding', 'cosine_similarity_w_predicate'):
        record[key] = str(record[key])


In [None]:
# Count missing values per remaining column of df_test.
df_test.isna().sum()

In [None]:
# transform (not fit_transform): reuse the feature mapping fitted on the
# training records; per the DictVectorizer docs, features not seen during
# fitting are silently ignored.
X_test_transformed = vec.transform(data_dict_test)


In [None]:
# Predicted 'argument' label for every row of X_test_transformed.
preds = model.predict(X_test_transformed)

In [None]:
# evaluation_model already prints the report and shows the confusion matrix.
# Binding its return value keeps both results available for later cells and
# stops the notebook from echoing the raw (report, matrix) tuple as cell output.
eval_report, eval_cf_matrix = evaluation_model(golds_test, preds)