In [2]:
import pickle
import pprint  # Add this line
import pandas as pd 
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from matplotlib import pyplot as plt

In [3]:
def load_data(file_path):
    with open(file_path.replace('.conllu', '.preprocessed.pkl'), 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [4]:
def prepare_for_model(data):
    """Convert feature dict to pandas DataFrame to handle data for training.
    Return a pandas df with the relevant features for training the model.

    Paramenters:
    -data: a list of objects, where each object represents one 'frame' in a sentence.
    """
    list_features = [] #creating an empty list where the data will be stored
    
    for sentences in data:
        for token_dict in sentences:
            dict_feat = token_dict['features'] #grabbing the nested dictionaries where the features are stored
            list_features.append(dict_feat) #appending the dict to the list

    df = pd.DataFrame(list_features) #converting the list of dictionaries into a pandas dataframe, as seen at https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe

    #selecting the features that are needed for the model, which are now columns in the df
    df = df[['embedding', 
         'pos_extracted', 
         'position_rel2pred',
         'embedding_head',
         'num_of_children',
         'punct_extracted',
         'head_pos',
         'dep_path',
         'cosine_similarity_w_predicate',
        'pos_misc_feature',
        'head_pp_feature',
        'ner',
        'propbank_arg']] 

    return df

In [5]:
def extract_gold_labels(data_file):
    '''
    Extract gold labels.
    Return a list of gold labels.
    
    :param data_file: a list of objects, where each object represents one 'frame' in a sentence.
    :type data_file: string
    '''
    labels = []
    
    for sentence in data_file:
        for token_dict in sentence:
            #adding gold label to labels
            gold_label = token_dict['argument']
            labels.append(gold_label)

    return labels

In [6]:
dev_file_path = 'data/en_ewt-up-dev.conllu'
train_file_path = 'data/en_ewt-up-train.conllu'
test_file_path = 'data/en_ewt-up-test.conllu'

In [7]:
dev_data = load_data(dev_file_path)
train_data = load_data(train_file_path)
test_data = load_data(test_file_path)

In [8]:
train_df = prepare_for_model(train_data)
dev_df = prepare_for_model(dev_data)
test_df = prepare_for_model(test_data)

In [10]:
test_df.isna().sum()

embedding                            0
pos_extracted                        0
position_rel2pred                    0
embedding_head                       0
num_of_children                      0
punct_extracted                      0
head_pos                             0
dep_path                             0
cosine_similarity_w_predicate        0
pos_misc_feature                     0
head_pp_feature                  92870
ner                                  0
propbank_arg                     37528
dtype: int64

In [9]:
def df_to_dict(dataframe):
    """Convert a pandas dataframe to a python dictionary.
    Return a dictionary containing the name of the feature as key and the respective feature as value.

    Parameter:
    -dataframe: dataframe containing the features for the model.
    """
    data_dict = dataframe.to_dict(orient='records')
    return data_dict

In [15]:
train_features_dict = df_to_dict(train_df)

In [26]:
train_dicc = train_features_dict[:2]
print(train_dicc)

[{'embedding': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'pos_extracted': 'PROPN', 'position_rel2pred': 'Before', 'embedding_head': array([0., 0., 0., 0., 0

In [46]:
for token_dict in train_dicc:
    lemma_vector = token_dict['embedding']
    print(lemma_vector)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
[-5.2482e-01 -3.1963e-01 -1.1898e-01 -6.2672e-01  4.3607e-02  3.9176e-02
 -7.4566e-01 -2.9516e-01 -7.0795e-01  5.0644e-01 -1.2069e-01  6.0460e-01
  1.7881e-01 -3.2358e-01  6.7840e-01  6.1368e-01 -6.8220e-01 -9.0958e-01
 -3.5056e-01 -5.0691e-01  4.2474e-01  3.3311e+00 -2.9048e-01  3.1487e-01
  2.9800e-01  3.0849e-01 -5.9682e-01 -9.0485e-02  6.3417e-01  1.1428e-01
  6.0949e-01  4.2750e-01 

In [16]:
def extract_feature_values(row_dict, selected_features):
    '''
    Extract feature value pairs from row
    
    :param row: row from conllu file
    :param selected_features: list of selected features
    :type row: string
    :type selected_features: list of strings
    
    :returns: dictionary of feature value pairs
    '''
    feature_to_index = {'embedding': 0,
                        'pos_extracted': 1, 
                        'position_rel2pred': 2, 
                        'embedding_head': 3,
                        'num_of_children': 4, 
                        'punct_extracted': 5, 
                        'head_pos': 6, 
                        'dep_path': 7,
                        'cosine_similarity_w_predicate': 8, 
                        'pos_misc_feature': 9,
                        'head_pp_feature': 10,
                        'ner': 11, 
                        'propbank_arg': 12}
    
    feature_values_dict = {}
    for feature_name in selected_features:
        r_index = feature_to_index.get(feature_name)
        feature_values_dict[feature_name] = row_dict.get(feature_name)

    return feature_values_dict

def create_vectorizer_traditional_features(feature_values):
    '''
    Create vectorizer for set of feature values
    
    :param feature_values: list of dictionaries containing feature-value pairs
    :type feature_values: list of dictionairies (key and values are strings)
    
    :returns: vectorizer with feature values fitted
    '''
    vectorizer = DictVectorizer()
    vectorizer.fit(feature_values)
    
    return vectorizer
        
    
def combine_sparse_and_dense_features(dense_vectors, sparse_features):
    '''
    Take sparse and dense feature representations and appends their vector representation
    
    :param dense_vectors: list of dense vector representations
    :param sparse_features: list of sparse vector representations
    :type dense_vector: list of arrays
    :type sparse_features: list of lists
    
    :returns: list of arrays in which sparse and dense vectors are concatenated
    '''
    
    combined_vectors = []
    sparse_vectors = np.array(sparse_features.toarray())
    
    for index, vector in enumerate(sparse_vectors):
        combined_vector = np.concatenate((vector,dense_vectors[index]))  
        combined_vectors.append(combined_vector)
    
    return combined_vectors
    

def extract_features(df, vectorizer=None):
    '''
    Extract features and converts them into a vector.
    
    :param df: dataframe containing the features extracted.
    :type df: pandas DataFrame.
    
    :return features: list of vector representation of tokens
    :return labels: list of gold labels
    '''

    dense_vectors = []
    traditional_features = []

    selected_features = [ 
                     'pos_extracted',
                     #'position_rel2pred',
                     #'head_pos',
                     #'dep_path',
                     #'pos_misc_feature',
                     #'head_pp_feature',
                     #'ner',
                     #'propbank_arg'
                    ] 

    features_dict_list = df_to_dict(df) #converting the df to dictionaries to extract the features and convert to vector representation
    
    for token_dict in features_dict_list:
        lemma_vector = token_dict['embedding']
        head_vector = token_dict['embedding_head']
        cos_sim_vector = np.asarray([token_dict['cosine_similarity_w_predicate']]) #converting the numerical feat to np arrays to be concatenated in a single array
        #num_children = np.asarray([token_dict['num_of_children']])
        punct_extracted = np.asarray([token_dict['punct_extracted']])
        dense_vectors.append(np.concatenate((lemma_vector,head_vector,cos_sim_vector,punct_extracted))) #contactenating embeddings plus numerical value features
        #mixing very sparse representations (for one-hot tokens) and dense representations is a bad idea
        #we thus only use other features with limited values
        other_features = extract_feature_values(token_dict, selected_features)
        traditional_features.append(other_features)

    
    #create vector representation of traditional features
    if vectorizer is None:
        #creates vectorizer that provides mapping (only if not created earlier)
        vectorizer = create_vectorizer_traditional_features(traditional_features)
    sparse_features = vectorizer.transform(traditional_features)
    combined_vectors = combine_sparse_and_dense_features(dense_vectors, sparse_features)
    
    return combined_vectors, vectorizer
    #return dense_vectors

def create_classifier(features, labels):
    '''
    Create classifier from features represented as vectors and gold labels
    
    :param features: list of vector representations of tokens
    :param labels: list of gold labels
    :type features: list of vectors
    :type labels: list of strings
    
    :returns trained logistic regression classifier
    '''
    
    
    #lr_classifier = LogisticRegression(solver='saga')
    lr_classifier = LogisticRegression(max_iter=10000)
    lr_classifier.fit(features, labels)
    
    return lr_classifier

def label_data(vectorizer, testfile, classifier):
    '''
    Extract features and gold labels from test data and runs a classifier
    
    :param testfile: path to test file
    :param classifier: trained classifier
    :type testfile: string
    :type classifier: LogisticRegression
    
    :return predictions: list of predicted labels
    :return labels: list of gold labels
    '''
    
    dense_feature_representations = extract_features(testfile,vectorizer)
    labels = extract_gold_labels(dev_data)
    predictions = classifier.predict(dense_feature_representations)
    
    return labels,predictions


In [17]:
def provide_confusion_matrix(GoldLabel, PredictLabel, label_set):
    """
    use `sklearn.metric confusion_matrix` to create confusion matrix of model predict.
    and `sklearn.metric ConfusionMatrixDisplay` to display created confusion matrix.

    Parameters
    ----------
    GoldLabel : list
        list of all Gold labels
    PredictLabel : list
        list of all Prediction labels
    label_set : list 
        list of all classes
    
    Returns
    -------
        Confusion matrix
    """
    cf_matrix = confusion_matrix(GoldLabel, PredictLabel) # create a confusion matrix with gold and predicts
    print(cf_matrix) # print confusion_matrix as text
    display = ConfusionMatrixDisplay(confusion_matrix=cf_matrix, display_labels=label_set) # create graphical confusion_matrix
    fig, ax = plt.subplots(figsize=(15,15)) # create bigger plot because there is many classes in this task
    display.plot(ax =ax) # show confusion_matrix
    plt.xticks(rotation=90) # rotate X label of plot 90 degree
    plt.show() # show confusion matrix
    return cf_matrix # return confusion_matrix (maybe useful later)

def calculate_precision_recall_f1score(GoldLabel, PredictLabel, label_set): # function get gold and predict and set of labels
    """
    use `sklearn.metric classification_report` to get report of model predict.
    
    Parameters
    ----------
    GoldLabel : list
        list of all Gold labels
    PredictLabel : list
        list of all Prediction labels
    label_set : list 
        list of all classes
    
    Returns
    -------
        Classification report
    """
    report = classification_report(GoldLabel, PredictLabel, digits = 3, target_names=label_set) # calculate report
    print(report) # print report
    return report # return report (maybe useful later)

def evaluation_model(GoldLabel, PredictLabel): # get gold and predict
    """
    Evaluation models by call `calculate_precision_recall_f1score` and `provide_confusion_matrix` functions.

    Parameters
    ----------
    data :
        Train or test or dev dataset (after extracting fetures)
    PredictLabel : list
        list of all Prediction labels
    
    Returns
    -------
        Classification report and Confusion matrix
    """

    label_set = sorted(set(GoldLabel)) # find uniqe lables in gold
    print(label_set)

    print('precision_recall_f1-score')
    report = calculate_precision_recall_f1score(GoldLabel, PredictLabel, label_set) # calculate_precision_recall_f1score

    print('Confusion matrix')
    cf_matrix = provide_confusion_matrix(GoldLabel, PredictLabel, label_set) # provide_confusion_matrix

    return report, cf_matrix # return report and cf_matrix

In [18]:
train_df = train_df[:100]

In [86]:
train_df.isna().sum()

embedding                          0
pos_extracted                      0
position_rel2pred                  0
embedding_head                     0
num_of_children                    0
punct_extracted                    0
head_pos                           0
dep_path                           0
cosine_similarity_w_predicate      0
pos_misc_feature                   0
head_pp_feature                  918
ner                                0
propbank_arg                     424
dtype: int64

In [19]:
print('Extracting dense features...')
dense_feature_representations,vec = extract_features(train_df)
gold = extract_gold_labels(train_data)
print('Training classifier....')
classifier = create_classifier(dense_feature_representations, gold[:100])
print('Running evaluation...')
gold_labels,predicted = label_data(vec,dev_df, classifier)
evaluation_model(gold_labels,predicted)

Extracting dense features...
Training classifier....
Running evaluation...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.