In [1]:
import pandas as pd
import numpy as np
from MTLDNN import *

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tqdm.notebook import tqdm
import pandas as pd
import copy
from timeit import default_timer

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.utils import class_weight

import sys
sys.path.append("../../library")

import utils
import importlib

# aca la carga de los datos tokenizados

In [2]:
def load_data(data_path):
    data = pd.read_csv(data_path)#.iloc[:20,:]
    
    # remove stopwords - tfidf whole df
    stops = set(stopwords.words('english'))
    
    vectorizer = TfidfVectorizer(
        analyzer = "word",
        lowercase = True,
        tokenizer = word_tokenize,
        stop_words = stops,
        min_df = 5
    )

    X = vectorizer.fit_transform(data.sentence.to_numpy())
    y = data.iloc[:,1:].astype(int).to_numpy()
    return X,y

# aca las fns que irian a utils

In [3]:
def validate_MTL(
    X, 
    Y, 
    classifier, 
    n_splits = 5, 
    shuffle = True, 
    random_state = 1234, 
    full = False,
    verbose = True,
    params = None
):
    
    print("Cross-validation process started...")
    start = default_timer()
    results = []
    data = get_folds_MTL(X, Y, n_splits, shuffle, random_state)
    for i, ((a_train, b_train), (a_test, b_test)) in enumerate(data, 1):
        del classifier
        classifier = MTLDNN(1281, params)
        classifier.compilar(params)
        if verbose:
            print(f"*** fold {i} / {len(data)}")
            print("    training model...")
        classifier.entrenar(((a_train, b_train), (a_test, b_test)), params)
        if verbose: 
            print("    generating predictions on the train set...")
        train_predictions = classifier.predecir(a_train)  
        if verbose: 
            print("    generating predictions on the test set...")
        test_predictions = classifier.predecir(a_test)  
        
        for target in range(len(b_train)):
            results.append(
                dict(
                    fold = i,
                    column = columns.get(target),                    
                    train_accuracy = accuracy_score(b_train[target], train_predictions[target]),
                    train_precision = precision_score(b_train[target], train_predictions[target]),
                    train_recall = recall_score(b_train[target], train_predictions[target]),
                    train_f1 = f1_score(b_train[target], train_predictions[target]),
                    test_accuracy = accuracy_score(b_test[target], test_predictions[target]),
                    test_precision = precision_score(b_test[target], test_predictions[target]),
                    test_recall = recall_score(b_test[target], test_predictions[target]),
                    test_f1 = f1_score(b_test[target], test_predictions[target]),
                )
            )
        # adding task_1 predictions
        t1_train_predictions = np.logical_or.reduce(train_predictions)
        t1_test_predictions = np.logical_or.reduce(test_predictions)
        t1_b_train = np.logical_or.reduce(b_train)
        t1_b_test = np.logical_or.reduce(b_test)
        print(t1_test_predictions.shape)
        print(t1_b_test.shape)
        results.append(
                dict(
                    fold = i,
                    column = 'task_1',                    
                    train_accuracy = accuracy_score(t1_b_train, t1_train_predictions),
                    train_precision = precision_score(t1_b_train, t1_train_predictions),
                    train_recall = recall_score(t1_b_train, t1_train_predictions),
                    train_f1 = f1_score(t1_b_train, t1_train_predictions),
                    test_accuracy = accuracy_score(t1_b_test, t1_test_predictions),
                    test_precision = precision_score(t1_b_test, t1_test_predictions),
                    test_recall = recall_score(t1_b_test, t1_test_predictions),
                    test_f1 = f1_score(t1_b_test, t1_test_predictions),
                )
            )
        
        
        time = default_timer() - start
        print(f"    Total runtime: {time/60:.2f} minutes")
    results = pd.DataFrame(results)
    if full:
        return results
    else:
        return results.pivot_table(
            index = "column", 
            values = [
                "train_accuracy", "train_precision", "train_recall", "train_f1", 
                "test_accuracy", "test_precision", "test_recall", "test_f1"
            ]
        )

In [4]:
# Same but for MTL models, which take all labels at once
def get_folds_MTL(X, y, n_splits = 5, shuffle = True, random_state = 1234):
    """
    y should be a multidimensional array of labels (for multiclass classification)
    """
    kf = StratifiedKFold(
        n_splits = n_splits, 
        shuffle = shuffle, 
        random_state = random_state
    )
    
    # y_strat is the set of labels used for stratification in stratified sampling
    # computed as the logical or among all labels in task 2 (same as task 1)
    y_strat = np.logical_or.reduce(y.T)
    output = []
    for train_idx, val_idx in kf.split(X, y_strat):
        t = (X[train_idx].toarray(), list(y[train_idx].T))
        v = (X[val_idx].toarray(), list(y[val_idx].T))
        output.append((t,v))
    return output

# aca la instanciacion del modelo

In [5]:
# data
datapath ="../../data/task_2.csv"
X,y = load_data(datapath)

  'stop_words.' % sorted(inconsistent))


In [33]:
columns = {
    0:'xenophobia',
    1:'suffering',
    2:'economic',
    3:'migration',
    4:'culture',
    5:'benefits',
    6:'health',
    7:'security',
    8:'dehumanisation',
    9:'others'}



# weighed cost fn
cw = []
for c in list(y.T):
    class_weights = class_weight.compute_class_weight('balanced',np.unique(y),c)
    cw.append(class_weights)

weights = {
    'xenophobia':     {0:cw[0][0], 1:cw[0][1]},
    'suffering':      {0:cw[1][0], 1:cw[1][1]},
    'economic':       {0:cw[2][0], 1:cw[2][1]},
    'migration':      {0:cw[3][0], 1:cw[3][1]},
    'culture':        {0:cw[4][0], 1:cw[4][1]},
    'benefits':       {0:cw[5][0], 1:cw[5][1]},
    'health':         {0:cw[6][0], 1:cw[6][1]},
    'security':       {0:cw[7][0], 1:cw[7][1]},
    'dehumanisation': {0:cw[8][0], 1:cw[8][1]},
    'others':         {0:cw[9][0], 1:cw[9][1]}
}

weights_10_outputs = {'output_1':weights.get(columns.get(0)), 
                      'output_2':weights.get(columns.get(1)), 
                      'output_3':weights.get(columns.get(2)), 
                      'output_4':weights.get(columns.get(3)), 
                      'output_5':weights.get(columns.get(4)),
                      'output_6':weights.get(columns.get(5)), 
                      'output_7':weights.get(columns.get(6)), 
                      'output_8':weights.get(columns.get(7)), 
                      'output_9':weights.get(columns.get(8)), 
                      'output_10':weights.get(columns.get(9)),
                     
                     }

params = {'dropout':[0.25,0.15,0.1],
          'act':'relu',
          'lb':0.0001,
          'arq':(1000,500,100,20),
          'w': weights_10_outputs,
          'loss':BinaryCrossentropy(), 
          'l_rate':0.0001, 
          'metrics':[tf.keras.metrics.Precision(),tf.keras.metrics.Recall(),tf.keras.metrics.Accuracy()], 
          'min_delta':0.0001, 
          'patience':500,
          'n_epochs':1000,
          'columns': columns,
          'momentum_batch_norm':0.9,
          'ni':1281}

mi_modelo = MTLDNN(1281, params)
mi_modelo.compilar(params)

In [None]:
salida = validate_MTL(X, y, mi_modelo, n_splits = 5, shuffle = True, random_state = 1234, full = False, verbose = True, params = params)

Cross-validation process started...
*** fold 1 / 5
    training model...
Restoring model weights from the end of the best epoch.
Epoch 00508: early stopping
    generating predictions on the train set...
    generating predictions on the test set...
(764, 1)
(764,)
    Total runtime: 3.00 minutes
*** fold 2 / 5
    training model...
Restoring model weights from the end of the best epoch.
Epoch 00506: early stopping
    generating predictions on the train set...
    generating predictions on the test set...
(764, 1)
(764,)
    Total runtime: 6.00 minutes


  _warn_prf(average, modifier, msg_start, len(result))


*** fold 3 / 5
    training model...


In [28]:
salida

Unnamed: 0_level_0,test_accuracy,test_f1,test_precision,test_recall,train_accuracy,train_f1,train_precision,train_recall
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
benefits,0.834681,0.246877,0.164316,0.50139,0.843267,0.393196,0.248531,0.942301
culture,0.800117,0.127383,0.084356,0.327593,0.820013,0.296854,0.189028,0.776141
dehumanisation,0.895211,0.043548,0.027294,0.147656,0.881319,0.219797,0.125995,0.92933
economic,0.846185,0.100372,0.057338,0.547564,0.831681,0.148452,0.080719,0.958092
health,0.948392,0.035146,0.019027,0.273333,0.925529,0.1194,0.064513,0.984615
migration,0.810323,0.333048,0.237798,0.561214,0.834818,0.474518,0.32377,0.888822
others,0.884203,0.076746,0.044976,0.282045,0.859838,0.196772,0.109439,0.9776
security,0.770764,0.181423,0.121426,0.382598,0.797092,0.330577,0.21202,0.756886
suffering,0.883152,0.074264,0.043506,0.325,0.870055,0.202092,0.113377,0.958864
task_1,0.711783,0.486785,0.441242,0.57868,0.849301,0.761675,0.681045,0.900692


In [29]:
# params_lst = []
params_lst.append(params)
# salidas_lst = []
salidas_lst.append(salida)

In [32]:
params_lst

[{'dropout': [0.25, 0.15, 0.1],
  'act': 'relu',
  'lb': 0.001,
  'arq': (100, 50, 10, 5),
  'w': {'output_1': {0: 0.5021047092870298, 1: 119.28125},
   'output_2': {0: 0.5083910495471498, 1: 30.293650793650794},
   'output_3': {0: 0.5073099415204678, 1: 34.7},
   'output_4': {0: 0.5459096109839817, 1: 5.945482866043614},
   'output_5': {0: 0.5260474090407938, 1: 10.097883597883598},
   'output_6': {0: 0.528523954583218, 1: 9.264563106796116},
   'output_7': {0: 0.5022368421052632, 1: 112.26470588235294},
   'output_8': {0: 0.5357944974733296, 1: 7.484313725490196},
   'output_9': {0: 0.5086620469083155, 1: 29.361538461538462},
   'output_10': {0: 0.5089333333333333, 1: 28.48507462686567}},
  'loss': <tensorflow.python.keras.losses.BinaryCrossentropy at 0x147f25f998d0>,
  'l_rate': 0.0001,
  'metrics': [<tensorflow.python.keras.metrics.Precision at 0x147e6b130b10>,
   <tensorflow.python.keras.metrics.Recall at 0x147e6b096c50>,
   <tensorflow.python.keras.metrics.Accuracy at 0x147e6b096