In [0]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#####################################################################
#                           Set Z                                   #
#####################################################################
# Classic Multi-label algorithms + Neural Networks and Embeddings   #
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

from keras_preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
pd.set_option('max_colwidth',400)
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D, CuDNNLSTM, Bidirectional, Dense, \
    LSTM, Conv1D, MaxPooling1D, Dropout, concatenate, Flatten, add, Conv2D
from keras import initializers, regularizers, constraints
from keras import backend as K
from keras.engine import Layer
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import Input, Model
from keras.optimizers import Adam
from keras.models import Sequential, clone_model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from preprocess import Preproccesor
import time
import numpy as np
from keras.models import model_from_json
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, make_scorer, fbeta_score, multilabel_confusion_matrix,\
                            average_precision_score, precision_score, recall_score
import nltk
import warnings

def average_precision_wrapper(y, y_pred, view):
    """Average precision for predictions that expose ``toarray()``.

    Args:
        y: Ground-truth label indicator matrix.
        y_pred: Prediction container providing ``toarray()`` (for example a
            scipy sparse matrix); it is densified before scoring.
        view: Forwarded as the ``average`` argument of
            ``average_precision_score``.

    Returns:
        Whatever ``average_precision_score`` returns for these inputs.
    """
    dense_predictions = y_pred.toarray()
    return average_precision_score(y, dense_predictions, average=view)

In [18]:
!pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold



In [0]:
# Scorer objects wrapping sklearn metrics via make_scorer.
# Hamming loss is registered with greater_is_better=False, i.e. as a loss.
hamm_scorer = make_scorer(hamming_loss, greater_is_better=False)
# F-beta scorer with beta=2.
# NOTE(review): fbeta_score keeps its default `average` here; confirm that is
# appropriate before using this scorer on multilabel targets.
ftwo_scorer = make_scorer(fbeta_score, beta=2)

In [20]:
# NLTK corpora fetched before the data set is loaded.
for _nltk_resource in ('wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

# yt holds the continuous label values, y the binary ones.
X, yt, y = Preproccesor.load_multi_label_data(True)

# Names of the 11 label columns.
label_names = [
    "isHate",
    "isViolence",
    "isNotViolence",
    "isGeneralized",
    "isDirected",
    "gender",
    "race",
    "national_origin",
    "disability",
    "religion",
    "sexual_orientation",
]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
!wget 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
!wget 'http://nlp.stanford.edu/data/glove.42B.300d.zip' 
import zipfile
with zipfile.ZipFile("/content/crawl-300d-2M.vec.zip","r") as zip_ref:
    zip_ref.extractall()
    print(zip_ref.filelist)
with zipfile.ZipFile("/content/glove.42B.300d.zip","r") as zip_ref:
    zip_ref.extractall()
    print(zip_ref.filelist)

del zip_ref

--2019-12-27 14:50:01--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2019-12-27 14:50:25 (60.1 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

--2019-12-27 14:50:27--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2019-12-27 14:50:28--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171

In [21]:
!rm '/content/crawl-300d-2M.vec.zip'
!rm '/content/glove.42B.300d.zip'

rm: cannot remove '/content/crawl-300d-2M.vec.zip': No such file or directory
rm: cannot remove '/content/glove.42B.300d.zip': No such file or directory


In [0]:
# Locations of the extracted pre-trained word-vector files.
embedding_path1 = "/content/crawl-300d-2M.vec" # fastText vectors (crawl-300d-2M)
embedding_path2 = "/content/glove.42B.300d.txt" # GloVe vectors (42B, 300d)
# Width of a single pre-trained vector.
# NOTE: the training cell further down reassigns embed_size to 150.
embed_size = 300

In [0]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields; converted element-wise to float32.

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a 1-D float32 array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector
def build_matrix(embedding_path, tk, max_features):
    """Build the embedding matrix for a tokenizer's vocabulary.

    The embedding file is streamed line by line inside a ``with`` block, so
    the handle is always closed and only vectors for words that are actually
    in the vocabulary are kept in memory. Lines that do not carry exactly
    300 values (for example the ``"<count> <dim>"`` header of a ``.vec``
    file) are skipped instead of being treated as word vectors.

    Args:
        embedding_path: Path to a UTF-8 text file with one
            ``word v1 v2 ... v300`` entry per line (space separated).
        tk: Object exposing ``word_index`` (mapping word -> integer index).
        max_features: Vocabulary cap; words whose index is ``>= max_features``
            are ignored.

    Returns:
        ``numpy`` array of shape ``(max_features + 1, 300)``. Row ``i`` holds
        the vector of the word with index ``i``; rows of words missing from
        the file stay zero.
    """
    embedding_dim = 300
    embedding_matrix = np.zeros((max_features + 1, embedding_dim))

    # Only these vocabulary entries can ever be written into the matrix.
    wanted_rows = {word: i for word, i in tk.word_index.items() if i < max_features}

    with open(embedding_path, encoding="utf-8") as embedding_file:
        for line in embedding_file:
            word, *values = line.strip().split(" ")
            if len(values) != embedding_dim:
                # Header or malformed line: not a usable vector.
                continue
            row = wanted_rows.get(word)
            if row is not None:
                # A later duplicate of a word overwrites the earlier one,
                # like the dict-based lookup this replaces.
                embedding_matrix[row] = np.asarray(values, dtype='float32')
    return embedding_matrix
def create_embedding_matrix(embed, tk, max_features):
    """Select which pre-trained embedding matrix to build.

    Args:
        embed: ``1`` builds from ``embedding_path1`` only, ``2`` from
            ``embedding_path2`` only; any other value builds both and joins
            them along the last axis (path1 columns first).
        tk: Tokenizer forwarded to ``build_matrix``.
        max_features: Vocabulary cap forwarded to ``build_matrix``.

    Returns:
        The matrix produced by ``build_matrix`` (or the concatenation of two).
    """
    if embed == 1:
        return build_matrix(embedding_path1, tk, max_features)
    if embed == 2:
        return build_matrix(embedding_path2, tk, max_features)

    first_matrix = build_matrix(embedding_path1, tk, max_features)
    second_matrix = build_matrix(embedding_path2, tk, max_features)
    return np.concatenate([first_matrix, second_matrix], axis=-1)
  

In [0]:
# Number of cross-validation folds.
n_fold = 10
# Shuffled, seeded StratifiedKFold splitter with n_fold splits.
# NOTE(review): the training cell visible further down builds its own
# MultilabelStratifiedKFold and does not reference `folds`.
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=7)

In [0]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input ``x`` with ``features_dim`` channels per step the layer
    computes ``e = tanh(x . W + b)``, turns ``e`` into weights
    ``a = exp(e) / (sum(exp(e)) + epsilon)`` (masked steps get weight 0 when a
    mask is supplied) and returns the weighted sum of ``x`` over the steps.

    Args:
        step_dim: Number of steps the score tensor is reshaped to in ``call``.
        W_regularizer, b_regularizer: Optional regularizers for the weight
            vector and the bias (anything ``regularizers.get`` accepts).
        W_constraint, b_constraint: Optional constraints for the weight
            vector and the bias (anything ``constraints.get`` accepts).
        bias: Whether to add a per-step bias to the scores.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Real value is filled in by build() from the input shape.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector W (one entry per feature) and optional bias."""
        assert len(input_shape) == 3

        # Keyword arguments keep the call independent of add_weight's
        # positional parameter order.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per step (second input dimension).
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """The step axis is reduced away, so no mask is passed downstream."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features), project onto W, reshape to (-1, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            # Zero the weight of masked steps before normalising.
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is ``(batch, features_dim)``."""
        return input_shape[0], self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this the saved architecture loses ``step_dim`` and the other
        constructor arguments, so the layer cannot be rebuilt from it.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [0]:
def my_hamming_loss(y_true, y_pred):
    """Per-sample Hamming loss computed with Keras backend ops.

    Predictions are thresholded at 0.5 (``>= 0.5`` counts as 1, matching the
    thresholding used in the evaluation loop of this notebook) and compared
    label by label with the targets.

    Args:
        y_true: Tensor of binary targets; last axis is the label axis.
        y_pred: Tensor of predicted scores with the same shape.

    Returns:
        Tensor holding, for every sample, the fraction of labels predicted
        wrongly (mean over the last axis). Thresholding is not
        differentiable, so this is meant as a metric rather than a training
        loss.
    """
    y_true = K.cast(y_true, dtype='float32')
    y_pred = K.cast(y_pred, dtype='float32')
    y_hard = K.cast(K.greater_equal(y_pred, 0.5), dtype='float32')
    # 1.0 where the thresholded prediction disagrees with the target.
    diff = K.cast(K.not_equal(y_true, y_hard), dtype='float32')
    return K.mean(diff, axis=-1)

In [0]:
#Binary Relevance
def build_model1(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Build and train the BiLSTM + attention model with one 11-unit sigmoid output.

    Architecture: frozen pre-trained embedding -> SpatialDropout1D(0.2) ->
    two bidirectional CuDNNLSTM(150) layers -> concatenation of attention
    pooling and global max pooling -> Dense 1024/512/128 (selu, with dropout)
    -> Dense(11, sigmoid).

    The weights with the lowest validation loss are checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and loaded back before returning.

    Args:
        X_train, y_train: Training inputs (padded to ``max_len``) and labels.
        X_valid, y_valid: Validation inputs and labels.
        max_len: Sequence length of the inputs.
        max_features: Vocabulary cap; the embedding has ``max_features + 1`` rows.
        embed_size: Unused; the embedding width is read from
            ``embedding_matrix`` so it always matches the supplied weights
            (previously hard-coded as ``embed_size * 2``). Kept for
            backward compatibility.
        embedding_matrix: Pre-trained weights, shape ``(max_features + 1, width)``.
        lr: Adam learning rate. The default of 0.0 performs no updates, so
            callers are expected to pass a value.
        lr_d: Adam learning-rate decay.
        spatial_dr, dense_units, conv_size, dr: Accepted for interface
            compatibility; not used by this architecture.
        patience: Early-stopping patience on ``val_loss``.
        fold_id: Used in the checkpoint file name.

    Returns:
        A compiled Keras ``Model`` carrying the best checkpointed weights.
    """
    file_path = f"best_model_fold_{fold_id}.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=patience)

    # Take the width from the weights themselves: a mismatch between the
    # declared output_dim and the matrix makes Embedding reject the weights.
    embedding_dim = np.shape(embedding_matrix)[1]

    main_input = Input(shape=(max_len,), name='main_input')
    x = (Embedding(max_features + 1, embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))(main_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    hidden = concatenate([
        Attention(max_len)(x),
        GlobalMaxPooling1D()(x),
    ])
    hidden = Dense(1024, activation='selu')(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(512, activation='selu')(hidden)
    hidden = Dropout(0.2)(hidden)
    hidden1 = Dense(128, activation='selu')(hidden)
    output_lay1 = Dense(11, activation='sigmoid')(hidden1)

    model = Model(inputs=[main_input], outputs=output_lay1)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy'])
    from keras.utils import plot_model
    plot_model(model, to_file='model1.png')

    # Second Model over the same input/output tensors; it receives the best
    # checkpointed weights after training.
    model2 = Model(inputs=[main_input], outputs=output_lay1)
    model.fit(X_train, y_train, batch_size=16, epochs=20, validation_data=(X_valid, y_valid), verbose=1, callbacks=[early_stop, check_point])
    model2.load_weights(file_path)
    model2.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy'])
    return model2

In [0]:
#Classifier Chains
def build_model2(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Build and train the chained-output variant of the BiLSTM + attention model.

    The shared trunk (frozen embedding, SpatialDropout1D(0.3), two
    bidirectional CuDNNLSTM(150) layers, attention + max pooling, Dense
    1024/512/128 with selu) feeds a chain of 11 single-unit selu layers.
    Each link sees the 128-d representation concatenated with the outputs of
    all previous links. The 11 link outputs are concatenated and mapped by a
    final Dense(11, sigmoid).

    The weights with the lowest validation loss are checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and loaded back before returning.

    Args:
        X_train, y_train: Training inputs (padded to ``max_len``) and labels.
        X_valid, y_valid: Validation inputs and labels.
        max_len: Sequence length of the inputs.
        max_features: Vocabulary cap; the embedding has ``max_features + 1`` rows.
        embed_size: Unused; the embedding width is read from
            ``embedding_matrix`` so it always matches the supplied weights
            (previously hard-coded as ``embed_size * 2``). Kept for
            backward compatibility.
        embedding_matrix: Pre-trained weights, shape ``(max_features + 1, width)``.
        lr: Adam learning rate. The default of 0.0 performs no updates, so
            callers are expected to pass a value.
        lr_d: Adam learning-rate decay.
        spatial_dr, dense_units, conv_size, dr: Accepted for interface
            compatibility; not used by this architecture.
        patience: Early-stopping patience on ``val_loss``.
        fold_id: Used in the checkpoint file name.

    Returns:
        A compiled Keras ``Model`` carrying the best checkpointed weights.
    """
    n_chain_links = 11

    file_path = f"best_model_fold_{fold_id}.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=patience)

    # Take the width from the weights themselves: a mismatch between the
    # declared output_dim and the matrix makes Embedding reject the weights.
    embedding_dim = np.shape(embedding_matrix)[1]

    main_input = Input(shape=(max_len,), name='main_input')
    x = (Embedding(max_features + 1, embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))(main_input)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    hidden = concatenate([
        Attention(max_len)(x),
        GlobalMaxPooling1D()(x),
    ])
    hidden = Dense(1024, activation='selu')(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(512, activation='selu')(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden1 = Dense(128, activation='selu')(hidden)

    # Chain: link k is computed from hidden1 plus the outputs of links 1..k-1.
    # Layers are created in the order Dense, concatenate, Dense, ... so the
    # graph and layer numbering match the hand-unrolled version.
    chain_state = hidden1
    chain_outputs = [Dense(1, activation='selu')(chain_state)]
    for _ in range(n_chain_links - 1):
        chain_state = concatenate([chain_state, chain_outputs[-1]])
        chain_outputs.append(Dense(1, activation='selu')(chain_state))

    hidden_l = concatenate(chain_outputs)
    output_layer = Dense(11, activation='sigmoid')(hidden_l)

    model = Model(inputs=[main_input], outputs=output_layer)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy','categorical_accuracy'])
    from keras.utils import plot_model
    plot_model(model, to_file='model2.png')

    # Second Model over the same input/output tensors; it receives the best
    # checkpointed weights after training.
    model2 = Model(inputs=[main_input], outputs=output_layer)
    model.fit(X_train, y_train, batch_size=32, epochs=50, validation_data=(X_valid, y_valid), verbose=1, callbacks=[early_stop, check_point])
    model2.load_weights(file_path)
    model2.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy','categorical_accuracy'])
    return model2

In [0]:
def build_model3(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=0.0, lr_d=0.0, spatial_dr=0.0, dense_units=128, conv_size=128, dr=0.2, patience=3, fold_id=1):
    """Build and train the two-stage-head variant of the BiLSTM + attention model.

    The shared trunk (frozen embedding, SpatialDropout1D(0.2), two
    bidirectional CuDNNLSTM(150) layers, attention + max pooling, Dense
    1024/512/128 with selu) feeds a first single-unit relu head. That head's
    output is concatenated with the 128-d representation and fed to eight
    further relu heads (two with 2 units, six with 1 unit). All nine head
    outputs (11 values) are concatenated and mapped by Dense(11, sigmoid).

    The weights with the lowest validation loss are checkpointed to
    ``best_model_fold_{fold_id}.hdf5`` and loaded back before returning.

    Args:
        X_train, y_train: Training inputs (padded to ``max_len``) and labels.
        X_valid, y_valid: Validation inputs and labels.
        max_len: Sequence length of the inputs.
        max_features: Vocabulary cap; the embedding has ``max_features + 1`` rows.
        embed_size: Unused; the embedding width is read from
            ``embedding_matrix`` so it always matches the supplied weights
            (previously hard-coded as ``embed_size * 2``). Kept for
            backward compatibility.
        embedding_matrix: Pre-trained weights, shape ``(max_features + 1, width)``.
        lr: Adam learning rate. The default of 0.0 performs no updates, so
            callers are expected to pass a value.
        lr_d: Adam learning-rate decay.
        spatial_dr, dense_units, conv_size, dr: Accepted for interface
            compatibility; not used by this architecture.
        patience: Early-stopping patience on ``val_loss``.
        fold_id: Used in the checkpoint file name.

    Returns:
        A compiled Keras ``Model`` carrying the best checkpointed weights.
    """
    file_path = f"best_model_fold_{fold_id}.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=patience)

    # Take the width from the weights themselves: a mismatch between the
    # declared output_dim and the matrix makes Embedding reject the weights.
    embedding_dim = np.shape(embedding_matrix)[1]

    main_input = Input(shape=(max_len,), name='main_input')
    x = (Embedding(max_features + 1, embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=False))(main_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(150, return_sequences=True))(x)
    hidden = concatenate([
        Attention(max_len)(x),
        GlobalMaxPooling1D()(x),
    ])
    hidden = Dense(1024, activation='selu')(hidden)
    hidden = Dropout(0.3)(hidden)
    hidden = Dense(512, activation='selu')(hidden)
    hidden = Dropout(0.2)(hidden)
    hidden1 = Dense(128, activation='selu')(hidden)

    # Stage 1: a single-unit head computed from the shared representation.
    hidden_output_lay1 = Dense(1, activation='relu')(hidden1)

    # Stage 2: every remaining head sees the stage-1 output plus hidden1.
    hidden_conc_1 = concatenate([hidden_output_lay1,hidden1])

    hidden_output_lay2 = Dense(2, activation='relu')(hidden_conc_1)
    hidden_output_lay3 = Dense(2, activation='relu')(hidden_conc_1)

    hidden_output_lay4 = Dense(1, activation='relu')(hidden_conc_1)
    hidden_output_lay5 = Dense(1, activation='relu')(hidden_conc_1)
    hidden_output_lay6 = Dense(1, activation='relu')(hidden_conc_1)
    hidden_output_lay7 = Dense(1, activation='relu')(hidden_conc_1)
    hidden_output_lay8 = Dense(1, activation='relu')(hidden_conc_1)
    hidden_output_lay9 = Dense(1, activation='relu')(hidden_conc_1)

    final_hidden_conc = concatenate([hidden_output_lay1,hidden_output_lay2,hidden_output_lay3,hidden_output_lay4,
                                     hidden_output_lay5,hidden_output_lay6,hidden_output_lay7,hidden_output_lay8,
                                     hidden_output_lay9])
    final_output = Dense(11, activation='sigmoid')(final_hidden_conc)

    model = Model(inputs=[main_input], outputs=final_output)
    model.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy'])
    from keras.utils import plot_model
    plot_model(model, to_file='model3.png')

    # Second Model over the same input/output tensors; it receives the best
    # checkpointed weights after training.
    model2 = Model(inputs=[main_input], outputs=final_output)
    model.fit(X_train, y_train, batch_size=16, epochs=50, validation_data=(X_valid, y_valid), verbose=1, callbacks=[early_stop, check_point])
    model2.load_weights(file_path)
    model2.compile(loss="binary_crossentropy", optimizer=Adam(lr=lr, decay=lr_d), metrics=['binary_accuracy'])
    return model2

In [0]:
# Cross-validated training and evaluation.
# For every fold: fit a tokenizer on the training texts only, pad the
# sequences, build the embedding matrix, train a model and score its
# thresholded predictions. The averaged metrics are appended to setE.txt and
# printed.
max_features = 50000

# Per-fold metric lists, keyed in the column order of the summary line.
metric_names = [
    'test_F1_example',
    'test_F1_macro',
    'test_F1_micro',
    'test_precision_example',
    'test_precision_macro',
    'test_precision_micro',
    'test_recall_example',
    'test_recall_macro',
    'test_recall_micro',
    'test_average_precision_macro',
    'test_average_precision_micro',
    'test_Accuracy',
    'test_Hamm',
]
scores = {metric: [] for metric in metric_names}
cm = []
mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)
fold_n=0
save_ys = []
save_yt = []
max_len = 150
embed_size = 150
embma = 1
name = "Mixed2"
for train_index, test_index in mskf.split(X, y):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # The tokenizer is fitted on the training split only, so the validation
    # vocabulary does not leak into training.
    tk = Tokenizer(lower = True, filters='', num_words=max_features, oov_token = True)
    tk.fit_on_texts(X_train)
    train_tokenized = tk.texts_to_sequences(X_train)
    valid_tokenized = tk.texts_to_sequences(X_valid)
    X_train = pad_sequences(train_tokenized, maxlen=max_len)
    X_valid = pad_sequences(valid_tokenized, maxlen=max_len)
    embedding_matrix = create_embedding_matrix(embma, tk, max_features)

    # NOTE(review): build_model4 is not defined in the cells visible in this
    # notebook (only build_model1..3 are); make sure it is defined before
    # running this cell.
    model = build_model4(X_train, y_train, X_valid, y_valid, max_len, max_features, embed_size, embedding_matrix, lr=1e-3, lr_d=0, spatial_dr=0.1, dense_units=128, conv_size=128, dr=0.1, patience=10, fold_id=fold_n)

    fold_n = fold_n + 1
    yT = model.predict(X_valid)
    # Threshold the predicted scores at 0.5. Vectorised, and no longer reuses
    # the name `yt`, which holds the continuous labels loaded earlier.
    y_preds = (np.asarray(yT) >= 0.5).astype(int)
    cm.append(multilabel_confusion_matrix(y_valid,y_preds))
    for family, metric_fn in (('F1', f1_score), ('precision', precision_score), ('recall', recall_score)):
        for suffix, average in (('example', 'samples'), ('macro', 'macro'), ('micro', 'micro')):
            scores['test_{}_{}'.format(family, suffix)].append(metric_fn(y_valid, y_preds, average=average))
    # NOTE(review): average precision is computed on the thresholded
    # predictions, as before; consider scoring the raw outputs `yT` instead.
    scores['test_average_precision_macro'].append(average_precision_score(y_valid, y_preds, average='macro'))
    scores['test_average_precision_micro'].append(average_precision_score(y_valid, y_preds, average='micro'))
    scores['test_Accuracy'].append(accuracy_score(y_valid, y_preds))
    scores['test_Hamm'].append(hamming_loss(y_valid, y_preds))

# Mean per-label confusion matrix over the folds. The previous loop added the
# loop index instead of the fold's matrix.
cmt = np.sum(cm, axis=0) / len(cm)
print(cmt)

# One summary line: run name followed by the fold-averaged metrics.
summary_line = "{:<7} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} \n".format(
    str(name)[:7],
    *['%.4f' % (sum(scores[metric]) / len(scores[metric])) for metric in metric_names])
with open("setE.txt", "a+") as f:
    f.write(summary_line)
print(summary_line)

In [16]:
# Re-print the summary line: run name followed by each metric's sum over the
# folds divided by 10, formatted to four decimals.
print("{:<7} | {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} {:<7} \n".format(
    str(name)[:7],
    *[str('%.4f' % (sum(scores[metric_key]) / 10))
      for metric_key in ('test_F1_example',
                         'test_F1_macro',
                         'test_F1_micro',
                         'test_precision_example',
                         'test_precision_macro',
                         'test_precision_micro',
                         'test_recall_example',
                         'test_recall_macro',
                         'test_recall_micro',
                         'test_average_precision_macro',
                         'test_average_precision_micro',
                         'test_Accuracy',
                         'test_Hamm')]))

Mixed   | 0.1983  0.2398  0.4633  0.2783  0.3182  0.6211  0.1701  0.2220  0.3803  0.2344  0.3283  0.4248  0.1288  

