In [1]:
import csv
import re
import string

import operator
import os
import functools
import operator
import fasttext

import numpy as np
import pandas as pd

from tqdm import tqdm

from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, InputSpec, GlobalMaxPool1D, GlobalAvgPool1D, Masking
from keras.layers import LSTM, GRU, Bidirectional, Dropout, SpatialDropout1D, BatchNormalization
from keras.layers import concatenate
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, Callback, ReduceLROnPlateau
from keras.optimizers import Adam, Nadam
from keras import initializers, regularizers, constraints
from tqdm import tqdm
from collections import Counter


import keras.backend as K
import tensorflow as tf

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from scipy import sparse

from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

# from textacy.preprocess import preprocess_text

from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints

In [2]:
# !pip install textacy
# import textacy
# textacy
# textacy.__version__

In [3]:
# dir(textacy)

# Embedding Files

In [12]:
for i in sorted(os.scandir('../Vectors'), key=lambda x: x.stat().st_size, reverse=True):
    print(i.path)

../Vectors\glove.840B.300d.pkl
../Vectors\wiki.ny.bin
../Vectors\en-ny.xml.gz
../Vectors\wiki.ny.vec
../Vectors\en-nya.xml.gz


In [13]:
train = pd.read_csv('../Translated/cleaned/train.csv')
test = pd.read_csv('../Translated/cleaned/test.csv')


In [43]:
max_features = 60000
maxlen = 5000
embed_size = 300

file_path = "weights_base.best.hdf5"
emb_file = '../Vectors/wiki.ny.bin'
unused = set([])

tweet_tokenizer = TweetTokenizer(reduce_len=True)
lem = WordNetLemmatizer()
eng_stopwords = set(stopwords.words("english"))
other_stop_w = pd.read_csv('words_shared_by_all.csv')
list_classes = train.Label.unique().tolist()

In [44]:
CONTEXT_DIM = 100

class Attention(Layer):

    def __init__(self, regularizer=regularizers.l2(1e-10), **kwargs):
        self.regularizer = regularizer
        self.supports_masking = True
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3        
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], CONTEXT_DIM),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.b = self.add_weight(name='b',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.u = self.add_weight(name='u',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)        
        super(Attention, self).build(input_shape)

    @staticmethod
    def softmax(x, dim):
        """Computes softmax along a specified dim. Keras currently lacks this feature.
        """
        if K.backend() == 'tensorflow':
            import tensorflow as tf
            return tf.nn.softmax(x, dim)
        elif K.backend() == 'theano':
            # Theano cannot softmax along an arbitrary dim.
            # So, we will shuffle `dim` to -1 and un-shuffle after softmax.
            perm = np.arange(K.ndim(x))
            perm[dim], perm[-1] = perm[-1], perm[dim]
            x_perm = K.permute_dimensions(x, perm)
            output = K.softmax(x_perm)

            # Permute back
            perm[dim], perm[-1] = perm[-1], perm[dim]
            output = K.permute_dimensions(x, output)
            return output
        else:
            raise ValueError("Backend '{}' not supported".format(K.backend()))

    def call(self, x, mask=None):
        ut = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) * self.u

        # Collapse `attention_dims` to 1. This indicates the weight for each time_step.
        ut = K.sum(ut, axis=-1, keepdims=True)

        # Convert those weights into a distribution but along time axis.
        # i.e., sum of alphas along `time_steps` axis should be 1.
        self.at = self.softmax(ut, dim=1)
        if mask is not None:
            self.at *= K.cast(K.expand_dims(mask, -1), K.floatx())

        # Weighted sum along `time_steps` axis.
        return K.sum(x * self.at, axis=-2)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
    
    def get_config(self):
        config = {}
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask):
        return None


In [45]:
def create_embedding(emb_file, word_index):
    if emb_file.endswith('bin'):
        embeddings_index = fasttext.load_model(emb_file)
    else:
        embeddings_index = pd.read_table(emb_file,
                                         sep=" ",
                                         index_col=0,
                                         header=None,
                                         quoting=csv.QUOTE_NONE,
                                         usecols=range(embed_size + 1),
                                         dtype={h: np.float32 for h in range(1, embed_size + 1)},
                                         engine='c',
        )

    nb_words = min(max_features, len(word_index))

    # Initialize Random Matrix
    if emb_file.endswith('bin'):
        mean, std = 0.007565171, 0.29283202
    else:
        mean, std = embeddings_index.values.mean(), embeddings_index.values.std()

    embedding_matrix = np.random.normal(mean, std, (nb_words, embed_size))

    with tqdm(total=nb_words, desc='Embeddings', unit=' words') as pbar:
        for word, i in word_index.items():
            if i >= nb_words:
                continue
            if emb_file.endswith('bin'):
                if embeddings_index.get_word_id(word) != -1:
                    embedding_matrix[i] = embeddings_index.get_word_vector(word).astype(np.float32)
                    pbar.update()
            else:
                if word in embeddings_index.index:
                    embedding_matrix[i] = embeddings_index.loc[word].values
                    pbar.update()

    return embedding_matrix

def get_embedding(emb_file):
    return Embedding(min(max_features, len(tokenizer.word_index)), embed_size,
                     weights=[create_embedding(emb_file, tokenizer.word_index)],
                     input_length=maxlen,
                     trainable=False
    )

def tokenize(s):
    return re.sub('([{}“”¨«»®´·º½¾¿¡§£₤‘’])'.format(string.punctuation), r' \1 ', s).split()

def replace_numbers(s):
    dictionary = {
        '&': ' and ',
        '@': ' at ',
        '0': ' zero ',
        '1': ' one ',
        '2': ' two ',
        '3': ' three ',
        '4': ' four ',
        '5': ' five ',
        '6': ' six ',
        '7': ' seven ',
        '8': ' eight ',
        '9': ' nine ',
    }
    for k, v in dictionary.items():
        s = s.replace(k, v)
    return s

def text_cleanup(s, remove_unused=True):
    """
    This function receives ss and returns clean word-list
    """
    # Remove leaky elements like ip, user, numbers, newlines
    s = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "_ip_", s)
    s = re.sub("\[\[.*\]", "", s)
    s = re.sub('\n', ' ', s)
    s = replace_numbers(s)

    # Split the sentences into words
    s = tweet_tokenizer.tokenize(s)

    # Lemmatize
    s = [lem.lemmatize(word, "v") for word in s]

    # Remove Stopwords
    s = ' '.join([w for w in s if not w in eng_stopwords])
    
#     s = preprocess_text(s, fix_unicode=True,
#                            lowercase=True,
#                            no_currency_symbols=True,
#                            transliterate=True,
#                            no_urls=True,
#                            no_emails=True,
#                            no_contractions=True,
#                            no_phone_numbers=True,
#                            no_punct=True).strip()
    
    if remove_unused:
        s = ' '.join([i for i in s.split() if i not in unused])
    return s


In [46]:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y_train = lb.fit_transform(train.Label)
y_train


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [47]:
# dir(lb)
lb.classes_

array(['ARTS AND CRAFTS', 'CULTURE', 'ECONOMY', 'EDUCATION', 'FARMING',
       'FLOODING', 'HEALTH', 'LAW/ORDER', 'LOCALCHIEFS', 'MUSIC',
       'OPINION/ESSAY', 'POLITICS', 'RELATIONSHIPS', 'RELIGION', 'SOCIAL',
       'SOCIAL ISSUES', 'SPORTS', 'TRANSPORT', 'WILDLIFE/ENVIRONMENT',
       'WITCHCRAFT'], dtype='<U20')

In [48]:
# train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv').sample(frac=1)
# test  = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")

train['Text'] = train.Text.fillna("_na_").apply(text_cleanup)
test['Text']  = test.Text.fillna("_na_").apply(text_cleanup)

list_sentences_train = train.Text.tolist()
list_sentences_test  = test.Text.tolist()

y = y_train

tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list_sentences_train + list_sentences_test)

X_t  = sequence.pad_sequences(tokenizer.texts_to_sequences(list_sentences_train), maxlen=maxlen)
X_te = sequence.pad_sequences(tokenizer.texts_to_sequences(list_sentences_test),  maxlen=maxlen)

X_train, X_val, y_train, y_val = train_test_split(X_t, y, test_size=0.1, random_state=1337)

In [49]:
embedding = get_embedding(emb_file)

Embeddings:   2%|█                                                          | 1076/60000 [00:00<00:09, 5935.68 words/s]


In [50]:
class RocAucEvaluation(Callback):

    def __init__(self, verbose=True):
        super(RocAucEvaluation, self).__init__()
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        logs   = logs or {}
        x_val  = self.validation_data[0]
        y_val  = self.validation_data[1]
        y_pred = self.model.predict(x_val, verbose=0)
        try:
            current  = roc_auc_score(y_val, y_pred)
        except ValueError:
            # Bug in AUC metric when TP = 100%
            # https://github.com/scikit-learn/scikit-learn/issues/1257
            current = 1.0

        logs['roc_auc'] = current

        if self.verbose:
            print("val_roc_auc: {:.6f}".format(current))

def create_model(embedding=None):
    inp = Input(shape=(maxlen,))

    x = embedding(inp)
    x = Bidirectional(GRU(512, return_sequences=True))(x)
    x = Attention()(x)
    x = Dense(20, activation="softmax")(x)

    model = Model(inputs=inp, outputs=x)

    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3, clipnorm=4), metrics=['accuracy'])

    return model

In [51]:
K.clear_session()
seed_value= 0

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)

model = create_model(embedding)

In [52]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5000)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 5000, 300)         18000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 5000, 1024)        2500608   
_________________________________________________________________
attention (Attention)        (None, 1024)              102600    
_________________________________________________________________
dense (Dense)                (None, 20)                20500     
Total params: 20,623,708
Trainable params: 2,623,708
Non-trainable params: 18,000,000
_________________________________________________________________


In [None]:
batch_size = 32
epochs = 7

model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_val, y_val),
          callbacks=[
#               RocAucEvaluation(verbose=True),
              ModelCheckpoint(file_path,    monitor='val_accuracy', mode='max', save_best_only=True),
              EarlyStopping(patience=10,    monitor="val_accuracy", mode="max"),
              ReduceLROnPlateau(patience=0, monitor='val_accuracy', mode='max', cooldown=2, min_lr=1e-7, factor=0.3)
          ]
)

Epoch 1/7


In [42]:
# y_val
# X_train.shape
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=5000,stop_words=stopw)
vect
vect.fit_transform(X)

(1292, 250)

In [None]:
# model.load_weights(file_path)

# sample_submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
# sample_submission[list_classes] = model.predict(X_te, verbose=True)
# sample_submission.to_csv('submission.csv', index=False)