<a href="https://colab.research.google.com/github/ispapadakis/nlp/blob/master/WordVectorsTextMine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing:<br>
Text Sentiment Classification Applied to Short Text

Data from Quora Insincere Questions Classification Competition

### Features

- Text Analysis Using Pretrained Word Vectors
- Fit Bidirectional RNN
- Apply Custom Keras Loss Function to Maximize F1-Score
- Accept Misspellings of Most Common Words

In [None]:
# Mount Google Drive inside the Colab VM so the data files stored there can be read.
# This step is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

import gc
import re
from datetime import datetime as dt
from difflib import get_close_matches
import matplotlib.pyplot as plt


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, concatenate, Dense, Dropout
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Bidirectional, CuDNNGRU
from keras.models import Model
from keras.initializers import glorot_normal, orthogonal
from keras.optimizers import Adam, RMSprop
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau

#from keras import initializers, regularizers, constraints, optimizers, callbacks
import keras.backend as K

Using TensorFlow backend.


In [None]:
# Load the competition train / test sets from the mounted Drive folder.
# The paths are relative, so they resolve against the notebook's working directory.
train = pd.read_csv("gdrive/My Drive/Colab Notebooks/train.csv")
test = pd.read_csv("gdrive/My Drive/Colab Notebooks/test.csv")

### Clean Text With Emphasis on Non-English Special Characters

In [None]:
# Every ASCII punctuation character, exactly once.
# (The previous literal was written with regex-style escapes, which put the backslash
# into the list six times and relied on invalid string escape sequences.)
puncts = list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
def clean_text(x):
    """Normalise one question so that a whitespace tokenizer yields clean tokens.

    Steps, in order:
      1. replace invisible / control / non-breaking characters (and two Hindi
         words) with a plain space;
      2. map typographic quotes, acute accents and backticks to an apostrophe;
      3. turn a spaced ellipsis character into three dots;
      4. re-attach ordinal suffixes that were split off ("5 th " -> "5th ");
      5. surround every punctuation character in `puncts` with spaces so that
         it becomes a token of its own.

    Parameters
    ----------
    x : str
        Raw question text. Must be a string (NaN / None are not handled).

    Returns
    -------
    str
        The cleaned text.
    """
    spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d',
                '\xad', '\xa0', 'करना', 'है']
    for s in spaces:
        x = x.replace(s, ' ')

    x = re.sub('[’‘´`”“]', "'", x)
    x = re.sub(' … ', " ... ", x)
    # Raw replacement string: '\g' is not a valid Python string escape
    x = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', x)

    for punct in puncts:
        x = x.replace(punct, f' {punct} ')

    return x

# Clean the question text of both data sets (clean_text is passed directly; no lambda wrapper needed)
train["question_text"] = train["question_text"].apply(clean_text)
test["question_text"] = test["question_text"].apply(clean_text)

### Tokenize

In [None]:
## Parameters
embed_size = 300 # word vector size (the word-vector file reader expects 1 token + 300 values per line)
max_features = None # how many unique words to use (i.e. number of rows in the embedding matrix); passed to the Tokenizer as num_words, later reset to len(word_index) + 1
maxlen = 70 # Fixed sequence length in tokens: every question is padded / truncated to this length by pad_sequences

In [None]:
## Tokenize the sentences
## Case is kept (lower = False) and no character filter is applied (filters = ""):
## clean_text has already put spaces around every punctuation character.
tokenizer = Tokenizer(num_words = max_features, lower = False, filters = "") # Expect that lower case and capitalized have diff meanings
## The vocabulary is built from the training questions only
tokenizer.fit_on_texts(train["question_text"])

In [None]:
## Convert every question into a list of integer word indices.
## NOTE(review): the tokenizer was fitted on train only and no oov_token is configured,
## so test words that never occur in train are presumably dropped - confirm.
X = tokenizer.texts_to_sequences(train["question_text"])
X_test = tokenizer.texts_to_sequences(test["question_text"])

In [None]:
## Pad the sentences
## Every sequence is brought to exactly `maxlen` entries.
## NOTE(review): pad_sequences is called with its default padding / truncating modes -
## check the Keras docs to confirm which end of an over-long question is cut off.
X = pad_sequences(X, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

In [None]:
## vector of target values
Y = train['target'].values
## submission data frame (question ids only; the 'prediction' column is added at the end of the notebook)
sub_ = test[['qid']]

In [None]:
# The raw data frames are not used after this point: drop them and run the garbage collector
del train, test
gc.collect()

14

In [None]:
# word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
print("Word Index Entries: ",len(word_index))
# Lower-cased vocabulary, used by word_vectors for the case-insensitive fallback lookup
lower_case_word = set(t.lower() for t in word_index)
# Number of rows of the embedding matrix: one per vocabulary word plus one extra row
# (row 0 is zeroed after the matrix is built - presumably the padding index; confirm)
max_features = len(word_index) + 1
# Cache of spelling corrections {unknown word: [closest known word]}; filled by the first word_vectors call
approx = dict()

Word Index Entries:  240750


### Read Pretrained Word Vectors

Use: Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)

If a vocabulary word has no entry in the pretrained word vectors:

- Check if different capitalization is available
- Check for misspellings of most common words

In [None]:
#%%time
#glove = pd.read_csv('glove.840B.300d.txt', sep=" ", quoting=3, header=None, index_col=0, na_filter=False)
#CPU times: user 1min 16s, sys: 26.3 s, total: 1min 43s
#Wall time: 4min 49s

In [None]:
def word_vectors(embedding_file):
    """Build the embedding matrix for the tokenizer vocabulary from a text word-vector file.

    Relies on the module-level names `word_index`, `lower_case_word`,
    `max_features`, `embed_size` and `approx`.

    A vocabulary word gets its vector from the first of these that applies:
      1. an exact (case-sensitive) match in the file;
      2. a match on the lower-cased word (if the file holds several cased
         variants, the last one read wins);
      3. for words longer than 5 characters with index below 100000: the vector
         of the closest word (difflib ratio >= 0.80) among known words longer
         than 5 characters with index below 30000;
      4. otherwise a random normal vector whose mean / std are taken over all
         entries of the known vectors.

    Parameters
    ----------
    embedding_file : str
        Path of a whitespace-separated text file with one token followed by
        `embed_size` numbers per line. Lines with any other number of fields
        (headers, tokens containing spaces) are skipped.

    Returns
    -------
    numpy.ndarray
        Array of shape (max_features, embed_size); row i belongs to the word
        whose index in `word_index` is i.

    Side effects
    ------------
    * Fills the global `approx` spelling-correction cache if it is empty
      (this is the slow step) and prints two summary lines when doing so.
    * Re-seeds numpy's global random generator with 2019.
    """
    global approx
    embeddings_index = dict()
    ignore_case_index = dict()

    # Read Word Vector File
    # (context manager closes the file even if parsing fails; encoding given explicitly
    #  so the result does not depend on the platform default)
    with open(embedding_file, encoding='utf-8', errors='ignore') as f:
        for line in f:
            lst = line.split()
            if len(lst) != embed_size + 1: # Some files have headers, ignore them
                continue
            token = lst[0]
            token_lower = token.lower()
            wanted_exact = token in word_index
            wanted_lower = token_lower in lower_case_word
            if not (wanted_exact or wanted_lower):
                continue
            # Parse the numbers once and share the array between both indices
            vector = np.asarray(lst[1:], dtype='float32')
            if wanted_exact:
                embeddings_index[token] = vector
            if wanted_lower:
                ignore_case_index[token_lower] = vector

    # Add words not in word vector keys if lower case of word is in
    for word in word_index:
        if word not in embeddings_index and word.lower() in ignore_case_index:
            embeddings_index[word] = ignore_case_index[word.lower()]

    # if a) word is of length > 5 and b) word is among the 100000 most common
    #    and c) not in known words, then add to question marks list
    qmark = [w for w in word_index if len(w)>5 and word_index[w]<100000 and w not in embeddings_index]

    # if approx dict is still empty (first call), build the spelling corrections
    if not approx:
        # add to solid words list words of a) length > 5, b) among the 30000 most common, and c) known
        solid = [w for w in word_index if len(w)>5 and word_index[w]<30000 and w in embeddings_index]
        # find close matches to question mark words among solid words
        approx = {w:get_close_matches(w, solid, n=1, cutoff=0.80) for w in qmark}
        # Manual correction. Stored as a one-element list like every other entry:
        # a bare string here would make the lookup below pick its first character.
        approx["Quorans"] = ["Quoran"]
        print("Spelll Checking --> Solids = ", len(solid)," QMarks = ",len(qmark))
        print("Corrections = ",sum(bool(v) for v in approx.values()))

    # if there is an approximation for a question mark use its word vector
    for w in qmark:
        match = approx.get(w)
        if not match:
            continue
        # Tolerate a plain string in a pre-populated cache as well as a list
        a = match if isinstance(match, str) else match[0]
        if a in embeddings_index:
            embeddings_index[w] = embeddings_index[a]

    # Calculate overall mean and std across all entries of the known word vectors
    all_embs = np.stack(tuple(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()

    np.random.seed(2019)
    # KEY!! assign random vectors to unknown words
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]

    return embedding_matrix

In [None]:
%%time

# Build the embedding matrix from the GloVe Common Crawl vectors (slow: see the timing below)
embedding_matrix = word_vectors('gdrive/My Drive/Colab Notebooks/glove.840B.300d.txt')
# Row 0 is not assigned to any word by word_vectors; zero it out
# (presumably the padding index written by pad_sequences - confirm)
embedding_matrix[0,] = 0

Spelll Checking --> Solids =  20112  QMarks =  3050
Corrections =  1152
CPU times: user 4min 43s, sys: 4.35 s, total: 4min 47s
Wall time: 5min 34s


In [None]:
#[(k,v) for k,v in approx.items() if v]

In [None]:
gc.collect()

0

In [None]:
##### CUSTOM LOSS FUNCTION

# Weighted Binary Crossentropy

def weighted_binary_crossentropy_init(dev_from_1_weight = 0.5):
    """Create a binary cross-entropy loss with separate weights for the two classes.

    Parameters
    ----------
    dev_from_1_weight : float
        Weight of the log-loss term for rows with y_true = 1.
        Rows with y_true = 0 are weighted by 1 - dev_from_1_weight.

    Returns
    -------
    callable
        wbce_loss(y_true, y_pred) built from Keras backend ops; it averages the
        weighted loss over the last axis.
    """
    pos_weight = dev_from_1_weight
    neg_weight = 1 - dev_from_1_weight

    def wbce_loss(y_true, y_pred):
        # Clip from below only, so that log() never sees zero
        log_p = K.log(K.clip(y_pred, K.epsilon(), None))
        log_not_p = K.log(K.clip(1.0 - y_pred, K.epsilon(), None))
        pos_term = pos_weight * y_true * log_p
        neg_term = neg_weight * (1.0 - y_true) * log_not_p
        return K.mean(-(pos_term + neg_term), axis=-1)

    return wbce_loss


# Negative of F1 Score at Given Threshold

def f1_loss(thresh = 0.35, smooth = 1e-5):
    """Create a function returning the negative F1 score at a fixed probability threshold.

    Parameters
    ----------
    thresh : float
        Probability cutoff separating the two predicted classes.
    smooth : float
        Small constant added to numerator and denominator.

    Returns
    -------
    callable
        f1_neg(y_true, y_pred) built from Keras backend ops. Predictions are
        pushed through a very steep sigmoid centred on `thresh`, which acts as
        a near-step function, before the F1 formula is applied.
    """
    def f1_neg(y_true, y_pred):
        hard_pred = K.sigmoid( 1e6 * (y_pred - thresh))
        truth = K.flatten(y_true)
        guess = K.flatten(hard_pred)
        overlap = K.sum(truth * guess)
        numerator = 2. * overlap + smooth
        denominator = K.sum(truth) + K.sum(guess) + smooth
        return -numerator / denominator

    return f1_neg


In [None]:
# Quick Approximation of Max F1-Score
def f1_smart(y_true, y_pred):
    """Find the probability cutoff that maximises F1 in a single sorted pass.

    Candidate cutoffs lie between consecutive predictions in ascending order,
    so at least the highest-scored row is always predicted positive and at
    least the lowest-scored row negative.

    Parameters
    ----------
    y_true : numpy.ndarray
        1-D array of 0/1 labels.
    y_pred : numpy.ndarray
        1-D array of predicted scores, same length as y_true (at least 2 rows).

    Returns
    -------
    tuple
        (best F1 score, cutoff), where the cutoff is the midpoint of the two
        neighbouring predictions around the best split.
    """
    order = np.argsort(y_pred)
    n_pos = y_true.sum()
    n_obs = y_true.shape[0]
    # True positives remaining above the split after dropping the k+1 lowest scores
    positives_left = n_pos - np.cumsum(y_true[order[:-1]])
    # (actual positives + predicted positives) for each split
    denominators = np.arange(n_obs + n_pos - 1, n_pos, -1)
    half_f1 = positives_left / denominators
    best = np.argmax(half_f1)
    cutoff = y_pred[order[best:best + 2]].mean()
    return 2 * half_f1[best], cutoff

In [None]:
# Training loss: the y_true = 1 term is weighted 0.40, the y_true = 0 term 1 - 0.40 = 0.60
weighted_binary_crossentropy = weighted_binary_crossentropy_init(0.40)

In [None]:
### Neural Network Configuration

def nn_config_a(gru_units=64, dense_units=16, dropout_rate=0.40):
    """Build and compile the bidirectional-GRU classifier.

    Architecture: frozen pretrained embedding -> bidirectional CuDNNGRU returning
    the full sequence -> global max pooling and global average pooling
    (concatenated) -> two (Dense relu + Dropout) stages -> sigmoid output unit.

    Uses the module-level `maxlen`, `max_features`, `embed_size`,
    `embedding_matrix` and `weighted_binary_crossentropy`. The Keras session is
    cleared first, so any previously built model is discarded.

    Parameters
    ----------
    gru_units : int
        Units per GRU direction.
    dense_units : int
        Width of each of the two hidden dense layers.
    dropout_rate : float
        Dropout rate applied after each hidden dense layer.

    Returns
    -------
    keras.models.Model
        Model compiled with the weighted cross-entropy loss, RMSprop
        (clipvalue=0.5) and the negative-F1 metric.
    """
    K.clear_session()

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)

    recurrent = CuDNNGRU(gru_units, return_sequences=True,
                         kernel_initializer=glorot_normal(seed=12300),
                         recurrent_initializer=orthogonal(gain=1.0, seed=10000))
    states = Bidirectional(recurrent)(embedded)

    # Summarise the sequence of hidden states in two ways and join them
    pooled = concatenate([GlobalMaxPooling1D()(states), GlobalAveragePooling1D()(states)])

    hidden = pooled
    for _ in range(2):
        hidden = Dense(dense_units, activation="relu", kernel_initializer=glorot_normal(seed=12300))(hidden)
        hidden = Dropout(dropout_rate)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss=weighted_binary_crossentropy, optimizer=RMSprop(clipvalue=0.5), metrics=[f1_loss()])
    return model

In [None]:
# Stratified K-fold cross-validation; fixed seed so the folds are reproducible
kfold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
# Number of folds, taken from the splitter so the test-set average below stays
# correct if n_splits is changed
n_folds = kfold.get_n_splits()
# Per-fold diagnostics: stats[statistic][fold number]
stats = {"f1":dict(), "cutoff":dict(), "pctile":dict(), "avg_prob":dict()}
# Average of the test-set predictions over all folds (column vector)
y_test = np.zeros((X_test.shape[0],)).reshape([-1,1])

# Training schedule
#   lr1: one learning rate per epoch of the first stage (embedding layer frozen)
#   lr2: one learning rate per extra single-epoch pass with the embedding layer trainable
p  = {"batch_size":512, "lr1":[0.0010,0.0010,0.0005,0.0005], "lr2":[2e-5]}

# Out-of-fold results; rows are: true label, predicted probability, fold number
oof = np.array([[],[],[]])

t0 = dt.now()
__show_model__ = True
for i, (train_index, valid_index) in enumerate(kfold.split(X,Y)):
    # Fresh model for every fold (nn_config_a also clears the Keras session)
    model = nn_config_a()
    if __show_model__:
        # Print the architecture once only
        model.summary()
        __show_model__ = False

    print("\n\nFold:",i)
    X_train, X_val, Y_train, Y_val = X[train_index], X[valid_index], Y[train_index], Y[valid_index]

    # Stage 1: train with the per-epoch learning rates in p["lr1"]
    lrate = lambda epoch: p["lr1"][epoch]
    lr_schedule = LearningRateScheduler(lrate, verbose=0)
    callbacks = [lr_schedule]
    model.fit(X_train, Y_train, batch_size=p["batch_size"], epochs=len(p["lr1"]),
        validation_data=(X_val, Y_val), verbose=2, callbacks=callbacks)
    # total_seconds() gives the full elapsed time; .seconds would wrap around after 24 h
    print("Elapsed Time: ", np.round((dt.now()-t0).total_seconds() / 60,2), " min")

    # Stage 2: fine-tune with the embedding layer trainable, one epoch per rate in p["lr2"]
    for lr_curr in p["lr2"]:
        model.layers[1].trainable = True   # Embedding

        # Changing `trainable` only takes effect after the model is compiled again
        f1_neg = f1_loss()
        model.compile(loss = weighted_binary_crossentropy, optimizer = RMSprop(lr=lr_curr, clipvalue=0.5), metrics = [f1_neg])
        model.fit(X_train, Y_train, batch_size=2048, epochs=1,
            validation_data=(X_val, Y_val), verbose=2) #, class_weight = {0:1,1:1.5})

        print("Elapsed Time: ", np.round((dt.now()-t0).total_seconds() / 60,2), " min")

    # Evaluate on the held-out fold and accumulate out-of-fold / test predictions
    y_pred = model.predict(X_val, batch_size=1024).flatten()
    f1, threshold = f1_smart(Y_val, y_pred)
    oof = np.append(oof,[Y_val, y_pred, np.full((X_val.shape[0],),i)], axis = 1)
    y_test += model.predict(X_test, batch_size=1024) / n_folds

    print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
    print("F1(0.34) = ",f1_score(Y_val,y_pred>=0.34),"\n\n\n")
    stats["f1"][i] = f1
    stats["cutoff"][i] = threshold
    # Share of validation rows predicted positive at the optimal cutoff
    stats["pctile"][i] = np.round(np.mean(y_pred >= threshold),5)
    stats["avg_prob"][i] = y_pred.mean()

print("Elapsed Time: ", np.round((dt.now()-t0).total_seconds() / 60,2), " min")

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 70)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 70, 300)      72225300    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 70, 128)      140544      embedding_1[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 128)          0           bidirectional_1[0][0]            
__________________________________________________________________________________________________
global_ave

In [None]:
# Table of per-fold statistics: a header row, then one row per statistic
# with a column for every fold that has a recorded value.
fold_ids = range(5)
header_cells = ['Fold {} '.format(i) for i in fold_ids]
print('{:10}'.format(".") + ' '.join(header_cells))
for m in stats:
    per_fold = stats[m]
    cells = ['{:7.4f}'.format(per_fold[i]) for i in fold_ids if i in per_fold]
    out = '{:9}'.format(m) + " ".join(cells)
    print(out)

.         Fold 0  Fold 1  Fold 2  Fold 3  Fold 4 
f1        0.6746  0.6817  0.6790  0.6750  0.6817
cutoff    0.2946  0.2411  0.2970  0.2836  0.2242
pctile    0.0721  0.0709  0.0735  0.0687  0.0719
avg_prob  0.0437  0.0461  0.0434  0.0442  0.0465


In [None]:
# Best F1 and cutoff over the pooled out-of-fold predictions
# (oof row 0 = true labels, row 1 = predicted probabilities)
oof_f1, oof_cutoff = f1_smart(oof[0],oof[1])
print('OOF F1 = ',oof_f1," at ",oof_cutoff)
# Hard labels at the pooled cutoff, cross-checked with sklearn's f1_score
oof_labels = pd.Series(oof[1] >= oof_cutoff, name="pred")
print("F1 Score at OOF Cutoff: {:7.4f}".format(f1_score(oof[0],oof_labels)))
# Confusion table: true label in rows, predicted label in columns
print(pd.crosstab(pd.Series(oof[0],name="true"), oof_labels))
# Save the out-of-fold array (labels, probabilities, fold numbers) to disk
np.save("score.npy",oof)

OOF F1 =  0.6767031275509015  at  0.27970461547374725
F1 Score at OOF Cutoff:  0.6767
pred    False  True 
true                
0.0   1192644  32668
1.0     22780  58030


In [None]:
# Cutoff applied to the fold-averaged test predictions
optimal_threshold = oof_cutoff
#optimal_threshold = 0.34
# y_test is a column vector (n, 1): flatten it so the new column is plainly 1-D.
# '>=' matches the comparison used for the validation and out-of-fold labels.
sub_['prediction'] = (y_test.ravel() >= optimal_threshold).astype(int)
sub_.to_csv("submission.csv", index=False)