In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_auc_score

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU,Add
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam

import lightgbm as lgb

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
import re, string
%matplotlib inline
# Random seed constant for the notebook.
seed = 42
import os
# Cap the number of OpenMP threads.
# NOTE(review): this is assigned after numpy / scikit-learn / keras have already
# been imported above - confirm the cap actually takes effect in this kernel.
os.environ['OMP_NUM_THREADS'] = '4'
# Path of the pre-trained word-vector text file read by the embedding cell.
EMBEDDING_FILE = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

# Matches one punctuation character: ASCII punctuation plus a few typographic
# symbols. The ASCII set goes through re.escape so that the backslash contained
# in string.punctuation is matched literally; interpolated raw, it only escaped
# the "]" that follows it and backslash itself was never split off.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split ``s`` on whitespace, emitting every punctuation character as its own token."""
    return re_tok.sub(r' \1 ', s).split()
def pr(x, y_i, y):
    """
    Add-one smoothed column sums of the rows of ``x`` whose label equals ``y_i``.

    ``y`` must expose ``.values`` (e.g. a pandas Series). The rows of ``x``
    selected by ``y.values == y_i`` are summed along axis 0; 1 is added to that
    sum and the result is divided by (number of selected rows + 1).
    """
    row_mask = y.values == y_i
    column_totals = x[row_mask].sum(0)
    n_selected = row_mask.sum()
    return (column_totals + 1) / (n_selected + 1)

def clean(comment):
    """
    Normalise one raw comment and return the cleaned text.

    Steps, in order:
      1. lower-case the text, so that "Hi" and "hi" are the same;
      2. delete newline characters (they are removed, not replaced by a space);
      3. delete anything shaped like a dotted-quad IP address;
      4. delete ``[[ ... ]`` spans, which the original author used to strip
         user names.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned comment (a single string, not a list of words).
    """
    # Patterns are raw strings: "\d", "\." and "\[" are invalid escape
    # sequences in ordinary string literals and trigger a SyntaxWarning on
    # current Python versions.
    comment = comment.lower()
    # remove \n
    comment = re.sub(r"\n", "", comment)
    # remove leaky elements like ip,user
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # removing usernames
    # NOTE(review): ".*" is greedy, so everything from the first "[[" up to the
    # last "]" of the comment is removed - confirm this is intended.
    comment = re.sub(r"\[\[.*\]", "", comment)

    return comment

def enrich_indirect_features(df):
    """
    Add simple count-based features to ``df`` in place.

    Reads ``df["comment_text"]`` and adds three columns:

    * ``count_punctuations`` - number of ``string.punctuation`` characters;
    * ``word_unique_percent`` - unique words * 100 / words, with 1 substituted
      where the ratio is undefined (a comment with no words);
    * ``punct_percent`` - punctuation count * 100 / words, with 0 substituted
      where the ratio is undefined.

    Words are the whitespace-separated pieces of ``str(comment)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column; modified in place.

    Returns
    -------
    None
    """
    # The word counts are only intermediates, so they stay local Series
    # instead of being written to the frame and dropped again.
    word_lists = df["comment_text"].apply(lambda text: str(text).split())
    count_word = word_lists.apply(len)
    count_unique_word = word_lists.apply(lambda words: len(set(words)))

    punctuation_chars = set(string.punctuation)
    df["count_punctuations"] = df["comment_text"].apply(
        lambda text: sum(ch in punctuation_chars for ch in str(text))
    )

    # Assign the filled result explicitly: calling
    # `df[col].fillna(..., inplace=True)` acts on a temporary selection and does
    # not update `df` once pandas Copy-on-Write is active.
    df['word_unique_percent'] = (count_unique_word * 100 / count_word).fillna(1)
    df['punct_percent'] = (df['count_punctuations'] * 100 / count_word).fillna(0)

In [None]:
# Read the train / test CSVs and report how long the read took.
start_time = time.time()
path = '../input/jigsaw-toxic-comment-classification-challenge/'
train, test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))
print('[{}] Finished to load data'.format(time.time() - start_time))
print('Number of rows and columns in the train data set:', train.shape)
print('Number of rows and columns in the test data set:', test.shape)

# Submission skeleton: starts with only the test ids.
submission = pd.DataFrame({'id': test['id']})

# Stack train on top of test so both receive identical preprocessing;
# nrow_train is the row position where the test part begins.
nrow_train = len(train)
merge: pd.DataFrame = pd.concat([train, test])

# Per the original note, train has no NaN values; presumably the fill is for
# columns that exist only in train and are therefore empty on the test rows.
merge.fillna(' ', inplace=True)
merge['comment_text'] = merge['comment_text'].apply(clean)
enrich_indirect_features(merge)
gc.collect()

In [None]:
# SELECTED_COLS = ['word_unique_percent','punct_percent']

# # Word ngram vector
# vect_word = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='word', tokenizer=tokenize,
#                         stop_words= 'english',ngram_range=(1,2),dtype=np.float32)
# merge_word = vect_word.fit_transform(merge['comment_text'])
# print('[{}] TFIDF vectorize `vect_word` completed.'.format(time.time() - start_time))

# # Character n gram vector
# vect_char = TfidfVectorizer(max_features=40000, lowercase=True, analyzer='char',
#                         stop_words= 'english',ngram_range=(2,6),dtype=np.float32)
# merge_char = vect_char.fit_transform(merge['comment_text'])
# print('[{}] TFIDF vectorize `vect_char` completed.'.format(time.time() - start_time))

# num_clusters = 30 # need to be selected wisely
# kmeans_model_char = MiniBatchKMeans(n_clusters=num_clusters,
#                                init='k-means++',
#                                n_init=1,
#                                init_size=1000, batch_size=1000, verbose=0, max_iter=1000)
# merge_char_kmeans = kmeans_model_char.fit_transform(merge_char)
# print('[{}] K means clustering  `merge_char_kmeans` completed.'.format(time.time() - start_time))

# sparse_merge = sparse.hstack([merge_word, merge_char_kmeans, merge[SELECTED_COLS]]).tocsr()

# del merge_word, merge_char, merge, merge_char_kmeans
# gc.collect()
    

# x_train = sparse_merge[:nrow_train]
# x_test = sparse_merge[nrow_train:]

# Label columns, and the matching label matrix taken from the training frame
# (one row per training comment, one column per label).
target_col = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
y_train = train.loc[:, target_col].values

In [None]:
# x_train[y_train['toxic'].values==1].sum(0)
# (y_train['toxic'].values==0).sum()
# def pr(x, y_i, y):
#     p = x[y==y_i].sum(0)
#     return (p+1) / ((y==y_i).sum()+1)

In [None]:
# #Logistic Regression
# prd = np.zeros((x_test.shape[0],y_train.shape[1]))
# model_val = np.zeros((x_train.shape[0],y_train.shape[1]))
# # cv_score =[]
# for i,col in enumerate(target_col):
#     r = sparse.csr_matrix(np.log(pr(x_train, 1,y_train[col]) / pr(x_train, 0,y_train[col])))
#     x_train_nb = x_train.multiply(r)
#     lr = LogisticRegression(C=2,random_state = i,class_weight = 'balanced')
#     print('Building {} model for column:{''}'.format(i,col)) 
#     lr.fit(x_train_nb,y_train[col])
#     #cv_score.append(lr.score)
    
#     prd[:,i] = lr.predict_proba(x_test.multiply(r))[:,1]
# #     prd[:,i] = lr.predict_proba(x_test)[:,1]
    
#     model_val[:,i] = lr.predict(x_train_nb)
# submission = pd.DataFrame(prd,columns=y_train.columns)
# model_val = pd.DataFrame(model_val,columns=y_train.columns)

In [None]:
# # LightGBM with Select K Best
# for i,col in enumerate(target_col):
#     print(col)
#     train_target = y_train[col]
#     train_sparse_matrix = x_train
# #     model = LogisticRegression(solver='sag')
# #     sfm = SelectFromModel(model, threshold=0.2)
# #     train_sparse_matrix = sfm.fit_transform(x_train.tocoo(), train_target)
# #     print(train_sparse_matrix.shape)
#     train_sparse_matrix, valid_sparse_matrix, lgbm_y_train, lgbm_y_valid = train_test_split(train_sparse_matrix, train_target, test_size=0.05, random_state=144)
# #     test_sparse_matrix = sfm.transform(x_test.tocoo())
#     test_sparse_matrix = x_test
#     d_train = lgb.Dataset(train_sparse_matrix, label=lgbm_y_train)
#     d_valid = lgb.Dataset(valid_sparse_matrix, label=lgbm_y_valid)
#     watchlist = [d_train, d_valid]
#     params = {'learning_rate': 0.2,
#               'application': 'binary',
#               'num_leaves': 31,
#               'verbosity': -1,
#               'metric': 'auc',
#               'data_random_seed': 2,
#               'bagging_fraction': 0.8,
#               'feature_fraction': 0.6,
#               'nthread': 4,
#               'lambda_l1': 1,
#               'lambda_l2': 1}
#     rounds_lookup = {'toxic': 140,
#                  'severe_toxic': 50,
#                  'obscene': 80,
#                  'threat': 80,
#                  'insult': 70,
#                  'identity_hate': 80}
#     model = lgb.train(params,
#                       train_set=d_train,
#                       num_boost_round=rounds_lookup[col],
#                       valid_sets=watchlist,
#                       verbose_eval=10)
#     submission[col] = model.predict(test_sparse_matrix)

In [None]:
# keras text pipeline settings
max_features = 30000   # vocabulary cap given to the tokenizer
maxlen = 150           # length every token sequence is padded / truncated to
embed_size = 300       # width of one embedding vector

# Cleaned comment text, split back into its train and test parts by position.
x_comment_train = merge['comment_text'].iloc[:nrow_train]
x_comment_test = merge['comment_text'].iloc[nrow_train:]
print("x_train shape: ", x_comment_train.shape)

# The vocabulary is fitted on train and test text together.
tokenizer = text.Tokenizer(num_words=max_features, lower=True)
tokenizer.fit_on_texts(list(x_comment_train) + list(x_comment_test))

# Text -> integer id lists -> fixed-length integer matrices.
x_comment_train = tokenizer.texts_to_sequences(x_comment_train)
x_comment_test = tokenizer.texts_to_sequences(x_comment_test)
x_train_seq = sequence.pad_sequences(x_comment_train, maxlen=maxlen)
x_test_seq = sequence.pad_sequences(x_comment_test, maxlen=maxlen)
print("x_train shape after pad sequence: ", x_train_seq.shape)

# The training frame carries the two hand-made features next to the token ids
# so that a later split keeps them aligned row by row.
x_train = pd.DataFrame(x_train_seq).assign(
    word_unique_percent=merge['word_unique_percent'].iloc[:nrow_train],
    punct_percent=merge['punct_percent'].iloc[:nrow_train],
)
x_test = pd.DataFrame(x_test_seq)
# x_test['word_unique_percent']  = merge['word_unique_percent'][nrow_train:]
# x_test['punct_percent'] = merge['punct_percent'][nrow_train:]

In [None]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a ``(word, float32 vector)`` pair."""
    return word, np.asarray(arr, dtype='float32')

# Parse the embedding file inside a context manager so the handle is closed.
# fastText ``.vec`` files start with a "<vocab size> <dim>" header line; that
# line, and any other line without exactly ``embed_size`` components, is
# skipped because vectors of differing length make ``np.stack`` below fail.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*fields)
        for fields in (line.rstrip().rsplit(' ') for line in embedding_file)
        if len(fields) == embed_size + 1
    )

# np.stack requires a real sequence; a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# Tokenizer ids start at 1 (0 is the padding id), so a vocabulary of V words
# needs V + 1 rows unless the max_features cap is hit first.
nb_words = min(max_features, len(word_index) + 1)
# embedding_matrix = np.zeros((nb_words, embed_size))
# Rows of words without a pre-trained vector keep a random draw matching the
# overall mean / std of the pre-trained vectors; seeded for reproducibility.
embedding_matrix = np.random.RandomState(seed).normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the real matrix height so the row index is always in range.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Two-input network: padded token ids plus the two hand-made numeric features.
sequence_input = Input(shape=(maxlen, ))
# x = Embedding(max_features, embed_size)(sequence_input)
# input_dim is read from the weight matrix itself so the layer can never
# disagree with the number of rows actually built for the embeddings.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(sequence_input)

# Second input: (word_unique_percent, punct_percent) for each comment.
input2 = Input(shape=(2,))

x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(80, return_sequences=True,dropout=0.1,recurrent_dropout=0.1))(x)
# x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)

# Pooled sequence features and the numeric features meet here.
x = concatenate([avg_pool, max_pool, input2]) # Add input2 into this layer
# x = concatenate([avg_pool, max_pool])

# x = Dense(128, activation='relu')(x)
# x = Dropout(0.1)(x)
# Six independent sigmoid outputs, trained with binary cross-entropy.
preds = Dense(6, activation="sigmoid")(x)
model = Model(inputs=[sequence_input, input2], outputs=preds)
# model = Model(inputs=sequence_input, outputs=preds)
model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3),metrics=['accuracy'])

In [None]:
# Training settings.
batch_size = 64
epochs = 4
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)

# 90 / 10 train / validation split with a fixed random_state.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)

# Separate the two numeric feature columns (second model input) from the
# token-id columns (first model input) in both splits.
_aux_cols = ['word_unique_percent', 'punct_percent']
x_input2, X_val2 = X_tra[_aux_cols], X_val[_aux_cols]
X_tra, X_val = X_tra.drop(columns=_aux_cols), X_val.drop(columns=_aux_cols)

# The same two features for the test rows, taken by position from `merge`.
x_test2 = merge.iloc[nrow_train:][_aux_cols]

class RocAucEvaluation(Callback):
    """
    Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair. ``X_val`` is passed unchanged to
        ``model.predict`` and ``y_val`` to ``roc_auc_score``. The default
        ``()`` cannot be unpacked, so a pair must be supplied in practice.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; the previous
        # `super(Callback, self)` call skipped `Callback.__init__`.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the validation ROC-AUC; ``logs`` is accepted but not used."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
# Checkpoint file that keeps the weights of the best epoch (highest val_acc).
filepath = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Early stopping on the same metric. With epochs = 4 set in the training
# settings cell, a patience of 5 cannot trigger.
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)
# ROC-AUC report on the validation split after every epoch (two-input form).
ra_val = RocAucEvaluation(validation_data=([X_val, X_val2], y_val), interval=1)
# ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval = 1)
callbacks_list = [ra_val, checkpoint, early]

In [None]:
# Train on the two-input data, validating on the held-out split.
model.fit(
    [X_tra, x_input2],
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, X_val2], y_val),
    callbacks=callbacks_list,
    verbose=1,
)
# model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),callbacks = callbacks_list,verbose=1)

# Restore the checkpointed best weights before predicting.
model.load_weights(filepath)
print('Predicting....')
y_pred = model.predict([x_test, x_test2], batch_size=1024, verbose=1)
# y_pred = model.predict(x_test,batch_size=1024,verbose=1)

# Give the submission its final column layout, then fill in the predictions.
_label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submission = submission.reindex(columns=["id"] + _label_cols)
submission[_label_cols] = y_pred

In [None]:
#Model validation
# score = roc_auc_score(model_val, y_train)
# print("\n ROC-AUC - score: %.6f \n" % (score))

In [None]:
# Write the submission frame to CSV without the index column, then show its
# first rows as the cell output.
submission.to_csv('toxic_comment_classification.csv',index=False)
submission.head()