In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from keras import regularizers
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Embedding, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Directory holding the input CSV files, relative to this notebook.
PATH = '../../data/'

print('reading data')

# Both frames must provide a `comment_text_cleaned` column; it is rewritten
# further down in this cell and vectorised in a later one.
train, test = (
    pd.read_csv(PATH + file_name)
    for file_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

print('data cleaning')

# Shared helpers used by `clean` and `f`: the English stopword set and a
# tweet-aware tokenizer.
stopword = set(stopwords.words('english'))
tok = TweetTokenizer()

def clean(comment):
    """Tokenize a comment, drop English stopwords and re-join with spaces.

    Parameters
    ----------
    comment : str or missing value
        Raw comment text. Missing values (``None`` / ``NaN``, which is what
        pandas yields for empty CSV fields) are accepted.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or the placeholder
        ``'na'`` when the input is missing or nothing is left after
        stopword removal.
    """
    # A NaN float would make the tokenizer raise; treat it like an empty comment.
    if pd.isna(comment):
        return 'na'
    tokens = tok.tokenize(str(comment))
    kept = [w for w in tokens if w not in stopword]
    # An empty join result falls back to the same placeholder.
    return ' '.join(kept) or 'na'

# Remove stopwords from both frames. The column is overwritten with its own
# cleaned version, so re-running this cell feeds already-cleaned text
# through `clean` again.
train['comment_text_cleaned'] = train['comment_text_cleaned'].apply(clean)
test['comment_text_cleaned'] = test['comment_text_cleaned'].apply(clean)

print('calculations')

# Short aliases for the cleaned text columns.
train_sentence, test_sentence = train['comment_text_cleaned'], test['comment_text_cleaned']

def f(x):
    """Return how many tokens the shared tokenizer `tok` produces for `x`."""
    return len(tok.tokenize(x))

# Train and test text stacked into one Series; the TF-IDF vectorizer in a
# later cell is fitted on `text.values`.
text = pd.concat([train_sentence, test_sentence])

# Quick sanity check of the (rows, columns) of both frames.
print(train.shape)
print(test.shape)

reading data
data cleaning
calculations
(159571, 10)
(153164, 4)


In [4]:
print('getting tfidf')
# Word 1- to 3-gram TF-IDF features. The boolean options are passed as real
# booleans: recent scikit-learn releases validate parameter types and reject
# the integer 1 for `use_idf`, `smooth_idf` and `sublinear_tf`.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.9,
    min_df=100,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

print('fitting')
# The vocabulary and IDF weights are learned from train and test text together.
tfidf_vectorizer.fit(text.values)
print('transforming train')
train_tfidf = tfidf_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming test')
test_tfidf = tfidf_vectorizer.transform(test['comment_text_cleaned'].values)

# Target columns; one model is fitted per label further down.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Bare expression so the notebook displays the sparse matrix summary.
train_tfidf

getting tfidf
fitting
transforming train
transforming test


<159571x11786 sparse matrix of type '<class 'numpy.float64'>'
	with 4078557 stored elements in Compressed Sparse Row format>

In [20]:
def pr(y_i, y, x=None):
    """Smoothed per-feature mean of the rows whose label equals `y_i`.

    Parameters
    ----------
    y_i : scalar
        Label value that selects the rows.
    y : numpy.ndarray
        One label per row of the feature matrix.
    x : sparse matrix, optional
        Feature matrix to aggregate. When omitted, the notebook-level
        `train_tfidf` is used, which is what the original two-argument
        call did.

    Returns
    -------
    A single row holding ``(column sums of the selected rows + 1) /
    (number of selected rows + 1)``.
    """
    if x is None:
        # Backward-compatible default: fall back to the global training matrix.
        x = train_tfidf
    selected = y == y_i
    # The +1 in numerator and denominator keeps the value finite and non-zero
    # even when no row (or no feature occurrence) matches.
    return (x[selected].sum(0) + 1) / (selected.sum() + 1)

def get_nn_model(inp_len):
    """Build and compile the feed-forward network used for one label.

    The network takes a sparse input of width `inp_len`, applies two
    50-unit ReLU layers, dropout of 0.5, a third 50-unit ReLU layer with an
    L2 kernel penalty of 1e-4, and ends in a 2-unit sigmoid layer. It is
    compiled with binary cross-entropy, the Adam optimizer and accuracy as
    the reported metric.

    Returns the compiled Keras `Model`.
    """
    features = Input(shape=(inp_len,), sparse=True)

    hidden = Dense(50, activation='relu')(features)
    hidden = Dense(50, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(hidden)

    # Two sigmoid units: callers train them against [label, 1 - label].
    outputs = Dense(2, activation='sigmoid')(hidden)

    network = Model(inputs=features, outputs=outputs)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

def train_model(model, file_path, batch_size, epochs, X_train, y):
    """Fit `model` on (`X_train`, `y`) and return it.

    20% of the data is held out through `validation_split`. The weights with
    the lowest validation loss are written to `file_path`, and training
    stops early once validation loss has not improved for 20 epochs.
    """
    callbacks_list = [
        ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=20),
    ]
    model.fit(
        X_train,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks_list,
    )
    return model

def predict(model, file_path, X_test):
    """Load the weights stored at `file_path` into `model` and predict `X_test`.

    Returns whatever `model.predict(X_test, verbose=1)` returns.
    """
    model.load_weights(file_path)
    predictions = model.predict(X_test, verbose=1)
    return predictions

def _nb_log_count_ratio(features, labels):
    """Log of the ratio between the smoothed feature means of class 1 and class 0."""
    def smoothed_mean(mask):
        # Column sums over the selected rows, with add-one smoothing.
        return (features[mask].sum(0) + 1) / (mask.sum() + 1)

    return np.log(smoothed_mean(labels == 1) / smoothed_mean(labels == 0))


def run(label_cols, train_tfidf, train, test_tfidf, test_len, file_path, batch_size, epochs):
    """Train one network per label and collect its test predictions.

    Parameters
    ----------
    label_cols : list of str
        Columns of `train` holding the 0/1 targets.
    train_tfidf, test_tfidf : sparse matrices
        Train and test feature matrices with the same number of columns.
    train : pandas.DataFrame
        Frame that provides the label columns.
    test_len : int
        Number of rows of the returned prediction array.
    file_path : str
        Checkpoint file; it is rewritten for every label.
    batch_size, epochs : int
        Forwarded to `train_model`.

    Returns
    -------
    numpy.ndarray of shape ``(test_len, len(label_cols))`` holding, per
    label, the first output column of the fitted network.
    """
    preds = np.zeros((test_len, len(label_cols)))
    for col_idx, label in enumerate(label_cols):
        print('fit', label)
        labels = train[label]
        # Per-feature scaling vector, computed from the matrix passed in
        # rather than from a notebook-level global.
        r = _nb_log_count_ratio(train_tfidf, labels.values)
        # Two-column target: the label and its complement.
        target = labels.to_frame()
        target['2'] = 1 - labels
        model = get_nn_model(train_tfidf.shape[1])
        model = train_model(model, file_path, batch_size, epochs,
                            train_tfidf.multiply(r).tocsr(), target.values)
        preds[:, col_idx] = predict(model, file_path, test_tfidf.multiply(r).tocsr())[:, 0]
    return preds

def save(model_name, y_test, label_cols, sample_submission_file_path, path):
    """Write predictions into a copy of the sample submission file.

    Parameters
    ----------
    model_name : str
        Used for both the output sub-directory and the CSV file name.
    y_test : array-like
        Predictions, one column per entry of `label_cols`.
    label_cols : list of str
        Columns of the sample submission to overwrite.
    sample_submission_file_path : str
        CSV file that provides the submission layout.
    path : str
        Prefix of the output location; the file is written to
        ``path + model_name + '/' + model_name + '.csv'``.
    """
    import os

    submission = pd.read_csv(sample_submission_file_path)
    submission[label_cols] = y_test

    out_dir = path + model_name
    # Create the per-model directory so to_csv does not fail on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(out_dir + '/' + model_name + '.csv', index=False)
    
# Progress marker printed once this cell has run.
print('done')

done


In [21]:
# Run configuration. With EPOCHS = 2 the early-stopping patience of 20 set in
# `train_model` is never reached.
FILE_PATH = '../../model/nn_best.hdf5'
BATCH_SIZE = 32
EPOCHS = 2
sample_submission_file_path = PATH + 'sample_submission.csv'

print('predicting')
y_test = run(
    label_cols=label_cols,
    train_tfidf=train_tfidf,
    train=train,
    test_tfidf=test_tfidf,
    test_len=len(test),
    file_path=FILE_PATH,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

print('saving files')
save(
    model_name='nbnn',
    y_test=y_test,
    label_cols=label_cols,
    sample_submission_file_path=sample_submission_file_path,
    path=PATH,
)

print('done')

predicting
fit toxic
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
fit severe_toxic
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
fit obscene
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
fit threat
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
fit insult
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
fit identity_hate
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
saving files
done
