In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from keras import regularizers
from keras.models import Model
from keras.layers import Dense, Input, Dropout, Embedding, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from scipy.sparse import csr_matrix, hstack

Using TensorFlow backend.


In [2]:
# Data directory: the cleaned CSVs are read from here and save() is later called with it too.
PATH = '../../data/'

print('reading data')

# Both frames are loaded before the next progress message is printed.
train, test = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

print('data cleaning')

# Module-level helpers: `stopword` is read by clean(); `tok` is read by clean() and f().
stopword = set(stopwords.words("english"))
tok = TweetTokenizer()

def clean(comment):
    """Tokenize a comment, drop English stop words and re-join with single spaces.

    Parameters
    ----------
    comment : str
        Raw comment text. Anything that is not a string (for example the NaN
        that ``pd.read_csv`` produces for an empty cell, or ``None``) is
        treated as an empty comment.

    Returns
    -------
    str
        The surviving tokens joined by spaces, or the placeholder ``'na'``
        when nothing is left.
    """
    # Non-string cells cannot be tokenized; map them to the same placeholder
    # used for comments that end up empty instead of raising mid-apply.
    if not isinstance(comment, str):
        return 'na'
    kept = [w for w in tok.tokenize(comment) if w not in stopword]
    # An empty join result is falsy, so `or` supplies the placeholder.
    return ' '.join(kept) or 'na'

# Overwrite the cleaned-text column of each frame with its stop-word-filtered version.
train['comment_text_cleaned'] = train['comment_text_cleaned'].apply(clean)
test['comment_text_cleaned'] = test['comment_text_cleaned'].apply(clean)

print('calculations')

# Aliases for the cleaned text columns.
train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

def f(x):
    """Return how many tokens the module-level tokenizer `tok` produces for `x`."""
    return len(tok.tokenize(x))

# Train text followed by test text, as a single Series.
text = pd.concat((train_sentence, test_sentence))

# Same two output lines as printing each shape separately.
print(train.shape, test.shape, sep='\n')

reading data
data cleaning
calculations
(159571, 26)
(153164, 20)


In [3]:
# Two TF-IDF views of the cleaned text, both fitted on train + test (`text`).
# NOTE: despite its name, `char_vectorizer` is not character-level: no `analyzer`
# argument is passed, so it builds word unigrams (ngram_range=(1, 1)).
# `phrase_vectorizer` covers 2- and 3-token n-grams.
# use_idf / smooth_idf / sublinear_tf are passed as real booleans: scikit-learn
# releases that validate constructor parameters reject the integer 1 here.

print('getting tfidf')
char_vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_df=0.9, min_df=100,
                                  strip_accents='unicode', use_idf=True, smooth_idf=True,
                                  sublinear_tf=True, max_features=2000)
phrase_vectorizer = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', max_df=0.9, min_df=100,
                                    strip_accents='unicode', use_idf=True, smooth_idf=True,
                                    sublinear_tf=True, max_features=3000)
print('fitting char')
char_vectorizer.fit(text.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)
print('transforming train char')
train_char = char_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming test char')
test_char = char_vectorizer.transform(test['comment_text_cleaned'].values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test['comment_text_cleaned'].values)

# Side-by-side stack: unigram columns first, then the n-gram columns.
train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

# Target columns; one model is trained per entry.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

getting tfidf
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3351156 stored elements in Compressed Sparse Row format>

In [4]:
print('combine featrues')
# Extra per-comment columns read from the CSVs and fed to the model
# alongside the TF-IDF matrix.
other_feature_cols = [
    'word_count',
    'unique_word_count',
    'consecutive_question_marks',
    'consecutive_exclamation_marks',
    'uppercase_letters',
    'ellipsis',
    'period',
    'parentheses_paird',
    'cleaned_word_count',
    'cleaned_unique_word_count',
    'cleaned_consecutive_question_marks',
    'cleaned_consecutive_exclamation_marks',
    'cleaned_uppercase_letters',
    'cleaned_ellipsis',
    'cleaned_period',
    'cleaned_parentheses_pair',
]

print('getting train features')
# Two-element list: [TF-IDF matrix, extra-feature matrix] — one entry per model input.
train_features = [
    train_tfidf,
    csr_matrix(train[other_feature_cols].values),
]

print('gettingtest features')
test_features = [
    test_tfidf,
    csr_matrix(test[other_feature_cols].values),
]

train_features

combine featrues
getting train features
gettingtest features


[<159571x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 3351156 stored elements in Compressed Sparse Row format>,
 <159571x16 sparse matrix of type '<class 'numpy.int64'>'
 	with 1208018 stored elements in Compressed Sparse Row format>]

In [6]:
def pr(y_i, y, train_features):
    """Add-one-smoothed per-feature total over the rows whose label equals `y_i`.

    Sums `train_features` down axis 0 over the rows where ``y == y_i``, adds 1,
    and divides by (number of such rows + 1).
    """
    selected = (y == y_i)
    feature_totals = train_features[selected].sum(0)
    return (feature_totals + 1) / (selected.sum() + 1)

def get_nn_model(inp1_len, inp2_len):
    """Build and compile the two-input network.

    Each sparse input (widths `inp1_len` and `inp2_len`) goes through its own
    pair of 200-unit ReLU layers; the two branches are concatenated and passed
    through Dense(200) -> Dropout(0.2) -> Dense(100, L2 1e-4) -> Dense(2, sigmoid).
    The model is compiled with binary cross-entropy, Adam and an accuracy metric.
    """
    model_inputs = [
        Input(shape=(inp1_len,), sparse=True),
        Input(shape=(inp2_len,), sparse=True),
    ]

    # Same layer creation order as writing the two branches out by hand:
    # both layers of the first branch, then both layers of the second.
    branches = []
    for branch in model_inputs:
        for _ in range(2):
            branch = Dense(200, activation='relu')(branch)
        branches.append(branch)

    head = Dense(200, activation='relu')(concatenate(branches))
    head = Dropout(0.2)(head)
    head = Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.0001))(head)
    head = Dense(2, activation='sigmoid')(head)

    model = Model(inputs=model_inputs, outputs=head)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, file_path, batch_size, epochs, X_train, y):
    """Fit `model` on (X_train, y) with a 20% validation split and return it.

    The lowest-val_loss weights are checkpointed to `file_path`; training stops
    early if val_loss has not improved for 20 epochs.
    """
    callbacks_list = [
        ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=20),
    ]
    model.fit(
        X_train,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks_list,
    )
    return model

def predict(model, file_path, X_test):
    """Load the weights stored at `file_path` into `model`, then predict on `X_test`."""
    model.load_weights(file_path)
    predictions = model.predict(X_test, verbose=1)
    return predictions

def run(label_cols, train_features, train, test_features, train_len, file_path, batch_size, epochs):
    """Train one network per label on NB-ratio-weighted features and predict train and test.

    Parameters
    ----------
    label_cols : list of str
        Columns of `train` to fit, one model each.
    train_features, test_features : sequence of two sparse matrices
        Element 0 is reweighted by the per-label log-count ratio; element 1 is
        passed to the model unchanged.
    train : DataFrame
        Provides the label columns and the number of training rows.
    train_len : sequence of two ints
        Input widths handed to ``get_nn_model``.
    file_path : str
        Checkpoint file, reused (overwritten) for every label.
    batch_size, epochs : int
        Forwarded to ``train_model``.

    Returns
    -------
    (preds, preds_train) : tuple of ndarray
        Column ``i`` holds output 0 of the model for ``label_cols[i]`` on the
        test rows and on the training rows respectively.
    """
    # Size the test output from the matrix that was passed in, so the function
    # depends only on its arguments and not on a notebook-global frame.
    preds = np.zeros((test_features[0].shape[0], len(label_cols)))
    preds_train = np.zeros((train.shape[0], len(label_cols)))
    for i, j in enumerate(label_cols):
        print('fit', j)
        labels = train[j].values
        # Log ratio of the smoothed per-feature totals for positive vs. negative rows.
        r = np.log(pr(1, labels, train_features[0]) / pr(0, labels, train_features[0]))
        # Two target columns: the label and its complement.
        y = np.column_stack((labels, 1 - labels))
        model = get_nn_model(train_len[0], train_len[1])
        x_nb = [train_features[0].multiply(r).tocsr(), train_features[1]]
        model = train_model(model, file_path, batch_size, epochs, x_nb, y)
        x_test_nb = [test_features[0].multiply(r).tocsr(), test_features[1]]
        # predict() reloads the checkpointed best weights before scoring.
        preds[:, i] = predict(model, file_path, x_test_nb)[:, 0]
        preds_train[:, i] = predict(model, file_path, x_nb)[:, 0]
    return preds, preds_train

def save(model_name, y_test, label_cols, path, is_train=False):
    """Write predictions into a copy of the sample CSV and save it under `path`.

    Parameters
    ----------
    model_name : str
        Name of the output sub-directory and base of the output file name.
    y_test : array-like
        Predictions, one column per entry in `label_cols` (train or test rows,
        depending on `is_train`).
    label_cols : list of str
        Columns of the template CSV to overwrite.
    path : str
        Directory prefix (used by plain string concatenation, so it should end
        with a separator) containing ``sample_train.csv`` / ``sample_submission.csv``.
    is_train : bool, default False
        When true, use ``sample_train.csv`` as the template and prefix the
        output file name with ``train_``.
    """
    import os  # local import: only needed to make sure the output directory exists

    if is_train:
        submission = pd.read_csv(path + 'sample_train.csv')
        file_name = 'train_' + model_name
    else:
        submission = pd.read_csv(path + 'sample_submission.csv')
        file_name = model_name
    submission[label_cols] = y_test

    # Create the per-model directory first so a missing folder cannot make
    # to_csv fail and lose the predictions.
    out_dir = path + model_name
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(out_dir + '/' + file_name + '.csv', index=False)

print('done')

done


In [None]:
# Run configuration: checkpoint location and fit settings passed to run().
FILE_PATH = '../../model/nn_best.hdf5'
BATCH_SIZE = 32
EPOCHS = 2

print('predicting')
y_test, y_train = run(
    label_cols,
    train_features,
    train,
    test_features,
    train_len=[train_tfidf.shape[1], len(other_feature_cols)],
    file_path=FILE_PATH,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

print('saving files')
# Test predictions use the submission template, train predictions the train template.
save('nbnn', y_test, label_cols, PATH)
save('nbnn', y_train, label_cols, PATH, is_train=True)

print('done')

predicting
fit toxic
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2