In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from scipy.sparse import csr_matrix, hstack

In [2]:
# Directory, relative to the notebook, that holds the pre-cleaned CSV files.
PATH = '../../data/'

print('reading data')

# Read the train and test splits from the data directory.
train, test = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

print('data cleaning')

# Shared helpers for the cleaning step: the NLTK English stop-word list as a
# set (fast membership tests) and a tweet-aware tokenizer.
stopword = set(stopwords.words('english'))
tok = TweetTokenizer()

def clean(comment):
    """Remove English stop words from a comment and return the re-joined text.

    The comment is split with the module-level ``tok`` tokenizer, every token
    present in the module-level ``stopword`` set is dropped, and the remaining
    tokens are joined with single spaces.

    Parameters
    ----------
    comment : str
        Text to clean. A non-string value (for example the NaN that
        ``pandas.read_csv`` yields for an empty field) is treated as empty
        text instead of making the tokenizer raise.

    Returns
    -------
    str
        The filtered text, or the placeholder ``'na'`` when nothing is left.
    """
    if not isinstance(comment, str):
        return 'na'
    kept = [w for w in tok.tokenize(comment) if w not in stopword]
    # Never hand back an empty string; callers get the 'na' placeholder instead.
    return ' '.join(kept) or 'na'

# Strip stop words from the cleaned comment column of both splits (in place).
train['comment_text_cleaned'] = train['comment_text_cleaned'].apply(clean)
test['comment_text_cleaned'] = test['comment_text_cleaned'].apply(clean)

print('calculations')

# Short handles on the cleaned text column of each split.
train_sentence, test_sentence = train['comment_text_cleaned'], test['comment_text_cleaned']

def f(x):
    """Return how many tokens the module-level ``tok`` tokenizer splits ``x`` into."""
    return len(tok.tokenize(x))

# Train and test comments stacked into a single Series (train rows first).
text = pd.concat((train_sentence, test_sentence))

# Report the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, sep='\n')

reading data
data cleaning
calculations
(159571, 21)
(153164, 15)


In [3]:
# CountVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), analyzer='word', max_df=1.0,
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)

# TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None,
#                 token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,
#                 vocabulary=None, binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True,
#                 smooth_idf=True, sublinear_tf=False)

print('getting tfidf')
# NOTE: despite its name, char_vectorizer works on word unigrams (no analyzer
# is passed, so the word analyzer applies); phrase_vectorizer covers word
# 2-grams and 3-grams.
# The tf-idf switches are passed as real booleans: recent scikit-learn releases
# validate constructor arguments and reject integer stand-ins such as 1.
char_vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_df=0.9, min_df=100,
                                  strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)
phrase_vectorizer = TfidfVectorizer(ngram_range=(2, 3), stop_words='english', max_df=0.9, min_df=100,
                                    strip_accents='unicode', use_idf=True, smooth_idf=True, sublinear_tf=True)

# Vocabulary and idf weights are learned from train and test text together.
print('fitting char')
char_vectorizer.fit(text.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

# Project each split onto the fitted vocabularies.
print('transforming train char')
train_char = char_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming test char')
test_char = char_vectorizer.transform(test['comment_text_cleaned'].values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test['comment_text_cleaned'].values)

# Unigram and n-gram columns side by side, kept in CSR format.
train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

# Target columns, one binary label each.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Display the combined train matrix as the cell output.
train_tfidf

getting tfidf
fitting char
fitting phrase
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x11864 sparse matrix of type '<class 'numpy.float64'>'
	with 4052743 stored elements in Compressed Sparse Row format>

In [4]:
# Extra per-comment columns (beyond the text itself) used as features.
other_feature_cols = [
    'word_count', 'cleaned_word_count',
    'unique_word_count', 'cleaned_unique_word_count',
    'consecutive_question_marks', 'consecutive_exclamation_marks',
    'uppercase_letters', 'ellipsis', 'period',
    'parentheses_pair', 'special_symbol',
]

# Each feature set is a two-item list: [tf-idf matrix, sparse matrix of the
# extra columns]. The two parts are kept apart rather than stacked here.
print('getting train features')
train_features = [train_tfidf, csr_matrix(train[other_feature_cols].values)]

print('gettingtest features')
test_features = [test_tfidf, csr_matrix(test[other_feature_cols].values)]

# Display the pair of train matrices as the cell output.
train_features

getting train features
gettingtest features


[<159571x11864 sparse matrix of type '<class 'numpy.float64'>'
 	with 4052743 stored elements in Compressed Sparse Row format>,
 <159571x11 sparse matrix of type '<class 'numpy.int64'>'
 	with 1002198 stored elements in Compressed Sparse Row format>]

In [5]:
def pr(y_i, y, train_features):
    """Return the add-one smoothed per-feature total for rows labelled ``y_i``.

    Rows of ``train_features`` whose label in ``y`` equals ``y_i`` are summed
    column-wise; one is added to each column total and the result is divided
    by the number of selected rows plus one.
    """
    selected = y == y_i
    column_totals = train_features[selected].sum(0)
    return (column_totals + 1) / (selected.sum() + 1)

def get_nblogreg_model(label_cols, train_features, train, test_features):
    """Fit one ratio-weighted logistic regression per label and predict probabilities.

    For each label, a log ratio ``r`` is computed from the first feature
    matrix with ``pr`` (class 1 over class 0). That matrix is multiplied by
    ``r`` element-wise, the second feature matrix is appended unchanged, and a
    logistic regression is fitted on the result.

    Parameters
    ----------
    label_cols : sequence of str
        Names of the target columns in ``train``.
    train_features, test_features : sequence of two sparse matrices
        Item ``[0]`` is scaled by the log ratio; item ``[1]`` is stacked
        next to it as-is.
    train : pandas.DataFrame
        Provides the label columns and the number of training rows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(preds, train_preds)``: positive-class probabilities for the test
        and train rows, one column per entry of ``label_cols``.
    """
    n_labels = len(label_cols)
    # Size the test output from the features actually passed in, rather than
    # from a module-level ``test`` frame that is not an argument of this function.
    preds = np.zeros((test_features[0].shape[0], n_labels))
    train_preds = np.zeros((train.shape[0], n_labels))
    for i, j in enumerate(label_cols):
        print('fit', j)
        y = train[j].values
        r = np.log(pr(1, y, train_features[0]) / pr(0, y, train_features[0]))
        # The dual formulation is only implemented by the liblinear solver, so
        # it is named explicitly; relying on the default solver makes
        # dual=True fail on scikit-learn versions whose default is lbfgs.
        model = LogisticRegression(C=4, dual=True, solver='liblinear')
        x_nb = hstack((train_features[0].multiply(r), train_features[1]), format='csr')
        x_test_nb = hstack((test_features[0].multiply(r), test_features[1]), format='csr')
        model.fit(x_nb, y)
        preds[:, i] = model.predict_proba(x_test_nb)[:, 1]
        train_preds[:, i] = model.predict_proba(x_nb)[:, 1]
        # NOTE: the number printed is the log loss on the training rows (lower
        # is better), even though the message calls it "accuracy".
        print('accuracy is {}'.format(log_loss(y, train_preds[:, i])))
    return preds, train_preds

def save(model_name, y_test, label_cols, path, is_train=False):
    """Write predictions into a template CSV and store it under ``path``.

    The template (``sample_train.csv`` when ``is_train`` is true, otherwise
    ``sample_submission.csv``) is read from ``path``, its ``label_cols``
    columns are overwritten with ``y_test``, and the result is written to
    ``<path><model_name>/<file_name>.csv`` without the index. The output
    directory is created when it does not exist yet.

    Parameters
    ----------
    model_name : str
        Name of the output sub-directory and base of the file name.
    y_test : array-like
        Predictions to store; despite the name it is also used for train
        predictions when ``is_train`` is true.
    label_cols : list of str
        Columns of the template to overwrite.
    path : str
        Directory prefix; it is concatenated as-is, so it must end with a
        path separator.
    is_train : bool, default False
        Selects the train template and the ``train_`` file-name prefix.
    """
    import os  # local import: only needed here to create the output directory

    if is_train:
        template_name, file_name = 'sample_train.csv', 'train_' + model_name
    else:
        template_name, file_name = 'sample_submission.csv', model_name
    submission = pd.read_csv(path + template_name)
    submission[label_cols] = y_test

    out_dir = path + model_name
    # to_csv does not create missing directories, so make sure the model's
    # output folder exists before writing.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(out_dir + '/' + file_name + '.csv', index=False)
    
# Marker output: reaching this line means the definitions earlier in this cell ran.
print('done')

done


In [6]:
print('predicting')
# Fit one model per label; the call returns the test and train probability
# matrices, in that order.
y_test, y_train = get_nblogreg_model(
    label_cols=label_cols,
    train_features=train_features,
    train=train,
    test_features=test_features,
)

print('saving files')
model_name = 'nblogreg'
# The writes below are currently disabled, so nothing is saved even though
# the message above is printed; uncomment them to export the predictions.
# save(model_name, y_test, label_cols, PATH)
# save(model_name, y_train, label_cols, PATH, True)

print('done')

predicting
fit toxic
accuracy is 0.13888645107060046
fit severe_toxic
accuracy is 0.04269950960170049
fit obscene
accuracy is 0.09341086624555074
fit threat
accuracy is 0.011327501043671254
fit insult
accuracy is 0.10963884515065506
fit identity_hate
accuracy is 0.03959981595718401
saving files
done
