In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

In [2]:
# Directory holding the input CSV files (relative to the working directory).
PATH = '../../data/'

print('reading data')

# Read the train and test tables from the files named cleaned_*.csv.
train, test = (
    pd.read_csv(PATH + file_name)
    for file_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

print('data cleaning')

# Shared English stop-word set and tweet tokenizer used by the cleaning helpers.
stopword = set(stopwords.words("english"))
tok = TweetTokenizer()

def clean(comment):
    """Tokenize a comment, drop English stop words, and re-join the tokens.

    Uses the module-level tokenizer ``tok`` and stop-word set ``stopword``.

    Parameters
    ----------
    comment : str
        Comment text. ``None`` and float NaN (how ``pd.read_csv`` represents
        an empty cell) are treated as an empty comment; any other non-string
        value is converted with ``str()`` before tokenizing.

    Returns
    -------
    str
        Remaining tokens joined by single spaces, or ``'na'`` when no token
        is left.
    """
    if not isinstance(comment, str):
        # NaN is the only value that is not equal to itself.
        if comment is None or comment != comment:
            return 'na'
        comment = str(comment)

    kept = [w for w in tok.tokenize(comment) if w not in stopword]
    # An empty join result falls back to the 'na' placeholder.
    return ' '.join(kept) or 'na'

# Overwrite the comment column of both tables with its stop-word-free version.
train['comment_text_cleaned'] = train['comment_text_cleaned'].apply(clean)
test['comment_text_cleaned'] = test['comment_text_cleaned'].apply(clean)

print('calculations')

# Short aliases for the cleaned comment columns.
train_sentence, test_sentence = (
    train['comment_text_cleaned'],
    test['comment_text_cleaned'],
)

def f(x):
    """Return how many tokens the shared tokenizer ``tok`` yields for ``x``."""
    return len(tok.tokenize(x))

# Stack the train and test comment columns into one series.
text = pd.concat(objs=[train_sentence, test_sentence])

# Print the shape of each table on its own line.
print(train.shape, test.shape, sep='\n')

done


In [8]:
print('getting tfidf')
# TF-IDF over 1- to 3-grams. The boolean options are passed as real booleans:
# scikit-learn releases that validate constructor parameters reject the
# integer 1 for `use_idf`, `smooth_idf` and `sublinear_tf`.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.9,
    min_df=100,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

print('fitting')
# The vectorizer is fit on `text`, i.e. train and test comments combined.
tfidf_vectorizer.fit(text.values)
print('transforming train')
train_tfidf = tfidf_vectorizer.transform(train['comment_text_cleaned'].values)
print('transforming test')
test_tfidf = tfidf_vectorizer.transform(test['comment_text_cleaned'].values)

# Target columns; one binary model is trained per label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Last expression of the cell: display the train feature matrix summary.
train_tfidf

getting tfidf
fitting
transforming train


NameError: name 'transform' is not defined

In [10]:
def pr(y_i, y, x=None):
    """Smoothed per-column mean of the rows whose label equals ``y_i``.

    Parameters
    ----------
    y_i : scalar
        Label value selecting the rows to aggregate.
    y : numpy.ndarray
        One label per row of ``x``.
    x : matrix supporting boolean row indexing and ``.sum(0)``, optional
        Feature matrix. When omitted, the module-level ``train_tfidf`` is
        used, which is what the original two-argument call did.

    Returns
    -------
    Column sums of the selected rows plus one, divided by the number of
    selected rows plus one (add-one smoothing, so the result is never zero
    for non-negative features).
    """
    if x is None:
        # Backward-compatible fallback to the notebook-level matrix.
        x = train_tfidf
    mask = (y == y_i)
    p = x[mask].sum(0)
    return (p + 1) / (mask.sum() + 1)

def _smoothed_log_ratio(x, y):
    """Per-column log ratio of the smoothed means of rows with y == 1 vs y == 0."""
    pos = (y == 1)
    neg = (y == 0)
    p = (x[pos].sum(0) + 1) / (pos.sum() + 1)
    q = (x[neg].sum(0) + 1) / (neg.sum() + 1)
    return np.log(p / q)


def get_nblogreg_model(label_cols, train_tfidf, train, test_tfidf, test_len):
    """Fit one logistic regression per label on ratio-scaled features and predict.

    For every label the feature columns are multiplied by the log ratio of
    their smoothed means in positive vs negative training rows; a logistic
    regression is then fit on the scaled train matrix and applied to the
    equally scaled test matrix.

    Parameters
    ----------
    label_cols : list of str
        Names of the binary target columns in ``train``.
    train_tfidf : sparse matrix
        Training features; rows align with ``train``.
    train : pandas.DataFrame
        Table providing the label columns.
    test_tfidf : sparse matrix
        Test features with the same columns as ``train_tfidf``.
    test_len : int
        Number of rows in ``test_tfidf``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(test_len, len(label_cols))`` with the predicted
        probability of the positive class for each label.
    """
    preds = np.zeros((test_len, len(label_cols)))
    for i, j in enumerate(label_cols):
        print('fit', j)
        y = train[j].values
        # Computed from the matrix passed in, not from a notebook global.
        r = _smoothed_log_ratio(train_tfidf, y)
        # The dual formulation is only implemented by the liblinear solver;
        # the solver is named explicitly because the library default changed.
        model = LogisticRegression(C=4, dual=True, solver='liblinear')
        model.fit(train_tfidf.multiply(r), y)
        preds[:, i] = model.predict_proba(test_tfidf.multiply(r))[:, 1]
    return preds

def save(model_name, y_test, label_cols, sample_submission_file_path, path):
    """Write predictions into a copy of the sample submission and save it as CSV.

    Parameters
    ----------
    model_name : str
        Used both as the output sub-directory and as the file stem.
    y_test : array-like
        Predictions with one column per entry of ``label_cols`` and one row
        per row of the sample submission.
    label_cols : list of str
        Columns of the submission to overwrite.
    sample_submission_file_path : str
        CSV file used as the template for the submission.
    path : str
        Prefix of the output location; it is concatenated as-is, so it
        should end with a path separator.

    The file is written to ``path + model_name + '/' + model_name + '.csv'``;
    the target directory is created when it does not exist yet.
    """
    import os  # local import keeps the cell runnable on its own

    submission = pd.read_csv(sample_submission_file_path)
    submission[label_cols] = y_test

    out_file = path + model_name + '/' + model_name + '.csv'
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    submission.to_csv(out_file, index=False)
    
# Progress marker: printed once the definitions in this cell have been executed.
print('done')

In [13]:
# Template CSV that the predictions are written into.
sample_submission_file_path = PATH + 'sample_submission.csv'

print('predicting')
# One probability column per label, one row per test comment.
y_test = get_nblogreg_model(
    label_cols=label_cols,
    train_tfidf=train_tfidf,
    train=train,
    test_tfidf=test_tfidf,
    test_len=len(test),
)

print('saving files')
save(
    model_name='nblogreg',
    y_test=y_test,
    label_cols=label_cols,
    sample_submission_file_path=sample_submission_file_path,
    path=PATH,
)

print('done')

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
done
