In [1]:
import pandas as pd
import numpy as np
import re, string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

# read data

path = '../data/'

train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

print(train.head())
print(train.shape[0])

         id                                       comment_text  toxic  \
0  22256635  Nonsense?  kiss off, geek. what I said is true...      1   
1  27450690  "\n\n Please do not vandalize pages, as you di...      0   
2  54037174  "\n\n ""Points of interest"" \n\nI removed the...      0   
3  77493077  Asking some his nationality is a Racial offenc...      0   
4  79357270  The reader here is not going by my say so for ...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
95851


In [2]:
# get tfidf

import re, string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def tokenize(text):
    text = text.lower().split()
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(f'([{string.punctuation}¨«»®´·º½¾¿¡§£₤‘’\d+])', " ", text)
    return text.split()

# CountVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), analyzer=’word’, max_df=1.0,\
#                 min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>)

# TfidfVectorizer(input=’content’, encoding=’utf-8’, decode_error=’strict’, strip_accents=None,\
#                 lowercase=True, preprocessor=None, tokenizer=None, analyzer=’word’, stop_words=None,\
#                 token_pattern=’(?u)\b\w\w+\b’, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None,\
#                 vocabulary=None, binary=False, dtype=<class ‘numpy.int64’>, norm=’l2’, use_idf=True,\
#                 smooth_idf=True, sublinear_tf=False)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,4), stop_words='english', tokenizer=tokenize,\
                                   max_df=0.9, min_df=3, strip_accents='unicode', use_idf=1,\
                                   smooth_idf=1, sublinear_tf=1)

train_tfidf_original = tfidf_vectorizer.fit_transform(train['comment_text'])
tfidf_name = tfidf_vectorizer.get_feature_names()
test_tfidf_original = tfidf_vectorizer.transform(test['comment_text'])

train_tfidf_original

<95851x204277 sparse matrix of type '<class 'numpy.float64'>'
	with 3754712 stored elements in Compressed Sparse Row format>

In [3]:
# Embedding

label = train[label_cols]

# num = 10000
# selector = SelectKBest(chi2, num)
# train_tfidf = selector.fit_transform(train_tfidf_original, label)
# test_tfidf = selector.transform(test_tfidf_original)
# filtered_name = selector.transform([tfidf_name])[0]

train_tfidf = train_tfidf_original
test_tfidf = test_tfidf_original
# label = label.iloc[ : 500]

print('done')

done


In [4]:
def pr(y_i, y):
    p = train_tfidf[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)

def get_model(y):
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    m = LogisticRegression(C=4, dual=True)
    x_nb = train_tfidf.multiply(r)
    return m.fit(x_nb, y), r

preds = np.zeros((len(test), len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_model(train[j])
    preds[:, i] = m.predict_proba(test_tfidf.multiply(r))[:, 1]
    
print(preds.shape)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
(226998, 6)


In [5]:
label_cols_ini = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
file_name = 'baseline.csv'

sample_submission = pd.read_csv(path + 'sample_submission.csv')

sample_submission[label_cols_ini] = preds[:, : len(label_cols_ini)]

sample_submission.to_csv(path + file_name, index=False)

print('done')

done
