In [1]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import svm

In [2]:
# Load the training and test sets; both CSVs are read from the working directory.
train, test = pd.read_csv("train.csv"), pd.read_csv("test.csv")
IDs = test['id']
X_train = train['comment_text']
# fillna returns a new Series, so the result does not depend on whether the
# column selection handed back a view or a copy of `test` (the original
# `.loc` assignment on the selected Series is the chained-assignment pattern).
X_test = test['comment_text'].fillna(" ")  # replace the 1 NaN value in test
# Every column from the third one onward is used as a label column.
Y_train = train[train.columns[2:]]

In [None]:
# The raw frames are not used again below; only the extracted columns are.
del train
del test

# Share of rows whose label columns sum to zero, i.e. no label is set.
unflagged_share = (Y_train.sum(axis=1) == 0).mean()
print("%.2f of data is not flagged" % unflagged_share)

# Word-level TF-IDF over unigrams and bigrams.
# The boolean options are passed as real booleans: scikit-learn documents them
# as bool, and releases that validate parameter types reject the ints 1/0.
tfv = TfidfVectorizer(
    min_df=3, max_df=0.9, max_features=None, strip_accents='unicode',
    analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1, 2),
    use_idf=True, smooth_idf=True, sublinear_tf=True, stop_words='english',
)
print("tfidf-vectorizing train ...")
# fit_transform learns the vocabulary and builds the matrix in one pass over
# the corpus instead of tokenising the training text twice.
X_train = tfv.fit_transform(X_train)
print("tfidf-vectorizing test ...")
# The test set is only transformed, so it uses the vocabulary/IDF learned on train.
X_test = tfv.transform(X_test)

In [None]:
# Train one binary linear SVM per label column, print its 3-fold CV F1 scores,
# and write the hard 0/1 test predictions to submission.csv.
print("fitting linear SVM & reporting cv f1 ...")

# Derived from Y_train so that each prediction column is written under the
# label it was trained on, rather than relying on a separately typed list.
list_classes = list(Y_train.columns)
n, p = X_test.shape[0], len(list_classes)
labels_predicted = np.zeros((n, p))

for i, feature in enumerate(list_classes):
    y = Y_train.iloc[:, i]
    print("\n%s:" % feature)
    # Fraction of positive rows for this label.
    print("Baseline: %.2f" % (y.sum() / Y_train.shape[0]))
    # Fixed seed so the fit and the CV scores are reproducible across runs.
    clf = svm.LinearSVC(random_state=0)
    clf.fit(X_train, y)
    labels_predicted[:, i] = clf.predict(X_test)
    # cross_val_score fits clones of clf, so the fitted model above is untouched.
    print(cross_val_score(clf, X_train, y, cv=3, scoring='f1'))

# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv — confirm, since predictions are assigned positionally.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission[list_classes] = labels_predicted
sample_submission.to_csv('submission.csv', index=False)