Based on https://www.kaggle.com/nicapotato/tf-idf-xgboosts

In [None]:
import gc
import os
import pickle
import string

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import hstack
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier # <3

# Target label columns; one binary model is trained per entry.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
# Inputs are the cleaned text files; refer to the notebook "Preprocessing Text".
input_dir = '../input/'
train_file = 'train_cleaned.csv' #train.csv
test_file = 'test_cleaned.csv' #test.csv

# Missing cells are replaced by a single space so every comment is a string.
# (Append .sample(1000) to either read for a quick smoke run.)
train = pd.read_csv(input_dir + train_file)
train = train.fillna(' ')
test = pd.read_csv(input_dir + test_file)
test = test.fillna(' ')

# Keep the raw comments separately; `all_text` (train followed by test) is
# what the vectorizers are fitted on.
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

# From here on `train` holds only the label columns.
train = train[class_names]

### Feature building

In [None]:
# Word-level unigram TF-IDF, capped at the 15000 most frequent terms.
word_tfidf_params = dict(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=15000,
)

# The vocabulary and IDF weights are learned on train + test text together,
# then each split is transformed with the same fitted vectorizer.
word_vectorizer = TfidfVectorizer(**word_tfidf_params).fit(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(text) for text in (train_text, test_text)
)

In [None]:
# Word-level TF-IDF over unigrams and bigrams (top 10000 terms, float32).
# NOTE(review): the cell that stacks `train_features` / `test_features` only
# uses the char and word matrices, so these bigram matrices are currently not
# fed to the model.
bigram_vectorizer = TfidfVectorizer(
    max_features=10000,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    # Must be a real bool: scikit-learn's parameter validation rejects the
    # integer 1 that was used here previously.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    stop_words='english',
    dtype=np.float32)

# Fit on train + test text, then transform each split separately.
bigram_vectorizer.fit(all_text)
train_bigram_features = bigram_vectorizer.transform(train_text)
test_bigram_features = bigram_vectorizer.transform(test_text)

In [None]:
# Character-level TF-IDF over 2- to 6-grams, capped at 50000 features.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    # `stop_words` was dropped: it only applies to analyzer='word' and merely
    # triggered an "unused parameter" warning here.
    ngram_range=(2, 6),
    norm='l2',
    # An integer min_df must be >= 1 in recent scikit-learn. 1 keeps every
    # n-gram that occurs at all, which is what min_df=0 selected before.
    min_df=1,
    smooth_idf=False,
    max_features=50000)

# Fit on train + test text, then transform each split separately.
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [None]:
# On-disk cache for the char-level TF-IDF matrices.
# If both cache files exist they replace the in-memory matrices (as before);
# otherwise the matrices computed above are written out so later runs can
# reuse them. Delete the files after changing `char_vectorizer` settings,
# because a stale cache is loaded without any consistency check.
train_char_cache = '../models/train_char_vectors.pkl'
test_char_cache = '../models/test_char_vectors.pkl'

if os.path.exists(train_char_cache) and os.path.exists(test_char_cache):
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files
    # that were produced by this notebook.
    with open(train_char_cache, 'rb') as cache_file:
        train_char_features = pickle.load(cache_file)
    with open(test_char_cache, 'rb') as cache_file:
        test_char_features = pickle.load(cache_file)
else:
    os.makedirs(os.path.dirname(train_char_cache), exist_ok=True)
    with open(train_char_cache, 'wb') as cache_file:
        pickle.dump(train_char_features, cache_file)
    with open(test_char_cache, 'wb') as cache_file:
        pickle.dump(test_char_features, cache_file)

In [None]:
# Stack the char-level and word-level TF-IDF columns side by side.
# format='csr' is requested explicitly: hstack returns COO by default, which
# cannot be row-sliced and would have to be converted again on every
# train/validation split.
train_features = hstack([train_char_features, train_word_features], format='csr')
# The per-vectorizer matrices are dropped as soon as they are stacked to keep
# peak memory down; re-run the feature cells before re-running this one.
del train_char_features, train_word_features
test_features = hstack([test_char_features, test_word_features], format='csr')
del test_char_features, test_word_features

print(train_features.shape)
print(test_features.shape)

# The test matrix is only needed as an xgboost DMatrix from here on.
d_test = xgb.DMatrix(test_features)
del test_features
gc.collect()

In [None]:
## Indirect features

# Per-comment count statistics, one row per comment in `all_text` order
# (train comments first, then test comments).
# NOTE(review): nothing later in this notebook consumes `df`; the columns are
# built here but not added to the model inputs.
df = all_text.reset_index(drop=True).to_frame('comment_text')

comment_text = df['comment_text'].astype(str)
words = comment_text.str.split()
punctuation_chars = set(string.punctuation)
# scikit-learn's built-in English list, the same one the vectorizers use via
# stop_words='english'.
eng_stopwords = ENGLISH_STOP_WORDS

# Sentence count in each comment:
#   '\n' is used as the sentence separator
df['count_sent'] = comment_text.str.count('\n') + 1
# Word count in each comment
df['count_word'] = words.str.len()
# Unique word count
df['count_unique_word'] = words.apply(lambda tokens: len(set(tokens)))
# Letter count (length of the whole comment string)
df['count_letters'] = comment_text.str.len()
# Punctuation count
df['count_punctuations'] = comment_text.apply(
    lambda text: sum(ch in punctuation_chars for ch in text))
# Upper case words count
df['count_words_upper'] = words.apply(
    lambda tokens: sum(tok.isupper() for tok in tokens))
# Title case words count
df['count_words_title'] = words.apply(
    lambda tokens: sum(tok.istitle() for tok in tokens))
# Number of stopwords
df['count_stopwords'] = comment_text.str.lower().str.split().apply(
    lambda tokens: sum(tok in eng_stopwords for tok in tokens))
# Average length of the words; NaN for a comment without any word (the guard
# only avoids numpy's "mean of empty slice" warning).
df['mean_word_len'] = words.apply(
    lambda tokens: np.mean([len(tok) for tok in tokens]) if tokens else np.nan)

# Derived features. Comments with zero words yield NaN in the two ratios
# that divide by the word count.
# Share of unique words in each comment
df['word_unique_percent'] = df['count_unique_word'] / df['count_word']
# Punctuation marks per word in each comment
df['punct_percent'] = df['count_punctuations'] / df['count_word']
# Upper case words relative to the comment length in characters
df['upper_percent'] = df['count_words_upper'] / df['count_letters']

## Learning

In [None]:
print("Modeling")
cv_scores = []
xgb_preds = []
submission = pd.DataFrame.from_dict({'id': test['id']})

# Booster settings shared by every per-class model.
xgb_params = {'eta': 0.3,
              'max_depth': 5,
              'subsample': 0.8,
              'colsample_bytree': 0.8,
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'seed': 23
             }

# Split out a validation set once. The partition depends only on the number
# of rows and random_state, so it is the same split for every class; the
# label frame is split alongside the features and indexed per class below.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, train, test_size=0.25, random_state=23)

for class_name in class_names:
    d_train = xgb.DMatrix(X_train, y_train[class_name])
    d_valid = xgb.DMatrix(X_valid, y_valid[class_name])

    watchlist = [(d_valid, 'valid')]
    # `evals` is passed by keyword: newer xgboost releases no longer accept
    # it positionally.
    model = xgb.train(xgb_params, d_train, num_boost_round=200, evals=watchlist,
                      verbose_eval=False, early_stopping_rounds=30)
    print("class Name: {}".format(class_name))
    # best_iteration / best_score are set by early stopping; the former
    # 'best_msg' attribute is not available in newer xgboost releases.
    print("[{}]\tvalid-auc:{}".format(model.best_iteration, model.best_score))
    cv_scores.append(float(model.best_score))

    # Predict with the trees up to the best validation round only, instead of
    # also using the rounds trained after the score stopped improving.
    best_rounds = model.best_iteration + 1
    try:
        class_preds = model.predict(d_test, iteration_range=(0, best_rounds))
    except TypeError:
        # Older xgboost releases only know `ntree_limit`.
        class_preds = model.predict(d_test, ntree_limit=best_rounds)
    submission[class_name] = class_preds

    del d_train, d_valid, model
    gc.collect()

del X_train, X_valid, y_train, y_valid
gc.collect()
# Mean of the per-class hold-out AUCs (a single 75/25 split, not k-fold CV).
print('Total CV score is {}'.format(np.mean(cv_scores)))
submission.to_csv('submission.csv', index=False)

### Examples

In [None]:
def _load_submission(csv_path):
    """Read one published submission CSV into a DataFrame."""
    return pd.read_csv(csv_path)


# Each frame below is the output of a public Kaggle kernel; the source URL
# (and the score noted with it, where one was recorded) precedes each load.

# Source: https://www.kaggle.com/codename007/toxic-avenger-spin-lb-0-9832
ave = _load_submission("../input/toxic-avenger/submission.csv")

# Source: https://www.kaggle.com/tunguz/blend-of-blends-1/output
supbl = _load_submission('../input/blend-of-blends-1/superblend_1.csv')

# Source: https://www.kaggle.com/nuhsikander/blend-of-blends-can-be-toxic-lb-0-9837 (0.9837)
best = _load_submission('../input/toxic-hight-of-blending/hight_of_blending.csv')

# Source: https://www.kaggle.com/ogrellier/wordbatch-fm-ftrl-using-mse-lb-0-9804 (0.9805, FM FTRL)
wordbtch = _load_submission('../input/wordbatch-fm-ftrl-using-mse-lb-0-9804/lvl0_wordbatch_clean_sub.csv')

# Source: https://www.kaggle.com/hhstrand/oof-stacking-regime (0.9858)
oofs = _load_submission('../input/oof-stacking-regime/submission.csv')

# Source: https://www.kaggle.com/antmarakis/another-blend-tinkered-by-correlation (0.9855)
corrbl = _load_submission('../input/another-blend-tinkered-by-correlation/corr_blend.csv')

# Source: GRU pooling with R - https://www.kaggle.com/krosam/why-a-such-low-score-with-r-and-keras (0.9835)
rkera = _load_submission('../input/why-a-such-low-score-with-r-and-keras/submission.csv')

# Source: https://www.kaggle.com/ogrellier/lgbm-with-words-and-chars-n-gram (0.9792)
lgbm = _load_submission('../input/lgbm-with-words-and-chars-n-gram/lvl0_lgbm_clean_sub.csv')

# Source: https://www.kaggle.com/kailex/tidy-xgboost-glmnet-text2vec-lsa (0.9786)
tidy = _load_submission('../input/tidy-xgboost-glmnet-text2vec-lsa/tidy_xgb_glm.csv')

# Source: https://www.kaggle.com/eashish/bidirectional-gru-with-convolution
# NOTE(review): same URL as the next entry although the input folder differs - confirm the source kernel.
grucnn = _load_submission('../input/bi-gru-cnn-poolings/submission.csv')

# Source: https://www.kaggle.com/eashish/bidirectional-gru-with-convolution
bilst = _load_submission('../input/bidirectional-gru-with-convolution/submission.csv')

# Source: https://www.kaggle.com/prashantkikani/pooled-gru-glove-with-preprocessing (0.9823)
gruglo = _load_submission("../input/pooled-gru-glove-with-preprocessing/submission.csv")

# Source: https://www.kaggle.com/zhbain/pooled-gru-fasttext-6c07c9/code (0.9833)
fast = _load_submission('../input/pooled-gru-fasttext-6c07c9/submission.csv')