In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold



In [2]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('F:/notebook_working/kaggle_compe/train.csv').fillna(' ')
test = pd.read_csv('F:/notebook_working/kaggle_compe/test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])



In [10]:
tox_train = train.loc[train.toxic==1,]
sev_train =train.loc[train.severe_toxic==1,]
obs_train = train.loc[train.obscene==1,]
threat_train = train.loc[train.threat==1,]
insult_train  = train.loc[train.insult==1,]
identity_train = train.loc[train.identity_hate==1,]
all_train = [tox_train, sev_train, obs_train, threat_train, insult_train, identity_train]

In [11]:
word_vectorizer = TfidfVectorizer(
                    sublinear_tf=True,
                    encoding='utf-8',
                    lowercase=True,
                    min_df=0.00009,
                    strip_accents='unicode',
                    analyzer='word',
                    token_pattern=r'\w{1,}',
                    stop_words='english',
                    ngram_range=(1, 1))
    

In [14]:
char_vectorizer = TfidfVectorizer(
                sublinear_tf=True,
                strip_accents='unicode',
                analyzer='char',
                stop_words='english',
                ngram_range=(1,1),                 
                min_df = 0.0001)
                #max_features=10000)

In [None]:


for x in all_train:
    train_text = x['comment_text']    
                        
    word_vectorizer.fit(train_text)
    train_word_features = word_vectorizer.transform(train_text)
    char_vectorizer.fit(all_text)
    train_char_features = char_vectorizer.transform(train_text)
    train_features = hstack([train_char_features, train_word_features])
#test_features = hstack([test_char_features, test_word_features])


    scores = []
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for class_name in class_names:
        train_target = x[class_name]
        X_train, X_valid, y_train, y_valid = train_test_split(train_features, train_target, test_size=0.3, random_state=42)

        #skf=StratifiedKFold(n_splits=3, shuffle=False)
    
        #for train_indices, valid_indices in skf.split(train_features, train_target):
         #   X_train, y_train = train_features[train_indices], train_target[train_indices]
         #   X_valid, y_valid = train_features[valid_indices], train_target[valid_indices]
    
        classifier = LogisticRegression()

        cv_score = np.mean(cross_val_score(classifier, X_valid, y_valid, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print('CV score for class {} is {}'.format(class_name, cv_score))

        classifier.fit(X_train, y_train)
    #submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        print('Total CV score is {}'.format(np.mean(scores)))




In [29]:
train_features = hstack([train_char_features, train_word_features])
#test_features = hstack([test_char_features, test_word_features])


scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    #X_train, X_valid, y_train, y_valid = train_test_split(train_word_features, train_target, test_size=0.3, random_state=42)

    skf=StratifiedKFold(n_splits=3, shuffle=False)
    #skf=StratifiedKFold(n_splits=3, shuffle=True)
    for train_indices, valid_indices in skf.split(train_word_features, train_target):
        X_train, y_train = train_word_features[train_indices], train_target[train_indices]
        X_valid, y_valid = train_word_features[valid_indices], train_target[valid_indices]
    
    classifier = LogisticRegression()

    cv_score = np.mean(cross_val_score(classifier, X_valid, y_valid, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(X_train, y_train)
    #submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

#submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.96063005064
CV score for class severe_toxic is 0.985030062298
CV score for class obscene is 0.981276336101
CV score for class threat is 0.975127399673
CV score for class insult is 0.971183023061
CV score for class identity_hate is 0.970082187264
Total CV score is 0.973888176506


In [13]:
#train_features = hstack([train_char_features, train_word_features])
#test_features = hstack([test_char_features, test_word_features])


scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    X_train, X_valid, y_train, y_valid = train_test_split(train_word_features, train_target, test_size=0.3, random_state=42)

    classifier = SVC()

    cv_score = np.mean(cross_val_score(classifier, X_valid, y_valid, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(X_train, y_train)
    #submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

#submission.to_csv('submission.csv', index=False)

CV score for class toxic is 0.953617374677
CV score for class severe_toxic is 0.878066506914
CV score for class obscene is 0.978812954813
CV score for class threat is 0.885887012223
CV score for class insult is 0.95942138282
CV score for class identity_hate is 0.871775257044
Total CV score is 0.921263414748


In [9]:
submission[class_name] = classifier.predict_proba(test_features)[:, 1]
submission.to_csv('submission.csv', index=False)