In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline  
import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

## Load Data

In [2]:
train_clean = joblib.load('data/train_clean.pckl')
test_clean = joblib.load('data/test_clean.pckl')

X_tfidf = joblib.load('data/X_tfidf.pckl')
X_test_tfidf = joblib.load('data/X_test_tfidf.pckl')

## Regularized Logistic Regression - Bagged Approach

In [36]:
final_c = {'toxic':20,
           'severe_toxic':2,
           'obscene':9,
           'threat':11,
           'insult':7,
           'identity_hate':9}

log_bagged = test_clean[['id']].copy()

for target in ['toxic','severe_toxic','obscene','threat','insult','identity_hate']:
        
    bag = BaggingClassifier(base_estimator=LogisticRegression(C=final_c[target]),
                           n_estimators=30,
                           max_features=1.0,
                           max_samples=1.0,
                           bootstrap=True,
                           n_jobs=2,
                           random_state=58999414)
    
    bag.fit(X_tfidf, train_clean[target])
    
    preds = bag.predict_proba(X_tfidf)
    fpr, tpr, thresholds = roc_curve(train_clean[target], preds[:,1], pos_label=1)
    roc_auc = auc(fpr, tpr)
    print(target.upper(),"  :  ", round(roc_auc,5))
    
    log_bagged[target] = bag.predict_proba(X_test_tfidf)[:,1]
    

TOXIC   :   0.99977
SEVERE_TOXIC   :   0.99706
OBSCENE   :   0.99946
THREAT   :   0.99993
INSULT   :   0.99859
IDENTITY_HATE   :   0.9995


In [37]:
log_bagged.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999996,0.243478,0.999899,0.036606,0.996166,0.432437
1,0000247867823ef7,0.001585,0.003528,0.002081,0.000667,0.004259,0.001441
2,00013b17ad220c46,0.006103,0.003418,0.004396,0.000444,0.0057,0.00113
3,00017563c3f7919a,0.000698,0.002163,0.001245,0.000393,0.001622,0.000455
4,00017695ad8997eb,0.007399,0.002157,0.002895,0.000325,0.005407,0.000668


In [40]:
log_bagged.to_csv('data/S6_bagged_log.csv', index=False)