# Experiment 02 

### General description 

Mix Jigsaw dataset and CTEC dataset for training. 

<ul>
    <li>Encoding: TF-IDF</li>
    <li>Models: logistic regression vs. multinomial naive Bayes</li>
    <li>Training set: 18386 jigsaw negative + 1614 jigsaw positive + 500 ctec negative + 10000 ctec positive</li>
    <li>Test set: 1401 ctec negative + 11600 ctec positive</li>
    <li>Metric: ROC AUC score</li>
</ul>

## Load data

In [1]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Read the two preprocessed corpora from disk
jigsaw_df = pd.read_csv('train_preproc_shrk.csv')
ctec_df = pd.read_csv('ctec_training_data_preproc.csv')

# Binarize the continuous jigsaw 'target' score:
#   target >= 0.5 --> toxic     --> label = 1
#   target <  0.5 --> non-toxic --> label = 0
# Rows matched by neither mask (e.g. a missing target) keep a missing label.
is_toxic = jigsaw_df['target'] >= 0.5
is_non_toxic = jigsaw_df['target'] < 0.5
jigsaw_df.loc[is_toxic, 'label'] = 1
jigsaw_df.loc[is_non_toxic, 'label'] = 0

# Partition each dataset into its negative (0) and positive (1) examples
jigsaw_neg = jigsaw_df[jigsaw_df['label'].eq(0)]
jigsaw_pos = jigsaw_df[jigsaw_df['label'].eq(1)]
ctec_neg = ctec_df[ctec_df['label'].eq(0)]
ctec_pos = ctec_df[ctec_df['label'].eq(1)]

# Report the class balance of each dataset
print(f'jigsaw # negative examples = {len(jigsaw_neg)}')
print(f'jigsaw # positive examples = {len(jigsaw_pos)}')
print(f'ctec # negative examples = {len(ctec_neg)}')
print(f'ctec # positive examples = {len(ctec_pos)}')

jigsaw # negative examples = 18386
jigsaw # positive examples = 1614
ctec # negative examples = 1901
ctec # positive examples = 21600


We choose how to combine the two datasets and how to split them into training and test sets based on the proportion of positive and negative examples in each dataset. 


In [3]:
# Create training set and test set based on the scheme described above

# Number of ctec examples of each class that go into the training set;
# every remaining ctec example is held out for testing.
N_CTEC_POS_TRAIN = 10000
N_CTEC_NEG_TRAIN = 500

# Fixed seed so that Restart & Run All reproduces the same split (and hence
# the same metrics). Change the value to draw a different split.
SEED = 42
rng = np.random.default_rng(SEED)

# Randomly sample positional indices (used with .iloc below), no replacement
indpos = rng.choice(ctec_pos.shape[0], size = N_CTEC_POS_TRAIN, replace = False)
indneg = rng.choice(ctec_neg.shape[0], size = N_CTEC_NEG_TRAIN, replace = False)
# Positions that were not sampled --> test set
notindpos = np.setdiff1d(np.arange(ctec_pos.shape[0]), indpos)
notindneg = np.setdiff1d(np.arange(ctec_neg.shape[0]), indneg)

# X_train in text format, and y_train
# All jigsaw examples plus sampled ctec examples.
# pd.concat is used because Series.append was removed in pandas 2.0.
X_train_text = pd.concat(
    [
        jigsaw_df['comment_text'],
        ctec_pos['comment_text'].iloc[indpos],
        ctec_neg['comment_text'].iloc[indneg],
    ],
    ignore_index = True
)

y_train = pd.concat(
    [
        jigsaw_df['label'],
        ctec_pos['label'].iloc[indpos],
        ctec_neg['label'].iloc[indneg],
    ],
    ignore_index = True
)

# X_test in text format, and y_test
# Only the ctec examples that were not sampled for training.
X_test_text = pd.concat(
    [
        ctec_neg['comment_text'].iloc[notindneg],
        ctec_pos['comment_text'].iloc[notindpos],
    ],
    ignore_index = True
)

y_test = pd.concat(
    [
        ctec_neg['label'].iloc[notindneg],
        ctec_pos['label'].iloc[notindpos],
    ],
    ignore_index = True
)

# Sanity checks: features and labels stay aligned, and no ctec example is
# both trained on and tested on.
assert len(X_train_text) == len(y_train)
assert len(X_test_text) == len(y_test)
assert len(indpos) + len(notindpos) == ctec_pos.shape[0]
assert len(indneg) + len(notindneg) == ctec_neg.shape[0]


### Vectorization and normalization

In [4]:
# Vectorization with tfidf
encoder = TfidfVectorizer(strip_accents = 'unicode', stop_words = 'english')
X_train_unscaled = encoder.fit_transform(X_train_text)
X_test_unscaled = encoder.transform(X_test_text)

# Normalization without 0-mean
scaler = StandardScaler(with_mean = False)
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

## Training and testing 

### Multinomial naive Bayes

In [5]:
# Model and hyperparameterization
clf = GridSearchCV(
    MultinomialNB(), 
    param_grid = {'alpha': [0.1, 1, 10]}, 
    scoring = 'roc_auc'
)

# Train 
%time clf.fit(X_train, y_train)
 
# Predict label
y_train_pred_class = clf.predict(X_train)
y_test_pred_class = clf.predict(X_test)
# Predict probability of being toxic
y_train_pred_prob = clf.predict_proba(X_train)[:,1]
y_test_pred_prob = clf.predict_proba(X_test)[:,1]

print('\nMultinomial naive Bayes')
print(f'best parameter = {clf.best_params_}')

# Store results
results = [
    [
        'accuracy', 
        metrics.accuracy_score(y_train, y_train_pred_class), 
        metrics.accuracy_score(y_test, y_test_pred_class)
    ], 
    [
        'confusion matrix', 
        str(metrics.confusion_matrix(y_train, y_train_pred_class).tolist()), 
        str(metrics.confusion_matrix(y_test, y_test_pred_class).tolist())
    ], 
    [
        'F1 score', 
        metrics.f1_score(y_train, y_train_pred_class), 
        metrics.f1_score(y_test, y_test_pred_class)
    ], 
    [
        'ROC AUC score', 
        metrics.roc_auc_score(y_train, y_train_pred_class), 
        metrics.roc_auc_score(y_test, y_test_pred_class)
    ]
]

colNames = ['metric', 'train set', 'test set']

# Show result 
pd.DataFrame(results, columns = colNames)

CPU times: user 190 ms, sys: 23.9 ms, total: 214 ms
Wall time: 214 ms

Multinomial naive Bayes
best parameter = {'alpha': 10}


Unnamed: 0,metric,train set,test set
0,accuracy,0.925836,0.642951
1,confusion matrix,"[[18739, 147], [2115, 9499]]","[[899, 502], [4140, 7460]]"
2,F1 score,0.893603,0.762703
3,ROC AUC score,0.905054,0.642394


### Logistic regression

In [6]:
# Model and hyperparameterization
clf = GridSearchCV(
    LogisticRegression(max_iter = 2000), 
    param_grid = {'C': [0.1, 1, 10]}, 
    scoring = 'roc_auc'
)

# Train 
%time clf.fit(X_train, y_train)
 
# Predict label
y_train_pred_class = clf.predict(X_train)
y_test_pred_class = clf.predict(X_test)
# Predict probability of being toxic
y_train_pred_prob = clf.predict_proba(X_train)[:,1]
y_test_pred_prob = clf.predict_proba(X_test)[:,1]

print('\nLogistic regression')
print(f'best parameter = {clf.best_params_}')

# Store results
results = [
    [
        'accuracy', 
        metrics.accuracy_score(y_train, y_train_pred_class), 
        metrics.accuracy_score(y_test, y_test_pred_class)
    ], 
    [
        'confusion matrix', 
        str(metrics.confusion_matrix(y_train, y_train_pred_class).tolist()), 
        str(metrics.confusion_matrix(y_test, y_test_pred_class).tolist())
    ], 
    [
        'F1 score', 
        metrics.f1_score(y_train, y_train_pred_class), 
        metrics.f1_score(y_test, y_test_pred_class)
    ], 
    [
        'ROC AUC score', 
        metrics.roc_auc_score(y_train, y_train_pred_class), 
        metrics.roc_auc_score(y_test, y_test_pred_class)
    ]
]

colNames = ['metric', 'train set', 'test set']

# Show result 
pd.DataFrame(results, columns = colNames)

CPU times: user 6min 33s, sys: 7.35 s, total: 6min 41s
Wall time: 34.1 s

Logistic regression
best parameter = {'C': 0.1}


Unnamed: 0,metric,train set,test set
0,accuracy,0.99718,0.766326
1,confusion matrix,"[[18861, 25], [61, 11553]]","[[740, 661], [2377, 9223]]"
2,F1 score,0.996292,0.858592
3,ROC AUC score,0.996712,0.66164
