# Experiment 03 

### General description 

Mix the Jigsaw dataset and the CTEC dataset for training, and evaluate on held-out CTEC examples only. 

<ul>
    <li>Encoding: chars2vec for word embeddings. For sentence embedding, we *naively* sum all the word vectors in a sentence.</li>
    <li>Models: logistic regression vs. multilayer perceptrons</li>
    <li>Training set: 18386 jigsaw negative + 1614 jigsaw positive + 500 ctec negative + 10000 ctec positive</li>
    <li>Test set: 1401 ctec negative + 11600 ctec positive</li>
    <li>Metric: ROC AUC score</li>
</ul>

## Load data

In [7]:
import multiprocessing as mp
import time
import warnings

import chars2vec
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [8]:
# Read the preprocessed Jigsaw and CTEC datasets from disk
jigsaw_df = pd.read_csv('train_preproc_shrk.csv')
ctec_df = pd.read_csv('ctec_training_data_preproc.csv')

# Binarize the continuous Jigsaw `target` score:
#   target >= 0.5 --> toxic     --> label = 1
#   target <  0.5 --> non-toxic --> label = 0
is_toxic = jigsaw_df['target'] >= 0.5
is_nontoxic = jigsaw_df['target'] < 0.5
jigsaw_df.loc[is_toxic, 'label'] = 1
jigsaw_df.loc[is_nontoxic, 'label'] = 0

# Partition each dataset into its negative (label 0) and positive (label 1) rows
jigsaw_neg = jigsaw_df.loc[jigsaw_df['label'] == 0]
jigsaw_pos = jigsaw_df.loc[jigsaw_df['label'] == 1]
ctec_neg = ctec_df.loc[ctec_df['label'] == 0]
ctec_pos = ctec_df.loc[ctec_df['label'] == 1]

# Report the class balance of both datasets
print(f'jigsaw # negative examples = {jigsaw_neg.shape[0]}')
print(f'jigsaw # positive examples = {jigsaw_pos.shape[0]}')
print(f'ctec # negative examples = {ctec_neg.shape[0]}')
print(f'ctec # positive examples = {ctec_pos.shape[0]}')

jigsaw # negative examples = 18386
jigsaw # positive examples = 1614
ctec # negative examples = 1901
ctec # positive examples = 21600


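We determine the scheme of data combination and train-test split based on the proportion of positive and negative examples: the training set consists of all Jigsaw examples plus 10000 randomly sampled CTEC positive examples and 500 randomly sampled CTEC negative examples, and the test set consists of the remaining CTEC examples (11600 positive and 1401 negative). 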


In [9]:
# Create the training set and the test set.
#   training set = all jigsaw examples + sampled ctec positives + sampled ctec negatives
#   test set     = the ctec examples that were NOT sampled for training

# Number of ctec examples of each class moved into the training set
N_CTEC_POS_TRAIN = 10000
N_CTEC_NEG_TRAIN = 500

# A dedicated, seeded generator makes the split identical on every run.
# NOTE: the embeddings are written to 'train_embed.csv' / 'test_embed.csv' and
# later read back from those files; if the split changes (e.g. a different
# seed), those cached files must be regenerated.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Randomly sample positional indices (without replacement) for the training set;
# the complementary indices go to the test set
indpos = split_rng.choice(ctec_pos.shape[0], size = N_CTEC_POS_TRAIN, replace = False)
indneg = split_rng.choice(ctec_neg.shape[0], size = N_CTEC_NEG_TRAIN, replace = False)
notindpos = np.setdiff1d(np.arange(ctec_pos.shape[0]), indpos)
notindneg = np.setdiff1d(np.arange(ctec_neg.shape[0]), indneg)

# X_train in text format, and y_train.
# pd.concat replaces the chained Series.append calls (Series.append was
# removed in pandas 2.0); the row order is unchanged:
# jigsaw, then ctec positive, then ctec negative.
X_train_text = pd.concat(
    [
        jigsaw_df['comment_text'],
        ctec_pos['comment_text'].iloc[indpos],
        ctec_neg['comment_text'].iloc[indneg],
    ],
    ignore_index = True
)

y_train = pd.concat(
    [
        jigsaw_df['label'],
        ctec_pos['label'].iloc[indpos],
        ctec_neg['label'].iloc[indneg],
    ],
    ignore_index = True
)

# X_test in text format, and y_test: ctec negative, then ctec positive
X_test_text = pd.concat(
    [
        ctec_neg['comment_text'].iloc[notindneg],
        ctec_pos['comment_text'].iloc[notindpos],
    ],
    ignore_index = True
)

y_test = pd.concat(
    [
        ctec_neg['label'].iloc[notindneg],
        ctec_pos['label'].iloc[notindpos],
    ],
    ignore_index = True
)

# Sanity checks: texts and labels line up, and every ctec example is used exactly once
assert len(X_train_text) == len(y_train)
assert len(X_test_text) == len(y_test)
assert len(indpos) + len(notindpos) == ctec_pos.shape[0]
assert len(indneg) + len(notindneg) == ctec_neg.shape[0]


### Vectorization and normalization

In [4]:
# Compare vocabulary sizes of the two splits.
# The same CountVectorizer is fitted twice; each fit replaces the previous
# vocabulary, so the word list is captured right after each fit.
countvec = CountVectorizer()

# Distinct tokens seen in the training texts
vocab_train = list(countvec.fit(X_train_text).vocabulary_)

# Distinct tokens seen in the test texts
vocab_test = list(countvec.fit(X_test_text).vocabulary_)

print(len(vocab_train))
print(len(vocab_test))

67634
54086


In [6]:
# Dimensionality of the word / sentence embeddings
nDim = 300

# Pretrained chars2vec model whose name is built from the dimensionality ('eng_300')
c2v_model = chars2vec.load_model(f'eng_{nDim}')

# Word -> embedding lookup, filled lazily as words are encoded
cache = dict()

# Given a sentence, return its sentence embedding vector
# Most naive approach: add together 
def sent2vec(sent): 
    """Return the sentence embedding of `sent` as the sum of its word embeddings.

    The sentence is split on whitespace. Words whose embedding is already in
    the module-level `cache` are looked up there; the remaining words are
    encoded with `c2v_model.vectorize_words` (each distinct word only once)
    and stored in `cache` for later calls.

    Parameters
    ----------
    sent : str
        The text to encode.

    Returns
    -------
    numpy.ndarray
        Vector of length `nDim`. A sentence without any words yields the
        zero vector, and the model is not called for it.
    """
    words = sent.split()
    sentVec = np.zeros(nDim)

    # Distinct words that still need an embedding, in first-seen order.
    # This is computed without modifying `words`: removing items from a list
    # while iterating over it skips elements, which previously sent already
    # cached words back to the model.
    new_words = [word for word in dict.fromkeys(words) if word not in cache]

    # Only call the model when there is something new to encode
    if new_words:
        new_vecs = np.asarray(c2v_model.vectorize_words(new_words))
        for word, vec in zip(new_words, new_vecs):
            cache[word] = vec

    # Every word is cached at this point; sum the embeddings, counting
    # repeated words as many times as they occur
    for word in words:
        sentVec += cache[word]

    return sentVec

# Ignore warnings: this silences ALL Python warnings from here on
# (`warnings` is imported in the import cell at the top of the notebook)
warnings.filterwarnings("ignore")


def encode_texts(texts, split_name, log_every = 1000):
    """Encode every text in `texts` with `sent2vec` and report progress.

    Parameters
    ----------
    texts : iterable of str
        The texts to encode, in order.
    split_name : str
        Name of the split used in the progress messages (e.g. 'training', 'test').
    log_every : int
        A progress line is printed after every `log_every` encoded texts.

    Returns
    -------
    list of numpy.ndarray
        One sentence embedding per text, in the same order as `texts`.
    """
    embeddings = []
    startTime = time.time()

    for counter, text in enumerate(texts, start = 1):
        embeddings.append(sent2vec(text))
        if counter % log_every == 0:
            print(f'{counter} {split_name} texts have been encoded. Total time elapsed for {split_name} set = {time.time() - startTime}', 
                 flush = True)

    return embeddings


# Calculate sentence embedding for each text in training set 
X_train_unscaled = encode_texts(X_train_text, 'training')

# Calculate sentence embedding for each text in test set
# (the progress messages now say 'test' instead of 'training')
X_test_unscaled = encode_texts(X_test_text, 'test')

# Cache the embeddings on disk so that they do not have to be recomputed
np.savetxt('train_embed.csv', X_train_unscaled, fmt = '%.5f', delimiter=',')
np.savetxt('test_embed.csv', X_test_unscaled, fmt = '%.5f', delimiter=',')


1000 training texts have been encoded. Total time elapsed for training set = 93.65556788444519
2000 training texts have been encoded. Total time elapsed for training set = 165.1000828742981
3000 training texts have been encoded. Total time elapsed for training set = 229.66850686073303
4000 training texts have been encoded. Total time elapsed for training set = 292.52003812789917
5000 training texts have been encoded. Total time elapsed for training set = 356.16848373413086
6000 training texts have been encoded. Total time elapsed for training set = 429.3732364177704
7000 training texts have been encoded. Total time elapsed for training set = 497.3414192199707
8000 training texts have been encoded. Total time elapsed for training set = 569.1971561908722
9000 training texts have been encoded. Total time elapsed for training set = 636.2412085533142
10000 training texts have been encoded. Total time elapsed for training set = 709.0072994232178
11000 training texts have been encoded. Total 

In [10]:
# Read the sentence embeddings back from the CSV files cached on disk
# (instead of converting the in-memory lists with np.array)
X_train_unscaled = np.genfromtxt('train_embed.csv', delimiter = ',')
X_test_unscaled = np.genfromtxt('test_embed.csv', delimiter = ',')

# Normalization: the scaling statistics are fitted on the training set only
# and then applied to both the training set and the test set
scaler = StandardScaler().fit(X_train_unscaled)
X_train = scaler.transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

## Training and testing 

### Logistic regression

In [13]:
# Model and hyperparameterization
clf = GridSearchCV(
    LogisticRegression(max_iter = 2000), 
    param_grid = {'C': [0.1, 1, 10]}, 
    scoring = 'roc_auc'
)

# Train 
%time clf.fit(X_train, y_train)
 
# Predict label
y_train_pred_class = clf.predict(X_train)
y_test_pred_class = clf.predict(X_test)
# Predict probability of being toxic
y_train_pred_prob = clf.predict_proba(X_train)[:,1]
y_test_pred_prob = clf.predict_proba(X_test)[:,1]

print('\nLogistic regression')
print(f'best parameter = {clf.best_params_}')

# Store results
results = [
    [
        'accuracy', 
        metrics.accuracy_score(y_train, y_train_pred_class), 
        metrics.accuracy_score(y_test, y_test_pred_class)
    ], 
    [
        'confusion matrix', 
        str(metrics.confusion_matrix(y_train, y_train_pred_class).tolist()), 
        str(metrics.confusion_matrix(y_test, y_test_pred_class).tolist())
    ], 
    [
        'F1 score', 
        metrics.f1_score(y_train, y_train_pred_class), 
        metrics.f1_score(y_test, y_test_pred_class)
    ], 
    [
        'ROC AUC score', 
        metrics.roc_auc_score(y_train, y_train_pred_class), 
        metrics.roc_auc_score(y_test, y_test_pred_class)
    ]
]

colNames = ['metric', 'train set', 'test set']

# Show result 
pd.DataFrame(results, columns = colNames)

CPU times: user 8min 42s, sys: 1min 53s, total: 10min 36s
Wall time: 2min 41s

Logistic regression
best parameter = {'C': 1}


Unnamed: 0,metric,train set,test set
0,accuracy,0.720295,0.457734
1,confusion matrix,"[[17291, 1595], [6936, 4678]]","[[946, 455], [6595, 5005]]"
2,F1 score,0.523061,0.586753
3,ROC AUC score,0.659168,0.553349


### Multilayer perceptron 

In [16]:
clf = MLPClassifier(hidden_layer_sizes = (50, 10))

# Train 
%time clf.fit(X_train, y_train)
 
# Predict label
y_train_pred_class = clf.predict(X_train)
y_test_pred_class = clf.predict(X_test)
# Predict probability of being toxic
y_train_pred_prob = clf.predict_proba(X_train)[:,1]
y_test_pred_prob = clf.predict_proba(X_test)[:,1]

print('\nMultilayer Perceptrons')
# print(f'best parameter = {clf.best_params_}')

# Store results
results = [
    [
        'accuracy', 
        metrics.accuracy_score(y_train, y_train_pred_class), 
        metrics.accuracy_score(y_test, y_test_pred_class)
    ], 
    [
        'confusion matrix', 
        str(metrics.confusion_matrix(y_train, y_train_pred_class).tolist()), 
        str(metrics.confusion_matrix(y_test, y_test_pred_class).tolist())
    ], 
    [
        'F1 score', 
        metrics.f1_score(y_train, y_train_pred_class), 
        metrics.f1_score(y_test, y_test_pred_class)
    ], 
    [
        'ROC AUC score', 
        metrics.roc_auc_score(y_train, y_train_pred_class), 
        metrics.roc_auc_score(y_test, y_test_pred_class)
    ]
]

colNames = ['metric', 'train set', 'test set']

# Show result 
pd.DataFrame(results, columns = colNames)

CPU times: user 3min 21s, sys: 2min 40s, total: 6min 2s
Wall time: 1min 32s

Multilayer Perceptrons


Unnamed: 0,metric,train set,test set
0,accuracy,0.839279,0.664718
1,confusion matrix,"[[16189, 2697], [2205, 9409]]","[[636, 765], [3594, 8006]]"
2,F1 score,0.793339,0.786019
3,ROC AUC score,0.833669,0.572067


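Next, we allow more training iterations (`max_iter = 500`) and select the L2 regularization strength `alpha` by grid search. 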

In [11]:
clf = GridSearchCV(
    MLPClassifier(hidden_layer_sizes = (50, 10), max_iter = 500),
    param_grid = {'alpha': [0.01, 0.1, 1, 10]}, 
    scoring = 'roc_auc'
)


# Train 
%time clf.fit(X_train, y_train)
 
# Predict label
y_train_pred_class = clf.predict(X_train)
y_test_pred_class = clf.predict(X_test)
# Predict probability of being toxic
y_train_pred_prob = clf.predict_proba(X_train)[:,1]
y_test_pred_prob = clf.predict_proba(X_test)[:,1]

print('\nMultilayer Perceptrons')
print(f'best parameter = {clf.best_params_}')

# Store results
results = [
    [
        'accuracy', 
        metrics.accuracy_score(y_train, y_train_pred_class), 
        metrics.accuracy_score(y_test, y_test_pred_class)
    ], 
    [
        'confusion matrix', 
        str(metrics.confusion_matrix(y_train, y_train_pred_class).tolist()), 
        str(metrics.confusion_matrix(y_test, y_test_pred_class).tolist())
    ], 
    [
        'F1 score', 
        metrics.f1_score(y_train, y_train_pred_class), 
        metrics.f1_score(y_test, y_test_pred_class)
    ], 
    [
        'ROC AUC score', 
        metrics.roc_auc_score(y_train, y_train_pred_class), 
        metrics.roc_auc_score(y_test, y_test_pred_class)
    ]
]

colNames = ['metric', 'train set', 'test set']

# Show result 
pd.DataFrame(results, columns = colNames)

CPU times: user 29min 41s, sys: 20min 47s, total: 50min 28s
Wall time: 12min 53s

Multilayer Perceptrons
best parameter = {'alpha': 1}


Unnamed: 0,metric,train set,test set
0,accuracy,0.734721,0.610415
1,confusion matrix,"[[15588, 3298], [4793, 6821]]","[[682, 719], [4346, 7254]]"
2,F1 score,0.627709,0.741225
3,ROC AUC score,0.706341,0.55607
