In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import sys
path = '/content/gdrive/Team Drives/cs273p project'
sys.path.append(path)

In [0]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt


import re
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('wordnet') 

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import data_loader


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
import pickle

def saveobj(save_list, filename):
    with open(path + '/' + filename, 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump(save_list, f)
        
# restore object        
def loadobj(filename):
    with open(path + '/' + filename, 'rb') as f:  # Python 3: open(..., 'rb')
        li = pickle.load(f)
    return li

In [0]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [0]:
train_df, valid_df, test_df = loadobj('data/filtered_comment_pickle')

## Vectorized and Transform 

In [0]:
train_txt = train_df.filt_comment
valid_txt = valid_df.filt_comment
test_txt = test_df.filt_comment

In [0]:
word_vect = TfidfVectorizer(analyzer = 'word',
                            token_pattern = '\w+',
                            stop_words = 'english',
                            lowercase = True,
                            strip_accents='unicode',
                            sublinear_tf=True,
                            max_features = 10000)
                            # sublinear_tf=True
  

train_wrd = word_vect.fit_transform(train_txt)
valid_wrd = word_vect.transform(valid_txt)
test_wrd = word_vect.transform(test_txt)

In [0]:
char_vect = TfidfVectorizer(analyzer = 'char',
                            stop_words = 'english',
                            lowercase = True,
                            ngram_range = (3, 6),
                            strip_accents='unicode',
                            sublinear_tf=True,
                            max_features = 10000)
  

train_char = char_vect.fit_transform(train_txt)
valid_char = char_vect.transform(valid_txt)
test_char = char_vect.transform(test_txt)

In [0]:
train_X = hstack([train_wrd, train_char])
valid_X = hstack([valid_wrd, valid_char])
test_X = hstack([test_wrd, test_char])


## Model
Other options might include feature design, or optimizing your models to deal with special aspects of the data (missing features, too many features, large numbers of zeros in the data; possible outlier data; etc.). Your report should describe what aspects you chose to focus on.

In [0]:
def train_and_prediction(classifier, classifier_name):
  train_scores = np.zeros(len(classes))
  valid_scores = np.zeros(len(classes))
  test_scores  = np.zeros(len(classes))
  
  valid_predict = pd.DataFrame.from_dict({'id': valid_df['id']})
  test_predict = pd.DataFrame.from_dict({'id': test_df['id']})
  
  bin_valid_predict = pd.DataFrame.from_dict({'id': valid_df['id']})
  bin_test_predict = pd.DataFrame.from_dict({'id': test_df['id']})

  for cls_ind, clas in enumerate(classes):  
    
    classifier.fit(train_X, train_df[clas])
    train_pred_Y = classifier.predict_proba(train_X)[:,1]
    valid_predict[clas] = classifier.predict_proba(valid_X)[:,1]
    test_predict[clas] = classifier.predict_proba(test_X)[:,1]
    
    
    bin_valid_predict[clas] = classifier.predict(valid_X)
    bin_test_predict[clas] = classifier.predict(test_X)

    train_scores[cls_ind] = roc_auc_score(train_df[clas], train_pred_Y)
    valid_scores[cls_ind] = roc_auc_score(valid_df[clas], valid_predict[clas])
    test_scores[cls_ind]  = roc_auc_score(test_df[clas], test_predict[clas])

    print("{}: train_score {}, valid_score {}, test_scores {}".format(clas,\
                                                                    train_scores[cls_ind], valid_scores[cls_ind],
                                                                    test_scores[cls_ind]) )
  print("\navg: train_score {}, valid_score {}, test_scores {}".format(clas,\
                                                                  np.mean(train_scores[cls_ind]), np.mean(valid_scores[cls_ind]),
                                                                  np.mean(test_scores[cls_ind])) )
  
  test_predict.to_csv(path + "/output/{}/prob_{}_test_submission.csv".format(classifier_name, classifier_name))
  valid_predict.to_csv(path + "/output/{}/prob_{}_valid_submission.csv".format(classifier_name, classifier_name))
  
  bin_test_predict.to_csv(path + "/output/{}/bin_{}_test_submission.csv".format(classifier_name, classifier_name))
  bin_valid_predict.to_csv(path + "/output/{}/bin_{}_valid_submission.csv".format(classifier_name, classifier_name))
  

In [0]:
############ tune optimize parameter ############

classifier = RandomForestClassifier(n_estimators = 1000, max_leaf_nodes = 18, random_state = 1)
train_and_prediction(classifier, 'random_forest')

toxic: train_score 0.9111013634031732, valid_score 0.9050353211678859, test_scores 0.9069242026503135
severe_toxic: train_score 0.9845275331325715, valid_score 0.9776537323382598, test_scores 0.9714522923883788
obscene: train_score 0.9667238392675068, valid_score 0.9620937635965685, test_scores 0.9483149141609131
threat: train_score 0.9924153919129415, valid_score 0.9496556994195832, test_scores 0.9820722094217864
insult: train_score 0.9506224948170006, valid_score 0.9454283138265871, test_scores 0.9351841806290937
identity_hate: train_score 0.9720848039015502, valid_score 0.9621022707888325, test_scores 0.9607362502251063

avg: train_score identity_hate, valid_score 0.9720848039015502, test_scores 0.9621022707888325


In [0]:
############ tune optimize parameter ############

classifier = RandomForestClassifier(n_estimators = 1000, max_leaf_nodes = 18, min_samples_leaf = 10, random_state = 1)
train_and_prediction(classifier, 'random_forest')

toxic: train_score 0.9111167166793032, valid_score 0.9050749765689515, test_scores 0.9069643940714237
severe_toxic: train_score 0.9835744701695708, valid_score 0.978693112531096, test_scores 0.9718637896029927
obscene: train_score 0.9668702692190708, valid_score 0.9622893050845769, test_scores 0.9483168286014468
threat: train_score 0.9897385707859696, valid_score 0.960655859692505, test_scores 0.9825219361631806
insult: train_score 0.950542093385299, valid_score 0.9453642235875306, test_scores 0.9350703056533981
identity_hate: train_score 0.9695329853667047, valid_score 0.9650859183267471, test_scores 0.9634281015913903

avg: train_score identity_hate, valid_score 0.9695329853667047, test_scores 0.9650859183267471


In [16]:
############ tune optimize parameter ############

classifier = RandomForestClassifier(n_estimators = 1000, max_leaf_nodes = 30, min_samples_leaf = 10, random_state = 1)
train_and_prediction(classifier, 'random_forest')

toxic: train_score 0.9218616967534017, valid_score 0.9149499143926261, test_scores 0.9159759154419475
severe_toxic: train_score 0.9858506095802039, valid_score 0.9796916410581598, test_scores 0.9732656173077189
obscene: train_score 0.9721846897174534, valid_score 0.9667255643717596, test_scores 0.9532456164243934
threat: train_score 0.9929038111307967, valid_score 0.9626106169362686, test_scores 0.9842052713087495
insult: train_score 0.9563928882321657, valid_score 0.9505859381405977, test_scores 0.9406956258424334
identity_hate: train_score 0.9754586975757326, valid_score 0.9674359727581548, test_scores 0.9663518412715778

avg: train_score identity_hate, valid_score 0.9754586975757326, test_scores 0.9674359727581548
