# Toxic Comment Classification:
- Baseline Naive Bayes method
- [Kaggle Link](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
- Final project for CS 7650, Spring 2021 at Georgia Tech taught by Alan Ritter
  - Due 05/05/2021
- By Justin Chen

## Libraries

Mount Google Drive to access the data.

In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab VM; force_remount=True requests a fresh
# mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)
# Use an absolute path rooted at the mount point above. A relative path only
# resolves on the first run: once the working directory has been changed,
# re-running this cell would look for "drive/..." inside the project folder.
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS7650/final")
# Show the project folder's contents as the cell output.
os.listdir()

Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Seed NumPy's global random number generator so that any NumPy-based
# randomness in later cells is repeatable between runs.
np.random.seed(0)

## Read in Data

In [None]:
# Load the train and test splits (in that order) from their CSV files.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/clean/train_clean_stop_stem.csv',
                     'data/clean/test_clean_stop_stem.csv')
)
# Preview the first rows of the training frame as the cell output.
df_train.head()

In [None]:
# Report how many rows each split contains.
print(f'{len(df_train)} rows in train')
print(f'{len(df_test)} rows in test')

In [None]:
# No validation split is carved out here: the train and test frames are used
# as loaded.
# Label columns to predict.
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Column holding the comment text that serves as model input.
TEXT_COLUMN = 'comment_text'
X_train, y_train = df_train[TEXT_COLUMN], df_train[classes]
X_test, y_test = df_test[TEXT_COLUMN], df_test[classes]

## Naive Bayes
- Try either a TF-IDF or a count vectorizer (the code below uses `CountVectorizer`)

In [None]:
class NaiveBayes():
    """Bag-of-words Naive Bayes baseline for comment classification.

    Wraps a scikit-learn ``Pipeline`` that converts raw text to token counts
    with ``CountVectorizer`` and classifies them with a ``MultinomialNB``
    wrapped in ``OneVsRestClassifier``.
    """

    def __init__(self):
        self.pipeline = Pipeline([('vect', CountVectorizer()),
                                  ('clf', OneVsRestClassifier(MultinomialNB()))
                                  ])

    def fit(self, x, y):
        """Fit the whole pipeline on texts ``x`` with targets ``y``.

        Returns:
            ``self``, following the scikit-learn estimator convention so
            that calls can be chained.
        """
        self.pipeline.fit(x, y)
        return self

    def predict(self, x):
        """Return the fitted pipeline's predictions for texts ``x``."""
        return self.pipeline.predict(x)

    def eval(self, train, train_label, test, test_label):
        """Train and score one model per label, then print overall metrics.

        The label names are taken from ``test_label.columns`` rather than
        from a notebook-level global, so the columns of the prediction matrix
        always line up with the columns of ``test_label`` when the overall
        metrics are computed.

        The pipeline is refit for every label, so after this call it holds
        the model for the last label only.

        Args:
            train: Training texts.
            train_label: DataFrame of training targets; must contain every
                column present in ``test_label``.
            test: Test texts (must expose ``.shape``).
            test_label: DataFrame of test targets, one column per label.
        """
        label_names = list(test_label.columns)
        all_preds = np.zeros(shape=(test.shape[0], len(label_names)))

        for i, c in enumerate(label_names):
            self.fit(train, train_label[c])
            pred = self.predict(test)
            truth = test_label[c]

            acc = accuracy_score(truth, pred)
            rec = recall_score(truth, pred)
            prec = precision_score(truth, pred)
            f1 = f1_score(truth, pred)
            print(f'{c} label')
            print(f'Accuracy: {acc} Recall {rec} Precision {prec} F1 {f1}')
            print('-----------------------')

            # Column i of the prediction matrix corresponds to label_names[i],
            # i.e. to column i of test_label.
            all_preds[:, i] = pred

        # Overall scores over the full label matrix; recall, precision and F1
        # are micro-averaged across all labels.
        total_acc = accuracy_score(test_label, all_preds)
        total_rec = recall_score(test_label, all_preds, average='micro')
        total_prec = precision_score(test_label, all_preds, average='micro')
        total_f1 = f1_score(test_label, all_preds, average='micro')
        print('Total')
        print(f'Accuracy: {total_acc} Recall {total_rec} Precision {total_prec} F1 {total_f1}')

In [None]:
# Build the baseline model, then train and score it on the loaded splits.
NB = NaiveBayes()
NB.eval(
    train=X_train,
    train_label=y_train,
    test=X_test,
    test_label=y_test,
)