In [31]:
from sklearn.metrics import precision_score, recall_score, auc, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from pre_processing.textProcessing import TextPreProcessor
import random
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
import nltk
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [12]:
train = pd.read_csv('../data/train.csv')
train = train.sample(1000, random_state=923).reset_index(drop=True)
labels = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [16]:
# Createa flag indicating whether a comment is neutral 
neutral_flag = labels.sum(axis=1) ==0
np.mean(neutral_flag)

0.879

In [17]:
random.seed(923)

X_train, X_test, y_train, y_test = train_test_split(train.comment_text, neutral_flag, test_size = 0.25,random_state = 23)

X_train.reset_index(drop = True,inplace = True)
X_test.reset_index(drop = True,inplace = True)
y_train.reset_index(drop = True,inplace = True)
y_test.reset_index(drop = True,inplace = True)

In [21]:
def tokenize(text):
    """
    Define a function that take in a text and process the doc
    """
    return TextPreProcessor(text=text, lemma_flag=True, stem_flag=False).process()

vec_count = CountVectorizer(ngram_range=(1,1),tokenizer=tokenize,min_df=15, max_df=0.9)
vec_count_f = vec_count.fit(X_train)

  "The parameter 'token_pattern' will not be used"


In [24]:
X_train_processed = vec_count_f.transform(X_train)

## Modeling

In [26]:
rf = RandomForestClassifier()

In [27]:
rf_f = rf.fit(X_train_processed, y_train)

## Process test data

In [29]:
X_test_processed = vec_count_f.transform(X_test)

## Scoring

In [30]:
pred = rf_f.predict(X_test_processed)

In [32]:
accuracy_score(y_test, pred)
precision_score(y_test,pred)
recall_score(y_test,pred)

0.9723502304147466

In [34]:
def evaluate_classifier(y_true, y_pred):
    accr=accuracy_score(y_true, y_pred)
    precision=precision_score(y_true,y_pred)
    recall=recall_score(y_true,y_pred)
    print(f"Accuracy: {accr}, Precision: {precision}, Recall: {recall}")
    return (accr, precision, recall)

In [35]:
evaluate_classifier(y_test, pred)

Accuracy: 0.876, Precision: 0.8940677966101694, Recall: 0.9723502304147466


(0.876, 0.8940677966101694, 0.9723502304147466)

In [36]:
np.mean(y_test)

0.868