### MultiLabel Classifier for Toxic Comments Classifier

Author: Md Junaid Alam

In [11]:
!pip install scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Import nltk libraries and download
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

import re
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords

In [14]:
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()
eng_stopwords = set(stopwords.words("english"))

In [15]:
# Download the train.csv from the google drive by providing its Unique ID
! gdown 1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV

Downloading...
From: https://drive.google.com/uc?id=1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV
To: /content/train.csv
100% 68.8M/68.8M [00:00<00:00, 146MB/s]


In [16]:
# Load Dataset
df = pd.read_csv("/content/train.csv")

In [17]:
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
156337,ccb30b8a04538cc0,"message \n\nDude, this is the Internet, not ou...",1,0,1,0,1,0
156439,ce306da96cb87b7d,""":The """"rampant speculation"""" was a cited stat...",0,0,0,0,0,0
11794,1f2fb372744e39cc,"""It does not matter if they disclaim it's trut...",0,0,0,0,0,0
64639,acfdfbeff63718ff,One possible solution maybe could,0,0,0,0,0,0
48022,80462cbcbf03c896,"""\n\n A barnstar for you! \n\n The Teamwork B...",0,0,0,0,0,0
58240,9bed30929596f984,That's more or less finished now if you want t...,0,0,0,0,0,0
67377,b4482442dad4b563,"""\nI'm afraid I can't go on hearsay. «Talk» """,0,0,0,0,0,0
27872,49c8a2ea474bedf9,"yes i do. have sourced the ones needing it, oo...",0,0,0,0,0,0
41403,6e708337fe2287df,OMG\nTHIS THING IS TOO LONG!!!!!!!!please shor...,0,0,0,0,0,0
35407,5e8f520b9af4897c,get a life you stupid wikipedia nerd seriosly ...,1,0,1,0,1,0


In [18]:
df["toxic"].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [19]:
df["severe_toxic"].value_counts()

0    157976
1      1595
Name: severe_toxic, dtype: int64

In [20]:
# Create a dictionary of Aphost lookup
APHOST_LOOKUP = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [21]:
# Define the function for data cleaning
def clean_data_pipeline(comment):
    comment=comment.lower() # Convert to lower case
    comment=re.sub("\\n","",comment) # Remove new line
    # Remove the leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    comment=re.sub("\[\[.*\]","",comment) # Remove users
    words=tokenizer.tokenize(comment) # Tokenize into words
    # Replace apostrophe with reference to the Aphost Dictionary 
    words=[APHOST_LOOKUP[word] if word in APHOST_LOOKUP else word for word in words]
    words=[lem.lemmatize(word, "v") for word in words] # Lemmatization
    words = [w for w in words if not w in eng_stopwords] # Remove stopwords
    clean_sent=" ".join(words)
    return(clean_sent)

In [22]:
#corpus = df['comment_text'].apply(nfx.remove_stopwords)
corpus = df['comment_text'].apply(clean_data_pipeline)

## Feature Engineering

In [23]:
# Initiate a TF-IDF Vectorizer
tfidf = TfidfVectorizer(min_df=100,  max_features=30000)

In [24]:
# Generate vectorized features features
Xfeatures = tfidf.fit_transform(corpus).toarray()

In [25]:
# Examine samples
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
156106,c911b71975441c3d,mR. DEARING ROCKS ON!,0,0,0,0,0,0
26925,474ce6f0100b7508,"""\n\n What sport did he block in, in HS? \n\nI...",0,0,0,0,0,0
52198,8ba65566fc792a7d,"another, well-intentioned editor puts",0,0,0,0,0,0
156738,d2eac3d3e3b87e8e,Calling you a dildo is not vandalism (seriousl...,1,0,0,0,0,0
52651,8cc968809c0baade,You Haven't a Clue \n\nYou Haven't a clue what...,1,0,1,0,0,0
13758,2451f6f8c1b690c9,(UTC)\n\nThings are not this simple. There is ...,0,0,0,0,0,0
144763,11f66f5616c690ab,U are a bitch nigga. Fuck u and the guy who bl...,1,0,1,0,1,1
24696,414ace4068d42441,"Also, it's well known that slavic women don't ...",1,0,1,0,1,1
144802,12b17308407fffd4,"""\n\n The two studies are prominently discusse...",0,0,0,0,0,0
152094,87d8a2ccc4debc98,One or more users of this I.P. address repeate...,0,0,0,0,0,0


In [26]:
# Get the columns related to toxicity
y = df[['toxic', 'severe_toxic', 'obscene',	'threat','insult', 'identity_hate']]

In [27]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

### Binary Relevance

In [28]:
# Multi-Label conversion to Multi-Class binary classification
binary_rel_clf = BinaryRelevance(LogisticRegression())

In [29]:
# Train the classifier
binary_rel_clf.fit(X_train,y_train)

In [30]:
# Make predictions
br_prediction = binary_rel_clf.predict(X_test)

In [31]:
#Check the accuracy
accuracy_score(y_test,br_prediction)

0.9184283088235294

In [32]:
# Check for the Hamming Loss which indicates incorrect prediction
hamming_loss(y_test,br_prediction)

0.019468582887700536

#### Classifier chains

In [33]:
def get_model_metrics(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [34]:
# Build a model chain using LogisticRegression
clf_model_chained = get_model_metrics(LogisticRegression(),ClassifierChain,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
clf_model_chained

{'accuracy:': 0.9201412098930482, 'hamming_score': 0.019249247994652406}

### Multilabel Classification

In [36]:
clf_labelP_model = get_model_metrics(LogisticRegression(),LabelPowerset,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
clf_labelP_model

{'accuracy:': 0.917550969251337, 'hamming_score': 0.02067318404634581}

#### Test to validate the prediction

In [38]:
# Identify the index of a clean and a toxic comment to test
CLEAN = 2
TOXIC = 19191

In [39]:
# Take a clean comment
comment_clean = df['comment_text'].iloc[CLEAN]
comment_clean

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [40]:
# take a toxic comment
comment_toxic = df['comment_text'].iloc[TOXIC]
comment_toxic

'Did you know... \n\n... that you are an ugly piece of shit?\n\nSerious question.'

In [41]:
# Output of clean comment
df[df.columns[2:]].iloc[CLEAN]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 2, dtype: int64

In [42]:
# Output of toxic comment
df[df.columns[2:]].iloc[TOXIC]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 19191, dtype: int64

In [43]:
# Vectorized  clean comment
vec_clean = tfidf.transform([comment_clean])

In [44]:
# Vectorized Toxic comment
vec_toxic= tfidf.transform([comment_toxic])

In [45]:
# Predict for clean comment
binary_rel_clf.predict(vec_clean).toarray()

array([[0, 0, 0, 0, 0, 0]])

In [46]:
# Predict for toxic comment
binary_rel_clf.predict(vec_toxic).toarray()

array([[1, 0, 1, 0, 1, 0]])

### Hence we observe that it correctly predicts the clean and toxic comments. Hence our classifier is ready.

#### Save the Toxic Comment Classifier for creating API and REST End points and validate them

In [47]:
# Import the pickle library for persisting and loading saved models and tf_idf
import pickle

In [49]:
# save the model as a file
model_filename = 'toxic_comments_classifier.sav'
pickle.dump(binary_rel_clf, open(model_filename, 'wb'))

In [51]:
# Save the tf_idf as a file
tf_idf_filename = 'toxic_comments_tfidf_vectorizer.sav'
pickle.dump(tfidf, open(tf_idf_filename, 'wb'))

In [65]:
# Let us try a sample comment
sample_comment = "Hey you look ugly"

In [66]:
# load the saved model from disk
loaded_model = pickle.load(open(model_filename, 'rb'))

In [67]:
# load the saved tf_idf from disk
tf_idf = pickle.load(open(tf_idf_filename, 'rb'))

In [68]:
# Test from the saved tf_idf and model
vectorized_comment = tf_idf.transform([sample_comment])

In [69]:
# Classify the topic for the complaint
loaded_model.predict(vectorized_comment).toarray()

array([[1, 0, 0, 0, 1, 0]])

The saved classifier and vectorizer can be used for creating an API and REST Service to classify any toxic comments and implement in any Desktop, Web or Mobile applications.