### MultiLabel Classifier for Toxic Comments Classifier

Author: Md Junaid Alam

In [1]:
!pip install scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install neattext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Import nltk libraries and download
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

import re
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords

import joblib

In [5]:
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()
eng_stopwords = set(stopwords.words("english"))

In [6]:
# Download the train.csv from the google drive by providing its Unique ID
! gdown 1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV

Downloading...
From: https://drive.google.com/uc?id=1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV
To: /content/train.csv
100% 68.8M/68.8M [00:00<00:00, 279MB/s]


In [7]:
# Load Dataset
df = pd.read_csv("/content/train.csv")

In [8]:
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
52800,8d2529f5f496ba22,"""\n\n Demented Cartoon Movie \n\nYes I have, b...",0,0,0,0,0,0
133029,c7b17cb1acde86ee,. I'm sure you meant well,0,0,0,0,0,0
134081,cd170abd27edc660,WikiProject Alternative music September 2007 N...,0,0,0,0,0,0
89875,f070ac124550ea32,"Yer, but its also based on the size of the mar...",1,0,0,0,0,0
49885,8564468984d81f80,Let us keep it as Wiki Conference India. Each ...,0,0,0,0,0,0
149093,56f19bcf42c7fcb3,"Theresa, I would urge caution to the torches a...",0,0,0,0,0,0
19486,33734602a869ce61,". If you live 3,000 miles away from Karyn's ap...",0,0,0,0,0,0
11821,1f43b3a65d205afe,How can my comment on my own talk page WP:DISR...,0,0,0,0,0,0
132804,c688a6d0d47e4cda,"""\nFrom Wikipedia:MOSQUOTE:\n""""A quotation is ...",0,0,0,0,0,0
14366,25f3d4d49e64ff85,Fantastic idea! Since Family Guy uses music so...,0,0,0,0,0,0


In [9]:
df["toxic"].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [10]:
df["severe_toxic"].value_counts()

0    157976
1      1595
Name: severe_toxic, dtype: int64

In [11]:
# Import neatText
import neattext as nt
import neattext.functions as nfx

In [12]:
# Explore For Noise
#df['comment_text'].apply(lambda x:nt.TextFrame(x).noise_scan())

In [13]:
# Explore For stopwords
#df['comment_text'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

In [14]:
# When stopwords removed
#df['comment_text'].apply(nfx.remove_stopwords)

In [15]:
# Create a dictionary of Aphost lookup
APHOST_LOOKUP = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [16]:
# Define the function for data cleaning
def clean_data_pipeline(comment):
    comment=comment.lower() # Convert to lower case
    comment=re.sub("\\n","",comment) # Remove new line
    # Remove the leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    comment=re.sub("\[\[.*\]","",comment) # Remove users
    words=tokenizer.tokenize(comment) # Tokenize into words
    # Replace apostrophe with reference to the Aphost Dictionary 
    words=[APHOST_LOOKUP[word] if word in APHOST_LOOKUP else word for word in words]
    words=[lem.lemmatize(word, "v") for word in words] # Lemmatization
    words = [w for w in words if not w in eng_stopwords] # Remove stopwords
    clean_sent=" ".join(words)
    return(clean_sent)

In [17]:
#corpus = df['comment_text'].apply(nfx.remove_stopwords)
corpus = df['comment_text'].apply(clean_data_pipeline)

## Feature Engineering

In [18]:
# Initiate a TF-IDF Vectorizer
tfidf = TfidfVectorizer(min_df=100,  max_features=30000)

In [19]:
# Generate vectorized features features
Xfeatures = tfidf.fit_transform(corpus).toarray()

In [20]:
# Examine samples
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
62171,a6537520900369f6,Listen here Schumin\n\nI have waged a war of a...,0,0,0,0,0,0
52906,8d67235438097663,"Not\n\na Goddamned thing, if you dont like it,...",1,0,0,0,0,0
15389,28ac0fe57fcd956c,"its been merged, but how to delete the other a...",0,0,0,0,0,0
153348,9b8b9d7f821ec107,Massive Plagarism\nThe bulk of the article is ...,0,0,0,0,0,0
114683,65573622ed3c6f62,"""This article needs to stay and grow. In my o...",0,0,0,0,0,0
26961,4767daebdc2142af,"""\n\nThe anonymous comments above (written by ...",0,0,0,0,0,0
99384,139b903f981cf4ed,Again Pan-Turkistic vandalism \n\nSorry to bot...,0,0,0,0,0,0
119603,7f719934cad289be,"""\nYou could try proposing a change. (Talk) """,0,0,0,0,0,0
47091,7dcc8797cede5cd1,"""\n\n TV show? \n\nWill you please explain of ...",0,0,0,0,0,0
4433,0bcc879e6af0093d,proposed the bank notes,0,0,0,0,0,0


In [21]:
# Get the columns related to toxicity
y = df[['toxic', 'severe_toxic', 'obscene',	'threat','insult', 'identity_hate']]

In [22]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

In [23]:
# import skmultilearn
import skmultilearn

### Binary Relevance

In [24]:
# Multi-Label conversion to Multi-Class binary classification
binary_rel_clf = BinaryRelevance(MultinomialNB())

In [25]:
# Train the classifier
binary_rel_clf.fit(X_train,y_train)

In [26]:
# Make predictions
br_prediction = binary_rel_clf.predict(X_test)

In [27]:
#Check the accuracy
accuracy_score(y_test,br_prediction)

0.912850935828877

In [28]:
# Check for the Hamming Loss which indicates incorrect prediction
hamming_loss(y_test,br_prediction)

0.022396529634581105

#### Classifier chain

In [29]:
def get_model_metrics(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [30]:
# Build a model chain using LogisticRegression
clf_model_chained = get_model_metrics(LogisticRegression(),ClassifierChain,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
clf_model_chained

{'accuracy:': 0.9201412098930482, 'hamming_score': 0.019249247994652406}

### Multilabel Classification

In [32]:
clf_labelP_model = get_model_metrics(LogisticRegression(),LabelPowerset,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
clf_labelP_model

{'accuracy:': 0.917550969251337, 'hamming_score': 0.02067318404634581}

#### Test to validate the prediction

In [34]:
# Identify the index of a clean and a toxic comment to test
CLEAN = 2
TOXIC = 19191

In [35]:
# Take a clean comment
comment_clean = df['comment_text'].iloc[CLEAN]
comment_clean

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [36]:
# take a toxic comment
comment_toxic = df['comment_text'].iloc[19191]
comment_toxic

'Did you know... \n\n... that you are an ugly piece of shit?\n\nSerious question.'

In [37]:
# Output of clean comment
df[df.columns[2:]].iloc[CLEAN]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 2, dtype: int64

In [38]:
# Output of toxic comment
df[df.columns[2:]].iloc[TOXIC]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 19191, dtype: int64

In [39]:
# Vectorized  clean
vec_clean = tfidf.transform([comment_clean])

In [40]:
vec_toxic= tfidf.transform([comment_toxic])

In [41]:
# Predict for clean comment
binary_rel_clf.predict(vec_clean).toarray()

array([[0, 0, 0, 0, 0, 0]])

In [42]:
# Predict for toxic comment
binary_rel_clf.predict(vec_toxic).toarray()

array([[1, 0, 1, 0, 0, 0]])

### Hence we observe that it correctly predicts the clean and toxic comments. Hence our classifier is ready.

#### Save the Toxic Comment Classifier for creating API and REST End points

In [43]:
# Save the Model
binary_rel_clf_file = open("toxic_comments_classifier.pkl","wb")
joblib.dump(binary_rel_clf,binary_rel_clf_file)
binary_rel_clf_file.close()

In [44]:
# Save the vectorizer
tfidf_vectorizer_file = open("toxic_comments_tfidf_vectorizer.pkl","wb")
joblib.dump(tfidf,tfidf_vectorizer_file)
tfidf_vectorizer_file.close()

The saved classifier and vectorizer can be used for creating an API and REST Service to classify any toxic comments and implement in any Desktop, Web or Mobile applications.