### MultiLabel Classifier for Toxic Comments Classifier

Author: Md Junaid Alam

In [1]:
!pip install scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [2]:
!pip install neattext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


In [3]:
# Import nltk libraries and download
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss,classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN

import re
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords

import joblib

In [5]:
lem = WordNetLemmatizer()
tokenizer=TweetTokenizer()
eng_stopwords = set(stopwords.words("english"))

In [6]:
# Download the train.csv from the google drive by providing its Unique ID
! gdown 1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV

Downloading...
From: https://drive.google.com/uc?id=1hdJ8HxCOeS7ZCtCGPLNfKoWt4h_8qJUV
To: /content/train.csv
100% 68.8M/68.8M [00:01<00:00, 57.2MB/s]


In [7]:
# Load Dataset
df = pd.read_csv("/content/train.csv")

In [8]:
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
59541,9f76beff4840a109,POV Tag\n\nI just noticed that the bias tag wa...,0,0,0,0,0,0
45146,78b780bc72596d7a,"woah\nim fron spokane too,and im liberal.woah!",0,0,0,0,0,0
131838,c173e7e97e0f5ade,Console Software developer values \n\nuser:Ind...,0,0,0,0,0,0
7838,14dc8a3136d0561e,My twopenn'orth is that it is probably better ...,0,0,0,0,0,0
147570,3e80370452eaaaa5,Just a little note \n\nIt's generally consider...,0,0,0,0,0,0
54004,90533ea0b16a4347,"""\n\nAlright. First to your analysis. Informat...",0,0,0,0,0,0
1438,03d858bf3a01074e,"This article needs referencing, per WP:CITE.",0,0,0,0,0,0
56412,96b64a98ebc9c740,"Do you sex goats? \n\nLook, don't voice this a...",1,0,1,0,0,0
13581,23d7d3bd41ae7dd5,Merger \n\nIf this article is to avoid being m...,0,0,0,0,0,0
54457,918cd91bb20a1b36,""":::::I 'spose we could be patient and leave t...",0,0,0,0,0,0


In [9]:
df["toxic"].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [10]:
df["severe_toxic"].value_counts()

0    157976
1      1595
Name: severe_toxic, dtype: int64

In [11]:
# Import neatText
import neattext as nt
import neattext.functions as nfx

In [12]:
# Create a dictionary of Aphost lookup
APHOST_LOOKUP = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"
}

In [13]:
# Define the function for data cleaning
def clean_data_pipeline(comment):
    comment=comment.lower() # Convert to lower case
    comment=re.sub("\\n","",comment) # Remove new line
    # Remove the leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    comment=re.sub("\[\[.*\]","",comment) # Remove users
    words=tokenizer.tokenize(comment) # Tokenize into words
    # Replace apostrophe with reference to the Aphost Dictionary 
    words=[APHOST_LOOKUP[word] if word in APHOST_LOOKUP else word for word in words]
    words=[lem.lemmatize(word, "v") for word in words] # Lemmatization
    words = [w for w in words if not w in eng_stopwords] # Remove stopwords
    clean_sent=" ".join(words)
    return(clean_sent)

In [14]:
#corpus = df['comment_text'].apply(nfx.remove_stopwords)
corpus = df['comment_text'].apply(clean_data_pipeline)

## Feature Engineering

In [15]:
# Initiate a TF-IDF Vectorizer
tfidf = TfidfVectorizer(min_df=100,  max_features=30000)

In [16]:
# Generate vectorized features features
Xfeatures = tfidf.fit_transform(corpus).toarray()

In [17]:
# Examine samples
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
49875,8559845415d77f7e,Hi I know how the talk pages work. Your crite...,0,0,0,0,0,0
121631,8ac4861e970dcd90,Rain is patters on the corrugated iron roof ab...,0,0,0,0,0,0
32371,56129139a86c29b3,"""Please be careful not to remove content from ...",0,0,0,0,0,0
84424,e1c72466bbbb064d,getting pics into the commons \n\nI can't figu...,0,0,0,0,0,0
58406,9c5d74102e3558c5,"""\nThanks, Luxembourg has been removed after c...",0,0,0,0,0,0
81240,d953b2d96d554d1a,"All right, I completely understand and won't t...",0,0,0,0,0,0
19592,33beb00e285a2dc5,Fried chickens \n\nIs dat sum fried chickens?,0,0,0,0,0,0
51002,88655df2e5ab9579,Actually ... \n\nThis entire article needs re...,0,0,0,0,0,0
71854,c067234f7af04c11,Ya wanna believe it. It's like those town meet...,0,0,0,0,0,0
77151,ce9d95ec0c229baf,It's already in the article.,0,0,0,0,0,0


In [18]:
# Get the columns related to toxicity
y = df[['toxic', 'severe_toxic', 'obscene',	'threat','insult', 'identity_hate']]

In [19]:
# Split Data 
X_train,X_test,y_train,y_test = train_test_split(Xfeatures,y,test_size=0.3,random_state=42)

In [20]:
# import skmultilearn
import skmultilearn

### Binary Relevance

In [21]:
# Multi-Label conversion to Multi-Class binary classification
binary_rel_clf = BinaryRelevance(LogisticRegression())

In [22]:
# Train the classifier
binary_rel_clf.fit(X_train,y_train)

In [23]:
# Make predictions
br_prediction = binary_rel_clf.predict(X_test)

In [24]:
#Check the accuracy
accuracy_score(y_test,br_prediction)

0.9184491978609626

In [25]:
# Check for the Hamming Loss which indicates incorrect prediction
hamming_loss(y_test,br_prediction)

0.019465101381461677

#### Classifier chains

In [26]:
def get_model_metrics(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    # Create an Instance
    clf = mlb_estimator(model)
    clf.fit(xtrain,ytrain)
    # Predict
    clf_predictions = clf.predict(xtest)
    # Check For Accuracy
    acc = accuracy_score(ytest,clf_predictions)
    ham = hamming_loss(ytest,clf_predictions)
    result = {"accuracy:":acc,"hamming_score":ham}
    return result

In [27]:
# Build a model chain using LogisticRegression
clf_model_chained = get_model_metrics(LogisticRegression(),ClassifierChain,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
clf_model_chained

{'accuracy:': 0.9201412098930482, 'hamming_score': 0.019249247994652406}

### Multilabel Classification

In [29]:
clf_labelP_model = get_model_metrics(LogisticRegression(),LabelPowerset,X_train,y_train,X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
clf_labelP_model

{'accuracy:': 0.917550969251337, 'hamming_score': 0.02067318404634581}

#### Test to validate the prediction

In [31]:
# Identify the index of a clean and a toxic comment to test
CLEAN = 2
TOXIC = 19191

In [32]:
# Take a clean comment
comment_clean = df['comment_text'].iloc[CLEAN]
comment_clean

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [33]:
# take a toxic comment
comment_toxic = df['comment_text'].iloc[19191]
comment_toxic

'Did you know... \n\n... that you are an ugly piece of shit?\n\nSerious question.'

In [34]:
# Output of clean comment
df[df.columns[2:]].iloc[CLEAN]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 2, dtype: int64

In [35]:
# Output of toxic comment
df[df.columns[2:]].iloc[TOXIC]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 19191, dtype: int64

In [36]:
# Vectorized  clean
vec_clean = tfidf.transform([comment_clean])

In [37]:
vec_toxic= tfidf.transform([comment_toxic])

In [38]:
# Predict for clean comment
binary_rel_clf.predict(vec_clean).toarray()

array([[0, 0, 0, 0, 0, 0]])

In [39]:
# Predict for toxic comment
binary_rel_clf.predict(vec_toxic).toarray()

array([[1, 0, 1, 0, 1, 0]])

### Hence we observe that it correctly predicts the clean and toxic comments. Hence our classifier is ready.

#### Save the Toxic Comment Classifier for creating API and REST End points

In [40]:
# Save the Model
binary_rel_clf_file = open("toxic_comments_classifier.pkl","wb")
joblib.dump(binary_rel_clf,binary_rel_clf_file)
binary_rel_clf_file.close()

In [41]:
# Save the vectorizer
tfidf_vectorizer_file = open("toxic_comments_tfidf_vectorizer.pkl","wb")
joblib.dump(tfidf,tfidf_vectorizer_file)
tfidf_vectorizer_file.close()

The saved classifier and vectorizer can be used for creating an API and REST Service to classify any toxic comments and implement in any Desktop, Web or Mobile applications.