<a href="https://colab.research.google.com/github/joynaomi81/Toxic-Comment-Detection/blob/main/Toxic_Comment_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/text.csv')

In [None]:
df.head() # Check for the first 5 rows

Unnamed: 0,comment_text,toxic
0,This letter perfectly illustrates why any hope...,1
1,One muslim casualty vs the hundreds and thousa...,1
2,(fuck you Osama bin laden and your afghanistan...,1
3,As long as Trump keeps Stiggin' It to the libs...,1
4,This article is a load of crap.... Another Fa...,1


In [None]:
df.tail() # Check for the last 5 rows

Unnamed: 0,comment_text,toxic
19995,i like smiley pancakes and crap on stick,0
19996,"""\n\n""""żem"""" is not equal to """"że"""". """"żem"""" ...",0
19997,"""\n\n Headlines \n\nCan you please add this co...",0
19998,"Thank You, sorry.–",0
19999,Schooling \n\nI attended Harrison Trimble in M...,0


In [None]:
# Check for the columns lables
df.columns

Index(['comment_text', 'toxic'], dtype='object')

In [None]:
df.info() # Information about the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_text  20000 non-null  object
 1   toxic         20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [None]:
df.describe() # Descriptive statistics of the dataset

Unnamed: 0,toxic
count,20000.0
mean,0.5
std,0.500013
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [None]:
df.shape # Check for the shape of the dataset

(20000, 2)

In [None]:
# Check for unique values
df.nunique()

Unnamed: 0,0
comment_text,19960
toxic,2


# Data cleaning

In [None]:
#check for missing values
df.isna().sum()

Unnamed: 0,0
comment_text,0
toxic,0


In [None]:
# Checking for duplicate rows in the DataFrame
df.duplicated().sum()

39

There are 39 rows in the DataFrame that are duplicates of other rows.

In [None]:
# Drop duplicates rows
df = df.drop_duplicates()


In [None]:
# Check for the new data shape
df.shape

(19961, 2)

In [None]:
df.loc[:1] # locate a specific row

Unnamed: 0,comment_text,toxic
0,This letter perfectly illustrates why any hope...,1
1,One muslim casualty vs the hundreds and thousa...,1


# Data Pre-processing

## Convert data to Lowercase

In [None]:
df['clean_text'] = df['comment_text'].str.lower()
df.sample(5)

Unnamed: 0,comment_text,toxic,clean_text
637,You got it.....you nailed it on the head.........,1,you got it.....you nailed it on the head.........
203,Oh! I found out what his knickers are in a not...,1,oh! i found out what his knickers are in a not...
12118,"You're right, in a case like this it should be...",0,"you're right, in a case like this it should be..."
11053,Blocked for truth seeking! \n\nThis user shoul...,0,blocked for truth seeking! \n\nthis user shoul...
16682,on creepshow 2 you after he eats everybody you...,0,on creepshow 2 you after he eats everybody you...


## Removal of Punctuations

In [None]:
def remove_punctuations(text):
  punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
  for x in text:
    if x in punctuations:
      text = text.replace(x, "")
  return text

In [None]:
df['clean_text'] = df['comment_text'].apply(remove_punctuations)
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,This letter perfectly illustrates why any hope...
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty vs the hundreds and thousa...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck you Osama bin laden and your afghanistani...
3,As long as Trump keeps Stiggin' It to the libs...,1,As long as Trump keeps Stiggin It to the libs ...
4,This article is a load of crap.... Another Fa...,1,This article is a load of crap Another Fake N...


## Removal of Stopwords

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [None]:
words = set(stopwords.words('english'))
def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in words])

In [None]:
df['clean_text'] = df['comment_text'].apply(lambda a: remove_stopwords(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,"This letter perfectly illustrates hoped ""recon..."
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty vs hundreds thousands vict...
2,(fuck you Osama bin laden and your afghanistan...,1,(fuck Osama bin laden afghanistani terrorist c...
3,As long as Trump keeps Stiggin' It to the libs...,1,"As long Trump keeps Stiggin' It libs, Palin-Am..."
4,This article is a load of crap.... Another Fa...,1,This article load crap.... Another Fake News P...


## Removal of Special Characters

In [None]:
import re
def remove_spl_chars(text):
  text = re.sub('[^a-zA-Z0-9]', ' ', text)
  text = re.sub('\s+', ' ', text)
  return text

In [None]:
df['clean_text'] = df['comment_text'].apply(lambda a: remove_spl_chars(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,This letter perfectly illustrates why any hope...
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty vs the hundreds and thousa...
2,(fuck you Osama bin laden and your afghanistan...,1,fuck you Osama bin laden and your afghanistan...
3,As long as Trump keeps Stiggin' It to the libs...,1,As long as Trump keeps Stiggin It to the libs ...
4,This article is a load of crap.... Another Fa...,1,This article is a load of crap Another Fake Ne...


## Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
  return " ".join([stemmer.stem(word) for word in text.split()])

In [None]:
df['clean_text'] = df['comment_text'].apply(lambda a: stem_words(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,thi letter perfectli illustr whi ani hope for ...
1,One muslim casualty vs the hundreds and thousa...,1,one muslim casualti vs the hundr and thousand ...
2,(fuck you Osama bin laden and your afghanistan...,1,(fuck you osama bin laden and your afghanistan...
3,As long as Trump keeps Stiggin' It to the libs...,1,"as long as trump keep stiggin' it to the libs,..."
4,This article is a load of crap.... Another Fa...,1,thi articl is a load of crap.... anoth fake ne...


## Lemmatization

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmetizer = WordNetLemmatizer()
nltk.download('wordnet')
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "AV": wordnet.ADV}

def lemmatize_words(text):
  pos_tagged_text = nltk.pos_tag(text.split())
  return " ".join([lemmetizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
df['clean_text'] = df['comment_text'].apply(lambda a: lemmatize_words(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,This letter perfectly illustrate why any hoped...
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty v the hundred and thousand...
2,(fuck you Osama bin laden and your afghanistan...,1,(fuck you Osama bin laden and your afghanistan...
3,As long as Trump keeps Stiggin' It to the libs...,1,"As long a Trump keep Stiggin' It to the libs, ..."
4,This article is a load of crap.... Another Fa...,1,This article be a load of crap.... Another Fak...


## Removal of URLs

In [None]:
import re

def remove_urls(text):
  return re.sub(r'https?://\S+|www\.\S+', '', text) # remove urls

df['clean_text'] = df['comment_text'].apply(lambda a: remove_urls(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,This letter perfectly illustrates why any hope...
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty vs the hundreds and thousa...
2,(fuck you Osama bin laden and your afghanistan...,1,(fuck you Osama bin laden and your afghanistan...
3,As long as Trump keeps Stiggin' It to the libs...,1,As long as Trump keeps Stiggin' It to the libs...
4,This article is a load of crap.... Another Fa...,1,This article is a load of crap.... Another Fa...


## Removal of HTML Tags

In [None]:
def remove_html_tags(text):
  return re.sub(r'<.*?>', '', text)

df['clean_text'] = df['comment_text'].apply(lambda a: remove_html_tags(a))
df.head()

Unnamed: 0,comment_text,toxic,clean_text
0,This letter perfectly illustrates why any hope...,1,This letter perfectly illustrates why any hope...
1,One muslim casualty vs the hundreds and thousa...,1,One muslim casualty vs the hundreds and thousa...
2,(fuck you Osama bin laden and your afghanistan...,1,(fuck you Osama bin laden and your afghanistan...
3,As long as Trump keeps Stiggin' It to the libs...,1,As long as Trump keeps Stiggin' It to the libs...
4,This article is a load of crap.... Another Fa...,1,This article is a load of crap.... Another Fa...


In [None]:
df.columns

Index(['comment_text', 'toxic', 'clean_text'], dtype='object')

## Model Training/ Model Evaluation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
cv = TfidfVectorizer(max_features=5000)
x = cv.fit_transform(df['comment_text']).toarray()
y=  df['toxic']

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [227]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

cv = TfidfVectorizer(max_features=5000)

# Transform the data
x = cv.fit_transform(df['comment_text']).toarray()
y = df['toxic']

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Train your model
model = MultinomialNB()
model.fit(x_train, y_train)

# Save the TfidfVectorizer and model
joblib.dump(cv, 'feature.pkt')
joblib.dump(model, 'toxic model.pkt')


['toxic model.pkt']

In [None]:
prediction = mnb.predict(x_test)

In [None]:
comparison_df = pd.DataFrame(np.c_[y_test, prediction], columns=['Actual_Labels', 'Predicted_Labels'])

print(comparison_df)

      Actual_Labels  Predicted_Labels
0                 1                 1
1                 1                 1
2                 0                 0
3                 0                 0
4                 1                 1
...             ...               ...
3988              1                 1
3989              0                 0
3990              1                 1
3991              0                 0
3992              0                 0

[3993 rows x 2 columns]


In [None]:
accuracy = accuracy_score(y_test, prediction)
print(f'Accuracy: {accuracy}')

print('\nClassification Report:')
print(classification_report(y_test, prediction))


Accuracy: 0.8993238166791886

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1994
           1       0.88      0.92      0.90      1999

    accuracy                           0.90      3993
   macro avg       0.90      0.90      0.90      3993
weighted avg       0.90      0.90      0.90      3993



In [None]:
cm = confusion_matrix(y_test, prediction)

In [None]:
conf_matrix = confusion_matrix(comparison_df['Actual_Labels'], comparison_df['Predicted_Labels'])
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[1753  241]
 [ 161 1838]]


In [None]:
# Save the TfidfVectorizer
joblib.dump(cv, 'feature.pkt')

# Save the trained model
joblib.dump(model, 'toxic model.pkt')


['toxic model.pkt']

In [None]:
# Load the TfidfVectorizer
loaded_cv = joblib.load('feature.pkt')

# Load the trained model
loaded_model = joblib.load('toxic model.pkt')

new_comments = ["I hate You"]
new_x = loaded_cv.transform(new_comments).toarray()
predictions = loaded_model.predict(new_x)

predictions = loaded_model.predict(new_x)

# Print the results of the prediction
for comment, prediction in zip(new_comments, predictions):
    print(f"Comment: '{comment}' - Prediction: {'Toxic' if prediction == 1 else 'Not Toxic'}")


Comment: 'I hate You' - Prediction: Toxic
