In [19]:
# import all the libraries needed to build a toxicity classifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [20]:
toxicity = pd.read_json("social_media_toxicity_dataset.json")


In [21]:
toxicity.head()


Unnamed: 0,worker_id,task_id,task_response_id,text,Is this text toxic?
0,4RHRV9MACQGW,ccd0a1da-0e6a-453e-a6d1-6418cc4a5546,3d66075c-623e-4e64-93ac-123d379d28f0,I came here to say this exactly!,Not Toxic
1,R7XYZ4FKMFF9,d977c704-4ae3-4381-99de-95014ee57a05,acbd5847-698e-4098-9499-d547948a2da2,Thank you :),Not Toxic
2,MNTYRY6PYPQF,89a4c206-dfbc-40e8-ac73-9bb1b5c8e56e,2def7071-3e27-4b63-bd66-99f041363e2a,I feel a subreddit being born,Not Toxic
3,DR6XNZMT9KRH,41add9eb-bbe3-47d9-8ea2-b571f96fe655,b6e11a16-686d-43da-87f4-8201d8bb2238,Disturbing wholesomeness should be a thing,Not Toxic
4,G6VJRCCGZ9D6,883f5bd6-058c-460f-8e06-c0793af441d1,7f0bdd8f-428a-4ecb-b432-8e6b1f3706c0,Smite jinx... int or pentakill?\n \n \n \n Both.,Not Toxic


In [22]:
# create a pipeline that uses the CountVectorizer and MultinomialNB to classify the text 
# to predict the toxicity of a text 
pipeline = Pipeline([
    ("vectorizer", CountVectorizer()), 
    ("tfidf", TfidfTransformer()), 
    ("clf", MultinomialNB()) 
])

In [23]:

# use the pipeline to classify 
pipeline.fit(toxicity["text"], toxicity["Is this text toxic?"])

In [24]:
# predict the toxicity of the text 
predictions = pipeline.predict(["this is a toxic text", "this is not a toxic text"])
print(predictions)

['Not Toxic' 'Not Toxic']


In [25]:

# create an array with the following toxic comments: "fuck you!", "you're so stupid", "go kys duckwad"
toxic_comments = ["fuck you!", "you\'re so stupid", "go kys duckwad"]
print(pipeline.predict(toxic_comments))

['Toxic' 'Toxic' 'Toxic']


In [26]:
# split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(toxicity["text"], toxicity["Is this text toxic?"], test_size=0.2, random_state=42)

In [27]:
pipeline.fit(X_train, y_train)


In [28]:
# predict the labels of the testing data 
y_pred = pipeline.predict(X_test)

In [29]:
# print the accuracy score 
print(accuracy_score(y_test, y_pred))

0.855


In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   Not Toxic       0.84      0.87      0.85        97
       Toxic       0.87      0.84      0.86       103

    accuracy                           0.85       200
   macro avg       0.85      0.86      0.85       200
weighted avg       0.86      0.85      0.86       200



In [31]:
# binarize y_test and y_pred 
y_test_bin = y_test.copy() 
y_pred_bin = y_pred.copy() 
y_test_bin[y_test_bin==0] = -1 
y_pred_bin[y_pred_bin==0] = -1 

  y_pred_bin[y_pred_bin==0] = -1


In [32]:
toxic_comments = ["fuck you!", "hello :) ","you\'re so stupid", "yo o", "go kys duckwad"]
print(pipeline.predict(toxic_comments))

['Toxic' 'Not Toxic' 'Toxic' 'Not Toxic' 'Toxic']


In [34]:
chat_log = pd.read_json("chat_log_20230825_zackrawrr.json")
chat_log.head()

Unnamed: 0,username,chat_message,timestamp
0,captainborat2,Baldur's Gate 3 Romance Just Isn't Interesting...,2023-08-25 13:18:10.912280
1,lore_sound,Halo 2,2023-08-25 13:18:10.968594
2,juanmajfry,xdd,2023-08-25 13:18:11.384643
3,w3btree,@zackrawrr i got over diablo 4 by playing lost...,2023-08-25 13:18:11.384786
4,steeltarkus,based,2023-08-25 13:18:11.420514


In [38]:
for index, row in chat_log.iterrows():
    print(
        f"Index: {index}, Name: {row['username']}, message: {row['chat_message']}, toxic:{pipeline.predict({row['chat_message']})}"
    )

Index: 0, Name: captainborat2, message: Baldur's Gate 3 Romance Just Isn't Interesting | Extra Punctuation ***, toxic:['Not Toxic']
Index: 1, Name: lore_sound, message: Halo 2, toxic:['Not Toxic']
Index: 2, Name: juanmajfry, message: xdd, toxic:['Not Toxic']
Index: 3, Name: w3btree, message: @zackrawrr i got over diablo 4 by playing lost ark, toxic:['Not Toxic']
Index: 4, Name: steeltarkus, message: based, toxic:['Not Toxic']
Index: 5, Name: chadadam, message: God of War, toxic:['Toxic']
Index: 6, Name: papalotapuss, message: spoilers D:, toxic:['Not Toxic']
Index: 7, Name: parcelazo, message: the moon monkaS, toxic:['Toxic']
Index: 8, Name: rando93, message: Spoilers perfection does not exist... The games are good or they are Diablo IV KEKW, toxic:['Toxic']
Index: 9, Name: floppytop2, message: lol, toxic:['Not Toxic']
Index: 10, Name: kard_ttv, message: super mario world is perfect, toxic:['Not Toxic']
Index: 11, Name: darkshadowx91, message: Wind waker KEKW, toxic:['Toxic']
Index: 12