In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
df[df['toxic'] == 1]['comment_text'].head(5)


6          COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
12    Hey... what is it..\n@ | talk .\nWhat is it......
16    Bye! \n\nDon't look, come or think of comming ...
42    You are gay or antisemmitian? \n\nArchangel WH...
43             FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!
Name: comment_text, dtype: object

In [5]:
df.shape

(159571, 8)

In [6]:
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
df[label_cols].sum()


toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [7]:
df['comment_text'].iloc[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [5]:
import re


In [7]:

def clean_text(text):
    text = str(text).lower()
    
    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    
    # remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    
    # remove non-alphanumeric characters
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    
    # remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    return text


In [9]:
df['clean_text'] = df['comment_text'].apply(clean_text)
df[['comment_text', 'clean_text']].head(5)


Unnamed: 0,comment_text,clean_text
0,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,d aww he matches this background colour i m se...
2,"Hey man, I'm really not trying to edit war. It...",hey man i m really not trying to edit war it s...
3,"""\nMore\nI can't make any real suggestions on ...",more i can t make any real suggestions on impr...
4,"You, sir, are my hero. Any chance you remember...",you sir are my hero any chance you remember wh...


In [10]:
sample = df[df['toxic'] == 1].iloc[0]
print("ORIGINAL:\n", sample['comment_text'])
print("\nCLEANED:\n", sample['clean_text'])


ORIGINAL:
 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK

CLEANED:
 cocksucker before you piss around on my work


In [11]:
label_cols = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

X = df['clean_text']
y = df[label_cols]


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features = 50000,
        ngram_range = (1,2)
)
        

In [16]:
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [17]:
X_train_vec.shape


(127656, 50000)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier


In [29]:
model = OneVsRestClassifier(
    LogisticRegression(
        max_iter=1000,
        solver="saga",
        class_weight="balanced"
    )
)


In [31]:
model.fit(X_train_vec, y_train)




In [33]:
    y_pred = model.predict(X_test_vec)


In [35]:
y_pred

array([[1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 1, 1, 0],
       [0, 0, 0, 0, 0, 0]])

In [37]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=label_cols))


               precision    recall  f1-score   support

        toxic       0.54      0.90      0.67      3056
 severe_toxic       0.32      0.64      0.42       321
      obscene       0.57      0.90      0.70      1715
       threat       0.02      0.99      0.03        74
       insult       0.40      0.91      0.56      1614
identity_hate       0.55      0.22      0.32       294

    micro avg       0.36      0.86      0.51      7074
    macro avg       0.40      0.76      0.45      7074
 weighted avg       0.50      0.86      0.62      7074
  samples avg       0.06      0.09      0.07      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
samples = [
    "You are an idiot",
    "I love this video so much",
    "I will kill you",
]

sample_vec = vectorizer.transform(samples)
sample_preds = model.predict(sample_vec)

for text, pred in zip(samples, sample_preds):
    print(text)
    print(dict(zip(label_cols, pred)))
    print()


You are an idiot
{'toxic': 1, 'severe_toxic': 0, 'obscene': 1, 'threat': 0, 'insult': 1, 'identity_hate': 0}

I love this video so much
{'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}

I will kill you
{'toxic': 1, 'severe_toxic': 0, 'obscene': 1, 'threat': 1, 'insult': 1, 'identity_hate': 0}



In [41]:
import joblib

joblib.dump(model, "toxic_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("Model and vectorizer saved successfully")


Model and vectorizer saved successfully
