In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_score

In [23]:
# Load the labelled comment dataset from the working directory and preview the first rows.
df=pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [24]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(159571, 8)

In [25]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


In [26]:
# Summary statistics of the numeric label columns; for these 0/1 labels the mean is the
# share of comments carrying that label.
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
# Count missing values per column.
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

## Pre-processing: build the binary `is_profane` target

In [28]:
# Names of the six toxicity label columns shipped with the dataset.
columns_lst = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [29]:
# Binary target: 1 when at least one of the toxicity labels is set on the row, otherwise 0.
df['is_profane'] = df[columns_lst].any(axis='columns').astype(int)


In [30]:
# Display the frame (pandas abbreviates the middle rows) to inspect the new is_profane column.
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_profane
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,0


In [31]:
df['is_profane'].value_counts()
# Class balance check: the non-profane class (0) heavily outnumbers the profane class (1),
# so the target is imbalanced (about 10% positives in the recorded run).

is_profane
0    143346
1     16225
Name: count, dtype: int64

## Text preprocessing

In [52]:
import re
from nltk.corpus import stopwords
# English stopwords as a set, so membership tests during filtering are constant time.
# On a fresh environment the NLTK corpus is not installed and `stopwords.words` raises
# LookupError; fetch it on demand so the notebook survives Restart & Run All.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
def preprocess_text(text, stopword_set=None):
    """Normalise a raw comment into lowercase, letter-only, stopword-free text.

    Steps, in order: lowercase; drop URLs (tokens starting with ``http`` or
    ``www``); drop ``@mentions`` and ``#`` signs (the hashtag word itself is
    kept); drop every character that is not an ASCII letter or whitespace;
    collapse whitespace; drop stopwords.

    Parameters
    ----------
    text : str or missing value
        Raw comment. ``None`` and float ``NaN`` (what pandas yields for an
        empty CSV cell) are treated as an empty comment instead of raising
        ``AttributeError``; any other non-string value is converted with ``str``.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        Cleaned text with single spaces between the remaining words.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    # `http\S+` already covers https URLs; the pattern has no anchors, so no flags are needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)  # mentions are removed whole, hashtags keep their word
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # digits, punctuation and non-ASCII characters
    # NOTE(review): punctuation is stripped before stopword filtering, so a contraction such
    # as "don't" becomes "dont" and will not match an apostrophe-bearing stopword — confirm
    # this is acceptable.
    # split() with no arguments also collapses whitespace runs and trims both ends.
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [50]:
# Clean every comment with preprocess_text. This replaces the raw text in place, so the
# original comments are only recoverable by re-reading the CSV.
# NOTE(review): the execution counts show this cell ran after the model was trained; if the
# model is meant to train on cleaned text, inputs to predict() need the same cleaning —
# confirm the intended cell order.
df['comment_text'] = df['comment_text'].map(preprocess_text)

In [53]:
# Spot-check the cleaner on a sample containing punctuation, a URL, a hashtag and a mention.
preprocess_text("This is a sample comment! Visit http://example.com #example @user")

'sample comment visit example'

In [32]:
# Features are the comment text; the target is the binary is_profane flag.
X, y = df['comment_text'], df['is_profane']

In [33]:
# Inspect the feature series and the target series.
X,y

(0         Explanation\nWhy the edits made under my usern...
 1         D'aww! He matches this background colour I'm s...
 2         Hey man, I'm really not trying to edit war. It...
 3         "\nMore\nI can't make any real suggestions on ...
 4         You, sir, are my hero. Any chance you remember...
                                 ...                        
 159566    ":::::And for the second time of asking, when ...
 159567    You should be ashamed of yourself \n\nThat is ...
 159568    Spitzer \n\nUmm, theres no actual article for ...
 159569    And it looks like it was actually you who put ...
 159570    "\nAnd ... I really don't think you understand...
 Name: comment_text, Length: 159571, dtype: object,
 0         0
 1         0
 2         0
 3         0
 4         0
          ..
 159566    0
 159567    0
 159568    0
 159569    0
 159570    0
 Name: is_profane, Length: 159571, dtype: int64)

In [34]:
# Hold out 20% of the comments for evaluation. Stratifying on y keeps the class ratio the
# same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Two-step model: TF-IDF features from the text (English stopwords dropped), then a
# logistic regression whose class weights compensate for the rarity of profane comments.
profanity_pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
)
# fit() returns the fitted pipeline, so leaving it as the last expression renders it.
profanity_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [37]:
# profanity_pipeline = make_pipeline(
#     TfidfVectorizer(stop_words='english'),
#     MultinomialNB()
# )
# profanity_pipeline.fit(X_train, y_train)

# MultinomialNB was tried first but dropped: on this imbalanced data it gave high precision but low recall for the profane class.

In [38]:
# Predict labels for the held-out test comments.
y_pred = profanity_pipeline.predict(X_test)

In [39]:
# Per-class report first, then overall accuracy, then precision for the positive (profane) class.
print(
    classification_report(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.98      0.95      0.97     28670
           1       0.67      0.85      0.75      3245

    accuracy                           0.94     31915
   macro avg       0.83      0.90      0.86     31915
weighted avg       0.95      0.94      0.95     31915

0.9423781920726931
0.6710462287104623


In [40]:
# Result: recall for class 1 (profane) rises from 0.21 to 0.85 with class-weighted Logistic Regression, which addresses the class-imbalance problem, at the cost of lower precision (0.67) on that class.

In [47]:
# Smoke test: classify one raw comment (the input is a list of texts; one label is returned per text).
profanity_pipeline.predict(["Nice pic"])  # Example usage

array([0])

## Saving the trained model with pickle

In [41]:
import pickle
# Serialise the fitted pipeline (vectoriser + classifier) to disk so it can be reloaded later.
# Only unpickle this file from a trusted location: loading a pickle can execute arbitrary code.
model_filename = 'profanity_model.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(profanity_pipeline, model_file)

In [42]:
import sys
# Show which Python interpreter this kernel is running on (environment check).
print(sys.executable)

c:\python\New folder\spam\Scripts\python.exe
