In [218]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns   


In [219]:
df = pd.read_csv("fixed_balanced_dataset.csv")
df

Unnamed: 0,text,label_spam,label_obscene,target
0,DESIGNER INTERNSHIP â€” 70000 PER HOUR. GU...,1,0,1
1,Sharing my portfolio: portfolio.example.com/De...,0,0,0
2,Kyl* R*by \n\nS*xy c*n *\nKyl* r*by *s th* s*x...,0,1,1
3,Python internship â€” stipend â‚¹70000. Guara...,1,0,1
4,Resume attached. Kindly review and revert. ðŸ¤‘ ...,0,0,0
...,...,...,...,...
1345,Could you please confirm if it is remote or on...,0,0,0
1346,Can you share the detailed job description for...,0,0,0
1347,Can you share the detailed job description for...,0,0,0
1348,Resume attached. Kindly review and revert. ðŸ™‚,0,0,0


In [220]:
df.shape

(1350, 4)

In [221]:
# replace masked profanity automatically
def unmask_profane_words(text):
    """Replace known asterisk-masked profanities in ``text`` with the full word.

    Matching is plain substring replacement (no word boundaries). Masks are
    applied longest first so that a mask containing a shorter one
    (``"mf***"`` contains ``"f***"``, ``"b******"`` contains ``"b****"``)
    is resolved before the shorter mask can clobber it.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        ``text`` with every occurrence of a known mask replaced.
    """
    mapping = {
        "f***": "fuck",
        "s***": "shit",
        "b******": "bastard",
        "a******": "asshole",
        "d***": "dumb",
        "h***": "hell",
        # Previously a second "b******" key, which silently overwrote "bastard".
        # Mask length follows the one-star-per-hidden-letter pattern of most
        # other entries -- TODO confirm against the masking used in the dataset.
        "b****": "bitch",
        "n****": "nigger",
        "c***": "cunt",
        "p***": "piss",
        "t***": "twat",
        "w****": "whore",
        "i*****": "idiot",
        "mf***": "motherfucker",
    }

    # sorted() is stable, so masks of equal length keep their dict order.
    for masked in sorted(mapping, key=len, reverse=True):
        text = text.replace(masked, mapping[masked])
    return text


df['text'] = df['text'].apply(unmask_profane_words)


### Creating a new column `inappropriate` that combines the spam and obscene labels; this will be the main target feature

In [222]:
# A comment is inappropriate when it is spam OR obscene. Comparing the sum with 0
# keeps the label binary even when a row carries both flags (plain addition gives 2).
df['inappropriate'] = ((df['label_spam'] + df['label_obscene']) > 0).astype('int64')

In [223]:
df.sample(5)

Unnamed: 0,text,label_spam,label_obscene,target,inappropriate
589,am i applicable for this job?,0,0,0,0
990,What is the stipend or salary range for this p...,0,0,0,0
466,Can you share the detailed job description for...,0,0,0,0
12,Resume attached. Kindly review and revert. ðŸš€ T...,0,0,0,0
351,Y**'r* w*tch*ng D*rty J*bs t** * g**ss? H*h*...,0,1,1,1


In [224]:
df['inappropriate'].value_counts()

inappropriate
1    675
0    675
Name: count, dtype: int64

## 1. Data Cleaning

In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1350 entries, 0 to 1349
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           1350 non-null   object
 1   label_spam     1350 non-null   int64 
 2   label_obscene  1350 non-null   int64 
 3   target         1350 non-null   int64 
 4   inappropriate  1350 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 52.9+ KB


In [226]:
df.duplicated().sum()

np.int64(0)

In [227]:
df.drop_duplicates(inplace=True)

In [228]:
df.duplicated().sum()

np.int64(0)

In [229]:
df.shape

(1350, 5)

In [230]:
import nltk
# 'punkt' backs nltk.word_tokenize; 'stopwords' backs
# nltk.corpus.stopwords.words('english'), which clean_text needs at run time.
# Without the second download a fresh environment raises LookupError.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\karti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [231]:
df.drop(columns = ['label_spam', 'label_obscene', 'target'], inplace=True)

## Text preprocessing
1. lower case
2. tokenization
3. removing special characters
4. removing stop words and punctuation
5. stemming

In [232]:
# from nltk.corpus import stopwords
# stopwords.words('english')
# import string
# string.punctuation
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()





# def transform_text(text):
#     text = text.lower()
#     text = nltk.word_tokenize(text)
#     y = []  
#     for i in text:
#         if i.isalnum():
#             y.append(i)
   
    
#     text = y[:]
#     y.clear()
#     for i in text:
#         if i not in stopwords.words('english') and i not in string.punctuation:
#             y.append(i)
   

#     text = y[:]
#     y.clear()
#     for i in text:
#         y.append(ps.stem(i))
#     return " ".join(y)

In [233]:
import re
from nltk.corpus import stopwords
stopwords.words('english')
import string
string.punctuation

_ENGLISH_STOPWORDS = None  # cache, populated on the first clean_text call


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise a raw comment into a space-separated string of lowercase tokens.

    Steps, in order: cast to ``str`` and lowercase; delete substrings starting
    with ``http`` or ``www`` up to the next whitespace; delete every character
    that is not an ASCII letter, digit or whitespace; drop tokens found in
    NLTK's English stopword list.

    Parameters
    ----------
    text : object
        Raw comment; converted with ``str()`` first, so non-string values
        (e.g. NaN) are cleaned as their string representation.

    Returns
    -------
    str
        Remaining tokens joined by single spaces (possibly the empty string).
    """
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)     # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)     # keep only letters, digits and whitespace

    # The stopword lookup is hoisted out of the per-token loop: calling
    # stopwords.words() for every token repeated the same work on each word.
    stop_words = _english_stopwords()

    # split() already collapses runs of whitespace, and the regex above leaves
    # no punctuation behind, so no separate whitespace/punctuation pass is needed.
    return ' '.join(word for word in text.split() if word not in stop_words)

df["transformed_text"] = df["text"].apply(clean_text)
# df["transformed_text"] = df["clean_text"].apply(transform_text)

In [234]:
clean_text("This is A loved Sample!!!! 1 TEXTðŸ¤‘.")

'loved sample 1 text'

In [235]:
# df['transformed_text'] = df['text'].apply(transform_text)

In [236]:
df.head()

Unnamed: 0,text,inappropriate,transformed_text
0,DESIGNER INTERNSHIP â€” 70000 PER HOUR. GU...,1,designer internship 70000 per hour guaranteed ...
1,Sharing my portfolio: portfolio.example.com/De...,0,sharing portfolio portfolioexamplecomdesigner ...
2,Kyl* R*by \n\nS*xy c*n *\nKyl* r*by *s th* s*x...,1,kyl rby sxy cn kyl rby th sxst mn lv nd h gts ...
3,Python internship â€” stipend â‚¹70000. Guara...,1,python internship stipend 70000 guaranteed job...
4,Resume attached. Kindly review and revert. ðŸ¤‘ ...,0,resume attached kindly review revert


## Model building  

In [237]:
# Split the cleaned text and its label into training and test sets.


from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows; stratify keeps the label ratio of
# `inappropriate` the same in both splits; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["transformed_text"], df["inappropriate"], test_size=0.2, random_state=42, stratify=df["inappropriate"]
)


In [238]:
# Convert the text to TF-IDF feature vectors (terms are weighted by how
# informative they are rather than by raw counts).
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) extracts unigrams and bigrams; max_features=10000 caps the
# vocabulary size; stop_words="english" applies scikit-learn's built-in English list.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1,2), max_features=10000)
# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [253]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score
 #gnb = GaussianNB()  not recommended for sparse data like TF-IDF(it only works for dense numpy arrays)
mnb = MultinomialNB()
bnb = BernoulliNB()

In [255]:
# Fit Multinomial NB on the TF-IDF training matrix.
mnb.fit(X_train_tfidf, y_train)
# Column 1 of predict_proba is the probability of the second entry in
# mnb.classes_ (label 1, assuming the labels are only 0 and 1).
y_prob = mnb.predict_proba(X_test_tfidf)[:, 1]
# Flag a comment as class 1 only when its probability reaches this cut-off,
# instead of using mnb.predict().
threshold = 0.65
y_pred2 = (y_prob >= threshold).astype(int)
# Accuracy, confusion matrix, precision and the per-class report on the test split.
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(precision_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

0.9518518518518518
[[135   0]
 [ 13 122]]
1.0
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       135
           1       1.00      0.90      0.95       135

    accuracy                           0.95       270
   macro avg       0.96      0.95      0.95       270
weighted avg       0.96      0.95      0.95       270



In [256]:
# Fit Bernoulli NB on the same TF-IDF training matrix.
bnb.fit(X_train_tfidf, y_train)
# Column 1 of predict_proba is the probability of the second entry in
# bnb.classes_ (label 1, assuming the labels are only 0 and 1).
y_prob1 = bnb.predict_proba(X_test_tfidf)[:, 1]
# Same probability cut-off as used for the Multinomial NB evaluation.
threshold = 0.65
y_pred3 = (y_prob1 >= threshold).astype(int)
# Accuracy, confusion matrix, precision and the per-class report on the test split.
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print(precision_score(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

0.8481481481481481
[[135   0]
 [ 41  94]]
1.0
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       135
           1       1.00      0.70      0.82       135

    accuracy                           0.85       270
   macro avg       0.88      0.85      0.84       270
weighted avg       0.88      0.85      0.84       270



### Of the two models we go with Multinomial NB: both reach a precision of 1.0, but Multinomial NB has the higher accuracy (0.95 vs 0.85) and recall (0.90 vs 0.70)

# PREDICTION TIME

In [263]:
def predict(text, threshold=0.65):
    """Classify a single raw comment with the fitted Multinomial NB model.

    The text goes through the same ``clean_text`` + ``tfidf`` pipeline as the
    training data. The decision uses a probability cut-off rather than
    ``mnb.predict`` so that it matches the rule the model was evaluated with
    (``threshold = 0.65`` in the evaluation cells).

    Parameters
    ----------
    text : str
        Raw, uncleaned comment.
    threshold : float, default 0.65
        Minimum predicted probability of class 1 required to flag the comment.

    Returns
    -------
    str
        ``"SPAM/PROFANITY"`` when the probability of class 1 is at least
        ``threshold``, otherwise ``"CLEAN"``.
    """
    cleaned = clean_text(text)
    vectorized = tfidf.transform([cleaned])
    # Look up the column of label 1 explicitly instead of assuming its position.
    positive_col = list(mnb.classes_).index(1)
    prob_inappropriate = mnb.predict_proba(vectorized)[0, positive_col]
    return "SPAM/PROFANITY" if prob_inappropriate >= threshold else "CLEAN"

predict("earn money â€” message me on WhatsApp wa.me/xxxx")



'SPAM/PROFANITY'