In [34]:
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None
from nltk.corpus import wordnet
import os
import shutil

In [33]:
import os

# Directory this notebook uses for NLTK resources.
custom_nltk_dir = "/kaggle/working/nltk_data"

# Report whether the WordNet corpus directory is present under that location.
wordnet_corpus_path = f"{custom_nltk_dir}/corpora/wordnet"
wordnet_present = os.path.exists(wordnet_corpus_path)
print("WordNet exists:", wordnet_present)


WordNet exists: True


In [15]:

# Directory this notebook uses for NLTK resources.
custom_nltk_dir = "/kaggle/working/nltk_data"

# Register the directory with NLTK only once, so re-running this cell does not
# keep appending duplicate entries to the search path.
if custom_nltk_dir not in nltk.data.path:
    nltk.data.path.append(custom_nltk_dir)

# Ensure the directory exists
os.makedirs(custom_nltk_dir, exist_ok=True)

# Download WordNet manually
nltk.download('wordnet', download_dir=custom_nltk_dir)
nltk.download('omw-1.4', download_dir=custom_nltk_dir)  # Optional for extended WordNet

# Extract if necessary
wordnet_zip_path = os.path.join(custom_nltk_dir, "corpora/wordnet.zip")
wordnet_dir_path = os.path.join(custom_nltk_dir, "corpora/wordnet")

# Unpack only when the archive is there and the corpus directory is not.
# Previously wordnet_dir_path was computed but never consulted, so the archive
# was unpacked again (and the success message printed) on every run.
if os.path.exists(wordnet_zip_path) and not os.path.isdir(wordnet_dir_path):
    shutil.unpack_archive(wordnet_zip_path, os.path.join(custom_nltk_dir, "corpora"))
    print("WordNet extracted successfully!")


[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
WordNet extracted successfully!


In [24]:
# Read the SMS dataset from the Kaggle input directory, decoding it as ISO-8859-1.
df = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding="ISO-8859-1",
)

In [25]:
# Remove the three 'Unnamed' columns so only v1 and v2 remain.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns have already been removed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Lower-case the message text so later token comparisons are case-insensitive.
lowercased_messages = df['v2'].str.lower()
df['v2'] = lowercased_messages

In [27]:
PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once; the original rebuilt it for every message.
_PUNCT_TRANSLATION = str.maketrans('', '', PUNCT_TO_REMOVE)
def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted.

    Characters are deleted rather than replaced by a space, so "don't"
    becomes "dont" and "a,b" becomes "ab".

    Args:
        text: the string to clean.

    Returns:
        A new string without any punctuation characters.
    """
    return text.translate(_PUNCT_TRANSLATION)

# Strip punctuation from every message; the function is passed directly
# instead of being wrapped in a lambda.
df['v2'] = df['v2'].apply(remove_punctuation)

In [28]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS.

    The input is converted with ``str()`` first, split on whitespace, and the
    surviving tokens are re-joined with single spaces.
    """
    # NOTE(review): df["v2"] has already had punctuation removed before this
    # step, so any stopword-list entry containing an apostrophe cannot match
    # its stripped form in the text - confirm whether that is intended.
    kept_tokens = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

df['v2_stopWordRem'] = df["v2"].apply(remove_stopwords)

In [23]:
# for i in range(df.shape[0]):
    # print([word for word in df["v2_stopWordRem"][i].split()])

In [None]:
# # corrected_text = str(TextBlob(text).correct())
# # print(corrected_text)
# # [word for word in df["v2_stopWordRem"][i].split()]
# def spelling_nazi(text):
#     return " ".join([str(TextBlob(word).correct()) for word in text.split()])

# df['spell_checked_v2'] = df["v2_stopWordRem"].apply(lambda text: spelling_nazi(text))

In [30]:
# Download the 'wordnet' package into custom_nltk_dir (same package and target
# as the earlier setup cell); the call's return value is shown as the cell output.
nltk.download('wordnet', download_dir=custom_nltk_dir)

[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
lemmatizer = WordNetLemmatizer()
# First letter of the tag returned by nltk.pos_tag -> WordNet part of speech.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first character of each
    tag is looked up in ``wordnet_map`` and falls back to ``wordnet.NOUN``
    when it is not one of the mapped letters. The lemmas are joined back
    together with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

df["v2_lemmatized"] = df["v2_stopWordRem"].apply(lemmatize_words)

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Transform text data into BoW features
# NOTE(review): the vocabulary is learned from every row of df, including rows
# that a later train/test split assigns to the test set - confirm this is
# acceptable for the evaluation.
X_bow = vectorizer.fit_transform(df['v2_lemmatized'])

# Convert to DataFrame. Reuse df's index so the column-wise concat below pairs
# each message with its own counts; with a fresh 0..n-1 index the concat would
# align on labels and silently misplace rows (introducing NaNs) whenever df's
# index is not the default range, e.g. after rows have been filtered out.
bow_df = pd.DataFrame(
    X_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df.index,
)

# Merge BoW features with original DataFrame
df_bow = pd.concat([df, bow_df], axis=1)

# Drop the text columns so only the label and the token counts remain
# (columns= already selects the axis, so the redundant axis=1 is gone).
df_bow = df_bow.drop(columns=['v2_lemmatized', 'v2_stopWordRem', 'v2'])

# Display result
print(df_bow)


        v1  008704050406  0089my  0121  01223585236  01223585334  0125698789  \
0      ham             0       0     0            0            0           0   
1      ham             0       0     0            0            0           0   
2     spam             0       0     0            0            0           0   
3      ham             0       0     0            0            0           0   
4      ham             0       0     0            0            0           0   
...    ...           ...     ...   ...          ...          ...         ...   
5567  spam             0       0     0            0            0           0   
5568   ham             0       0     0            0            0           0   
5569   ham             0       0     0            0            0           0   
5570   ham             0       0     0            0            0           0   
5571   ham             0       0     0            0            0           0   

      02  020603  0207  ...  ìï  ìïll  

In [41]:
from sklearn.preprocessing import LabelEncoder

# Features are every column of df_bow except the label; the label comes from df.
X = df_bow.drop(['v1'],axis=1)
y = df['v1']

# Encode the string labels as integers for the classifiers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


def _fit_and_score(model):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Returns:
        (predictions, accuracy): the predictions for ``X_test`` and their
        accuracy against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, accuracy_score(y_test, predictions)


# Train Naive Bayes (for classification)
nb_model = MultinomialNB()
y_pred_nb, acc_nb = _fit_and_score(nb_model)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, acc_rf = _fit_and_score(rf_model)

# Train XGBoost
# "logloss" is the binary-classification metric; the previous "mlogloss" is the
# multiclass variant. Assumes v1 holds only the two labels visible in the
# displayed rows (ham / spam) - TODO confirm.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
y_pred_xgb, acc_xgb = _fit_and_score(xgb_model)

# Print results
print(f"Naive Bayes Accuracy: {acc_nb:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy: {acc_xgb:.4f}")


Naive Bayes Accuracy: 0.9785
Random Forest Accuracy: 0.9749
XGBoost Accuracy: 0.9713
