In [16]:
# Importing libraries 

In [17]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import joblib





In [18]:
# Download the NLTK data this notebook needs:
#   - 'stopwords' for the English stop-word list
#   - 'punkt' / 'punkt_tab' for word_tokenize (which name is required depends on
#     the installed NLTK version; an id unknown to that version is reported by
#     nltk.download and skipped, it does not raise)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)


[nltk_data] Downloading package stopwords to C:\Users\ryzen
[nltk_data]     5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Read the raw CSV (latin1-encoded) and discard the three unnamed filler columns
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = pd.read_csv("spam.csv", encoding='latin1')
df = data.drop(columns=unused_columns)


In [20]:
# Replace ham with 0 and spam with 1 -- in the label column 'v1' only.
# A frame-wide replace would also rewrite any message whose entire text is
# "ham" or "spam". The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['v1'].map(label_codes)
if encoded_labels.isna().any():
    # Fail loudly instead of silently passing unknown labels to the classifiers.
    raise ValueError("Unexpected label in column 'v1'; expected only 'ham' or 'spam'")
df = df.assign(v1=encoded_labels.astype(int))


In [21]:
# Define a function for text preprocessing
stop_words = set(stopwords.words('english'))
# Built once: translate() with this table deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one message before TF-IDF vectorisation.

    Lower-cases the text, removes ASCII punctuation, tokenises it with
    ``word_tokenize``, drops English stop words and joins the remaining
    tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and NaN (missing CSV cells) are treated as an
        empty message; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned, space-separated message (may be empty).
    """
    # Missing values: None, or float NaN (the only float not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

In [22]:
# Clean every message in the 'v2' column with preprocess_text
cleaned_messages = df['v2'].map(preprocess_text)
df['v2'] = cleaned_messages

In [23]:
# Features (X) are the cleaned messages, labels (y) the encoded classes
X, y = df['v2'], df['v1']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF: vocabulary and IDF weights are learned from the training split only,
# then the same fitted vectorizer is applied to the test split
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Baseline Multinomial Naive Bayes: train on the training vectors ...
classifier = MultinomialNB()
classifier.fit(X_train_vect, y_train)

# ... and predict labels for the held-out vectors
y_pred = classifier.predict(X_test_vect)


In [24]:
# Per-class precision / recall / F1 of the baseline predictions on the test split
report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [25]:
# Chain TF-IDF vectorisation and Multinomial Naive Bayes into one estimator,
# so the pipeline can be fitted on and applied to message text directly
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [26]:
# Hyperparameter grid; keys follow the "<step name>__<parameter>" convention
parameters = dict(
    tfidf__max_df=(0.25, 0.5, 0.75),
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__alpha=(0.1, 0.5, 1.0),
)


In [27]:
# Exhaustive search over `parameters` with 5-fold cross-validation,
# using all available cores and printing a progress summary
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split (the pipeline vectorises the text itself)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'clf__alpha': (0.1, 0.5, 1.0),
                         'tfidf__max_df': (0.25, 0.5, 0.75),
                         'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]},
             verbose=1)

In [28]:
# Show the parameter combination with the best mean cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")

Best Parameters: {'clf__alpha': 0.1, 'tfidf__max_df': 0.25, 'tfidf__ngram_range': (1, 2)}


In [29]:
# Pipeline refitted by the grid search with the best parameter combination
best_model = grid_search.best_estimator_

# Predict the held-out messages; note this rebinds y_pred (previously the
# baseline predictions) to the tuned pipeline's predictions
y_pred = best_model.predict(X=X_test)

In [30]:
# Heading and per-class report for the current y_pred, one per line
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
#The classification reports indicate the performance of the model before and after hyperparameter tuning using grid search.

#Before Hyperparameter Tuning:

#Precision for class 0: 0.96
#Recall for class 0: 1.00
#F1-score for class 0: 0.98
#Precision for class 1: 1.00
#Recall for class 1: 0.76
#F1-score for class 1: 0.86
#Accuracy: 0.97
#After Hyperparameter Tuning:

#Precision for class 0: 0.98
#Recall for class 0: 1.00
#F1-score for class 0: 0.99
#Precision for class 1: 0.99
#Recall for class 1: 0.88
#F1-score for class 1: 0.93
#Accuracy: 0.98
#After hyperparameter tuning, recall for class 1 (spam) improved from 0.76 to 0.88 and its
#F1-score from 0.86 to 0.93, while spam precision dropped slightly (1.00 -> 0.99).
#Accuracy increased from 0.97 to 0.98, so overall the model performance improved after tuning.

In [31]:
# List every test message whose prediction in y_pred differs from its true label
is_wrong = (y_test != y_pred)
misclassified_indices = y_test[is_wrong].index
misclassified_messages = X_test.loc[misclassified_indices]
print("Misclassified messages:")
# .items() yields (original row label, cleaned message) pairs
for row_label, message in misclassified_messages.items():
    print(f"Message {row_label}: {message}")

Misclassified messages:
Message 683: hi im sue 20 years old work lapdancer love sex text live im bedroom text sue 89555 textoperator g2 1da 150ppmsg 18
Message 2312: tddnewsletteremc1couk games thedailydraw dear helen dozens free games great prizeswith
Message 3979: ringtoneking 84484
Message 1268: u get 2 phone wan na chat 2 set meet call 09096102316 u cum 2moro luv jane xx callså£1minmoremobsemspobox45po139wa
Message 730: email alertfrom jeri stewartsize 2kbsubject lowcost prescripiton drvgsto listen email call 123
Message 2662: hello darling today would love chat dont tell look like sexy
Message 4417: get free call
Message 4296: thesmszonecom lets send free anonymous masked messagesim sending message theredo see potential abuse
Message 1468: hi lucy hubby meetins day fri b alone hotel u fancy cumin pls leave msg 2day 09099726395 lucy x callså£1minmobsmorelkpobox177hp51fl
Message 787: ever thought living good life perfect partner txt back name age join mobile community 100psms
Messag

In [32]:
# Fit and evaluate a Decision Tree classifier on the TF-IDF vectors
dt = DecisionTreeClassifier(random_state=50)
dt.fit(X_train_vect, y_train)

# Predict once and reuse the result for every metric below
y_pred_dt = dt.predict(X_test_vect)
cm = confusion_matrix(y_test, y_pred_dt)
print("Decision Tree Classifier:")
print(cm)
print("Accuracy : %0.5f \n\n" % accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))



Decision Tree Classifier:
[[949  16]
 [ 31 119]]
Accuracy : 0.95785 


              precision    recall  f1-score   support

           0       0.97      0.98      0.98       965
           1       0.88      0.79      0.84       150

    accuracy                           0.96      1115
   macro avg       0.92      0.89      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [33]:
# Fit and evaluate a RandomForestClassifier on the TF-IDF vectors.
# random_state is fixed so the forest (and the numbers printed below) are
# reproducible from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vect, y_train)

# Predict once and reuse the result for every metric below
y_pred_rf = rf.predict(X_test_vect)
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Classifier:")
print(cm_rf)
print("Accuracy : %0.5f \n\n" % accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))




Random Forest Classifier:
[[965   0]
 [ 28 122]]
Accuracy : 0.97489 


              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115



In [34]:
# Fit and evaluate a VotingClassifier using Logistic Regression and SVC
# (default voting mode, i.e. majority vote on predicted labels).
# NOTE(review): with only two voters every disagreement is a tie; scikit-learn
# documents that ties are resolved in ascending class-label order, i.e. toward
# 0 (ham) here. Consider an odd number of voters or voting='soft' -- confirm
# before relying on this ensemble's spam recall.
classifier1 = LogisticRegression()
classifier2 = SVC()
ensemble_classifier = VotingClassifier(estimators=[('lr', classifier1), ('svm', classifier2)])
ensemble_classifier.fit(X_train_vect, y_train)

# Predict once and reuse the result for both metrics
y_pred_vc = ensemble_classifier.predict(X_test_vect)
cm_vc = confusion_matrix(y_test, y_pred_vc)
print("Voting Classifier:")
print(cm_vc)
print("Accuracy : %0.5f \n\n" % accuracy_score(y_test, y_pred_vc))



Voting Classifier:
[[963   2]
 [ 51  99]]
Accuracy : 0.95247 




In [35]:
# 5-fold cross-validation of the random forest on the training vectors;
# one score per fold is returned
scores = cross_val_score(estimator=rf, X=X_train_vect, y=y_train, cv=5)
print("Cross-validation scores (Random Forest Classifier):", scores)


Cross-validation scores (Random Forest Classifier): [0.97421525 0.9764574  0.9708193  0.9640853  0.9708193 ]


In [37]:
# Fit and evaluate an SVC classifier with balanced class weights
# (class_weight='balanced' re-weights classes inversely to their frequency)
svc_classifier = SVC(class_weight='balanced')
svc_classifier.fit(X_train_vect, y_train)

# Predict once and reuse the result for both metrics
y_pred_svc = svc_classifier.predict(X_test_vect)
cm_svc = confusion_matrix(y_test, y_pred_svc)
print("SVC Classifier:")
print(cm_svc)
print("Accuracy : %0.5f \n\n" % accuracy_score(y_test, y_pred_svc))

SVC Classifier:
[[962   3]
 [ 24 126]]
Accuracy : 0.97578 




In [38]:
#To calculate the final accuracy of each model, we need to follow these steps:

#Train each model on the training data.
#Evaluate each model's performance on the testing data.
#Calculate the accuracy for each model.

In [39]:

# Evaluate the model performance
# y_pred holds the most recent assignment to that name; when the notebook is
# run top to bottom these are the tuned Naive Bayes pipeline's predictions.
report = classification_report(y_test, y_pred)
print("Multinomial Naive Bayes Classifier Report:\n", report)

# Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=50)
dt_classifier.fit(X_train_vect, y_train)
y_pred_dt = dt_classifier.predict(X_test_vect)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Classifier Accuracy:", accuracy_dt)

# RandomForestClassifier (seeded so the reported accuracy is reproducible)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_vect, y_train)
y_pred_rf = rf_classifier.predict(X_test_vect)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("RandomForestClassifier Accuracy:", accuracy_rf)

# VotingClassifier using Logistic Regression and SVC
# The SVC member gets its own name on purpose: VotingClassifier fits *clones*
# of its estimators, so the object passed in stays unfitted. Binding it to
# `svc_classifier` would replace the fitted balanced SVC with an unfitted one,
# which is what the save cell would then persist.
log_reg_classifier = LogisticRegression()
voting_svc = SVC()
voting_classifier = VotingClassifier(estimators=[('lr', log_reg_classifier), ('svm', voting_svc)])
voting_classifier.fit(X_train_vect, y_train)
y_pred_voting = voting_classifier.predict(X_test_vect)
accuracy_voting = accuracy_score(y_test, y_pred_voting)

print("VotingClassifier Accuracy:", accuracy_voting)


Multinomial Naive Bayes Classifier Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Decision Tree Classifier Accuracy: 0.957847533632287
RandomForestClassifier Accuracy: 0.9748878923766816
VotingClassifier Accuracy: 0.9524663677130045


In [40]:
# Save the trained model
# Persist a fitted, self-contained artifact: the TF-IDF vectorizer and the
# class-balanced SVC are bundled in one pipeline and fitted on the training
# messages here, so the saved object does not depend on which estimator the
# name `svc_classifier` currently refers to, and a loader can call
# .predict() directly on text cleaned with preprocess_text.
spam_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(class_weight='balanced')),
])
spam_model.fit(X_train, y_train)
joblib.dump(spam_model, 'spam_classifier_model.pk22')

['spam_classifier_model.pk22']

In [None]:
# End-to-end version of the workflow in a single cell. It defines every name it
# uses apart from the imported libraries, so it assumes only that the import
# cell has been run.

# Download NLTK data: the stop-word list plus the tokenizer data used by
# word_tokenize ('punkt' / 'punkt_tab' -- which one is needed depends on the
# installed NLTK version; an unknown id is reported and skipped, not raised)
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load data
data = pd.read_csv("spam.csv", encoding='latin1')
df = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

# Replace ham with 0 and spam with 1 -- in the label column only, so a message
# whose entire text is "ham" or "spam" is left untouched
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})
if df['v1'].isna().any():
    raise ValueError("Unexpected label in column 'v1'; expected only 'ham' or 'spam'")
df['v1'] = df['v1'].astype(int)

# Define a function for text preprocessing
stop_words = set(stopwords.words('english'))
# Built once: translate() with this table deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Lower-case, strip ASCII punctuation, tokenise and drop English stop words.

    ``None`` and NaN are treated as an empty message; other non-string values
    are converted with ``str``. Returns the remaining tokens joined by spaces.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)


def show_evaluation(title, y_true, y_hat, with_report=False):
    """Print the confusion matrix and accuracy (optionally the full report).

    Returns the confusion matrix so callers can keep it.
    """
    matrix = confusion_matrix(y_true, y_hat)
    print(title)
    print(matrix)
    print("Accuracy : %0.5f \n\n" % accuracy_score(y_true, y_hat))
    if with_report:
        print(classification_report(y_true, y_hat))
    return matrix


# Apply text preprocessing to the 'v2' column
df['v2'] = df['v2'].apply(preprocess_text)

# Split the dataset into features (X) and labels (y)
X = df['v2']
y = df['v1']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF: fit on the training split only, then transform both splits
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Baseline Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vect, y_train)
y_pred = classifier.predict(X_test_vect)

# Evaluate the baseline model
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Define a pipeline with TF-IDF vectorizer and Multinomial Naive Bayes classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define hyperparameters to tune
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__alpha': (0.1, 0.5, 1.0)
}

# Perform grid search with cross-validation (5-fold) on the training data
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model on the testing data (rebinds y_pred)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Misclassified messages (labels are the original row indices)
misclassified_indices = y_test[y_test != y_pred].index
misclassified_messages = X_test.loc[misclassified_indices]
print("Misclassified messages:")
for row_label, message in misclassified_messages.items():
    print(f"Message {row_label}: {message}")

# Fit and evaluate a Decision Tree classifier
dt = DecisionTreeClassifier(random_state=50)
dt.fit(X_train_vect, y_train)
y_pred_dt = dt.predict(X_test_vect)
cm = show_evaluation("Decision Tree Classifier:", y_test, y_pred_dt, with_report=True)

# Fit and evaluate a RandomForestClassifier (seeded for reproducible results)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vect, y_train)
y_pred_rf = rf.predict(X_test_vect)
cm_rf = show_evaluation("Random Forest Classifier:", y_test, y_pred_rf, with_report=True)

# Fit and evaluate a VotingClassifier using Logistic Regression and SVC
# NOTE(review): with two hard voters every disagreement is a tie, which
# scikit-learn resolves in ascending class-label order (toward 0 / ham here).
classifier1 = LogisticRegression()
classifier2 = SVC()
ensemble_classifier = VotingClassifier(estimators=[('lr', classifier1), ('svm', classifier2)])
ensemble_classifier.fit(X_train_vect, y_train)
y_pred_vc = ensemble_classifier.predict(X_test_vect)
cm_vc = show_evaluation("Voting Classifier:", y_test, y_pred_vc)

# Cross-validation scores for RandomForestClassifier
scores = cross_val_score(rf, X_train_vect, y_train, cv=5)
print("Cross-validation scores (Random Forest Classifier):", scores)

# Fit and evaluate an SVC classifier with balanced class weights
svc_classifier = SVC(class_weight='balanced')
svc_classifier.fit(X_train_vect, y_train)
y_pred_svc = svc_classifier.predict(X_test_vect)
cm_svc = show_evaluation("SVC Classifier:", y_test, y_pred_svc)
