In [2]:
import pandas as pd
data = pd.read_csv(r"C:\Users\HP\OneDrive\Desktop\ML\Assignment 5\SMSSpamCollection.csv",sep="\t",header=None,names=['Label','Message'])
data

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
from sklearn.preprocessing import LabelEncoder
import string
data['Label'] = LabelEncoder().fit_transform(data['Label'])

data['Message'] = data['Message'].str.lower().str.replace('[{}]'.format(string.punctuation),'',regex=True)
data['Message_Length'] = data['Message'].apply(len)
data['Word_Count'] = data['Message'].apply(lambda x:len(x.split()))

print(data['Message'])
print(data['Message_Length'])
print(data['Word_Count'])

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ü b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: Message, Length: 5572, dtype: object
0       102
1        23
2       149
3        43
4        59
       ... 
5567    152
5568     35
5569     50
5570    124
5571     25
Name: Message_Length, Length: 5572, dtype: int64
0       20
1        6
2       28
3       11
4       13
        ..
5567    30
5568     8
5569     9
5570    26
5571     6
Name: Word_Count,

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(data['Message']).toarray()

import numpy as np
X=np.hstack((X_tfidf,data[['Message_Length','Word_Count']].values))
y=data['Label']

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

nb_model = MultinomialNB()
nb_model.fit(x_train,y_train)
y_pred_nb = nb_model.predict(x_test)
print("Naive Bayes Classification Report :\n ",classification_report(y_test,y_pred_nb))

lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(x_train,y_train)
y_pred_lr = lr_model.predict(x_test)
print("Logistic Regression Classification Report:\n",classification_report(y_test,y_pred_nb))

Naive Bayes Classification Report :
                precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.98      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [18]:
from sklearn.model_selection import cross_val_score

cv_nb = cross_val_score(nb_model,X,y,cv=5)
cv_lr = cross_val_score(lr_model,X,y,cv=5)
print("Naive Bayes Cross Validation Score:",cv_nb)
print("Logistic Regression Cross Validation Score:",cv_lr)

Naive Bayes Cross Validation Score: [0.967713   0.95874439 0.95960503 0.95870736 0.96319569]
Logistic Regression Cross Validation Score: [0.97040359 0.96412556 0.96588869 0.96858169 0.97217235]


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid_nb = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]}
grid_search_nb = GridSearchCV(nb_model,param_grid_nb,cv=5,scoring='f1')
grid_search_nb.fit(x_train,y_train)
print("Best Parameters for Naive Bayes :",grid_search_nb.best_params_)
print("Best cross validation F1 score for Naive Bayes :",grid_search_nb.best_score_)

param_grid_lr = {'C': [0.01, 0.1, 1, 10, 100]}
grid_search_lr = GridSearchCV(lr_model,param_grid_lr,cv=5,scoring='f1')
grid_search_lr.fit(x_train,y_train)
print("Best Parameters for Logistic Regression :",grid_search_lr.best_params_)
print("Best cross validation F1 score for Logistic Regression:",grid_search_lr.best_score_)

Best Parameters for Naive Bayes : {'alpha': 0.5}
Best cross validation F1 score for Naive Bayes : 0.9350003268950896
Best Parameters for Logistic Regression : {'C': 100}
Best cross validation F1 score for Logistic Regression: 0.931820205937853
