In [38]:
##USING LOGISTIC REGRESSION FOR MACHINE LEARNING##
#libraries
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [39]:
# Load the SMS spam CSV and reduce it to the two columns the model needs:
#   message   - the raw message text (source column 'v2')
#   label_num - 0 when the source label 'v1' equals 'ham', 1 for anything else
#
# The frame is built in a single chain instead of adding columns to a
# column-subset of the original frame; that older pattern can emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
#
# NOTE(review): no explicit encoding is passed here, while the MultinomialNB
# section further down reads the same file with encoding='latin-1' - confirm
# which one matches the file on disk.
df = (
    pd.read_csv('spam.csv')
    .rename(columns={'v2': 'message'})
    # Vectorised equivalent of "0 if x == 'ham' else 1"; int64 matches the
    # dtype the element-wise version produced.
    .assign(label_num=lambda frame: frame['v1'].ne('ham').astype('int64'))
    [['message', 'label_num']]
)

In [40]:
# Inputs are the message texts, targets are the numeric labels.
X, y = df['message'], df['label_num']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [41]:
# Text classifier in two stages: TF-IDF vectorisation followed by logistic regression.
pipeline = Pipeline(steps=[
    # Turn each message into a TF-IDF vector, dropping English stop words.
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # Logistic regression with the iteration cap set to 1000.
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [42]:
# Fit the vectoriser and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [43]:
# Score the held-out messages with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Compute every metric first, then print them together.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)   # per-class precision / recall / f1
confusion = confusion_matrix(y_test, y_pred)          # rows: true class, columns: predicted class

print('Accuracy', accuracy)
print('classification_report: \n', report_text)
print('confusion_matrix: \n', confusion)

Accuracy 0.9533492822966507
classification_report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.98      0.66      0.79       219

    accuracy                           0.95      1672
   macro avg       0.97      0.83      0.88      1672
weighted avg       0.95      0.95      0.95      1672

confusion_matrix: 
 [[1450    3]
 [  75  144]]


In [44]:
# Pull the fitted estimator registered under the 'classifier' step.
lr_model = pipeline.named_steps['classifier']

# Show the learned parameters: the shape of the weight matrix and the bias term.
for caption, value in (
    ('Coefficient shape: ', lr_model.coef_.shape),
    ('Intercept: ', lr_model.intercept_),
):
    print(caption, value)

Coefficient shape:  (1, 6904)
Intercept:  [-2.37328217]


In [65]:
##MultinomialNB##
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [66]:
# Read the spam CSV, decoding the file as latin-1.
# NOTE(review): the frame displayed below contains the sequence "ï¿½_", which
# looks like UTF-8 bytes decoded as latin-1 - confirm the file's real encoding.
spam_data = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin-1',
)

In [67]:
# Keep only the label and text columns, under readable names.
# rename() runs first and ignores mapping keys that are not present, so this
# cell can be re-run after spam_data has already been renamed; selecting
# ['v1', 'v2'] first raised KeyError on a second run.
spam_data = spam_data.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]
spam_data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ï¿½_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [58]:
# Split texts and their string labels into train and test parts:
# 20% of the rows are held out, and random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['label'],
    test_size=0.2,
    random_state=42,
)

In [59]:
# Build the pipeline: TF-IDF vectorisation followed by a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    # Convert text to numerical features using TF-IDF, dropping English stop words.
    ('tfidf', TfidfVectorizer(stop_words='english')),
    # Naive Bayes classifier with its default settings.
    ('classifier', MultinomialNB()),
])

In [68]:
# Train on the training split and immediately predict the held-out texts
# (fit() returns the pipeline itself, so the calls can be chained).
predictions = pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy, then the per-class precision / recall / f1 table.
print(
    "Accuracy: ",
    accuracy_score(y_test, predictions),
)
print(
    "\nClassification Report: \n",
    classification_report(y_test, predictions),
)

Accuracy:  0.9659192825112107

Classification Report: 
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

