In [39]:
#Import libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [40]:
# Load the raw dataset from the CSV next to the notebook, decoding it as latin-1.
csv_path = "spam.csv"
data = pd.read_csv(csv_path, encoding="latin-1")

# Preview the first rows, then list the column names that were read.
for preview in (data.head(), data.columns):
    print(preview)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [41]:
# Keep only the label and text columns, under descriptive names.
# rename() ignores keys that are not present, so re-running this cell after
# the columns have already been renamed is a no-op; selecting 'v1'/'v2'
# first would raise a KeyError on the second run.
data = data.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']]

# Look at the data again
print(data.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [42]:
# Count the missing entries in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

label      0
message    0
dtype: int64


In [43]:
# Separate the features (message text) from the labels (spam or ham)
X, y = data['message'], data['label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified and the labels are imbalanced;
# consider passing stratify=y so both sets keep the same spam/ham ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train = len(X_train)
n_test = len(X_test)
print(f"Training samples: {n_train}")
print(f"Testing samples: {n_test}")

# Show how many ham / spam messages ended up in the training set
train_label_counts = y_train.value_counts()
print("\nTraining label distribution:")
print(train_label_counts)

Training samples: 4457
Testing samples: 1115

Training label distribution:
label
ham     3860
spam     597
Name: count, dtype: int64


In [44]:
# Build a TF-IDF vectorizer capped at 3000 features, using scikit-learn's
# built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# Fit on the training messages only and encode them in the same call
X_train_vectors = vectorizer.fit_transform(X_train)
# Encode the test messages with the already-fitted vectorizer (no refit)
X_test_vectors = vectorizer.transform(X_test)

# Rows are messages, columns are features
n_messages, n_features = X_train_vectors.shape
print(f"Vector shape: {X_train_vectors.shape}")
print(f"This means: {n_messages} messages, {n_features} features")

Vector shape: (4457, 3000)
This means: 4457 messages, 3000 features


In [45]:
# Fit a logistic-regression classifier on the TF-IDF training vectors.
# max_iter is raised to 1000 to give the solver more iterations to converge.
print("Training the model..")
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train_vectors, y=y_train)
print("Training Complete!")

Training the model..
Training Complete!


In [46]:
# Make predictions on the held-out test data
predictions = model.predict(X_test_vectors)

# Overall accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Accuracy alone is misleading here: ham makes up roughly 87% of the training
# labels, so a model that always answers "ham" would already score highly.
# Report per-class precision / recall / F1 so the spam class is visible.
print("\nClassification report:")
print(classification_report(y_test, predictions, digits=4))

# Confusion matrix labelled with the classes the fitted model knows about,
# so the rows and columns can be read without guessing the label order.
class_names = list(model.classes_)
confusion = pd.DataFrame(
    confusion_matrix(y_test, predictions, labels=class_names),
    index=[f"actual {name}" for name in class_names],
    columns=[f"predicted {name}" for name in class_names],
)
print("Confusion matrix:")
print(confusion)

Accuracy: 96.41%
