In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
#load dataset
data_path = r'C:\Users\PRINTER WORLD\Desktop\spam.csv'

try:
    data = pd.read_csv(data_path, encoding='latin1')
except FileNotFoundError:
    print(f"File not found at {data_path}. Please check the path.")
    exit()
except UnicodeDecodeError as e:
    print(f"Unicode decode error: {e}")
    exit()

In [3]:
print("Data preview:")
print(data.head())
print("Columns in the dataset:", data.columns)

Data preview:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Columns in the dataset: Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [4]:
data.columns = ['Label', 'Message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

In [5]:
#prepare data
X = data['Message']  #SMS messages
y = data['Label']    #labels (spam or legitimate)

In [6]:
#check missing values
if X.isnull().any() or y.isnull().any():
    print("There are missing values in the dataset.")
    exit()

In [7]:
vectorizer = TfidfVectorizer(max_features=3000)
X_tfidf = vectorizer.fit_transform(X)

In [8]:
#split data in training and validation
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [9]:
# choose and train a classifier
model = LogisticRegression()

try:
    model.fit(X_train, y_train)
except ValueError as e:
    print(f"Error during model fitting: {e}")
    exit()

In [10]:
#predictions and evaluation
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

Accuracy: 0.9632286995515695
Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115

