In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [7]:
# Location of the spam dataset CSV. The /content prefix looks like a hosted
# notebook working directory -- TODO confirm and adjust when running elsewhere.
file_path = '/content/spam.csv'
# Decode the file as latin-1 instead of relying on read_csv's default encoding.
data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin-1')

In [8]:
# Report the (rows, columns) dimensions of the loaded frame.
dimensions = data.shape
print(dimensions)

(5572, 5)


In [9]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB
None


In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of the wrapped plain-text repr
# that print() produces for wide frames.
data.head()

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [10]:
# Keep only the label and message columns under descriptive names.
# - rename() ignores keys that are not present (its default), so re-running
#   this cell after the columns were already renamed is a no-op rather than a
#   KeyError on 'v1'.
# - copy() makes `data` an independent frame, so assigning to its columns in
#   later cells does not trigger pandas' SettingWithCopyWarning.
data = (
    data
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .loc[:, ['label', 'text']]
    .copy()
)


In [11]:
# Fit the encoder on the string labels, then overwrite the column with the
# resulting integer codes. The fitted encoder is kept so the codes can be
# mapped back to the original label strings later.
label_encoder = LabelEncoder().fit(data['label'])
data['label'] = label_encoder.transform(data['label'])


In [12]:
# Keywords whose occurrence counts form the feature vector of each message.
keywords = ["free", "win", "winner", "cash", "prize", "click", "offer", "money"]


def extract_keyword_features(texts, keywords):
    """Count keyword occurrences in each text, case-insensitively.

    Counting is done with ``str.count`` on the lower-cased text, i.e. it
    counts non-overlapping *substring* matches. A keyword such as "win" is
    therefore also counted inside longer words like "winner".

    Args:
        texts: Iterable of strings.
        keywords: Sequence of substrings to count. Texts are lower-cased
            before counting, so keywords containing upper-case letters never
            match.

    Returns:
        A list with one row per text; each row is a list of counts in the
        same order as ``keywords``.
    """
    return [
        [lowered.count(keyword) for keyword in keywords]
        for lowered in (text.lower() for text in texts)
    ]


def train_perceptron(X_train, y_train, learning_rate=0.1, epochs=200):
    """Train a single-layer perceptron with the online update rule.

    Samples are visited in the given order; after each sample the weights
    and bias are moved by ``learning_rate * (target - prediction)``.

    Args:
        X_train: 2-D array of shape (n_samples, n_features).
        y_train: Sequence of 0/1 targets, one per row of ``X_train``.
        learning_rate: Step size of each update.
        epochs: Maximum number of passes over the training data.

    Returns:
        Tuple ``(weights, bias)`` with ``weights`` a 1-D float array of
        length n_features.
    """
    weights = np.zeros(X_train.shape[1])
    bias = 0.0

    for _ in range(epochs):
        mistakes = 0
        for sample, target in zip(X_train, y_train):
            prediction = 1 if np.dot(weights, sample) + bias > 0 else 0
            error = target - prediction
            # A correct prediction gives error == 0, for which the update is
            # a no-op; skip the arithmetic.
            if error != 0:
                weights += learning_rate * error * sample
                bias += learning_rate * error
                mistakes += 1
        # An epoch without mistakes means no later epoch can change the
        # parameters either, so stopping here yields the same model.
        if mistakes == 0:
            break

    return weights, bias


def predict_perceptron(X, weights, bias):
    """Return the 0/1 perceptron prediction for every row of ``X``.

    Rows are scored one at a time with ``np.dot`` (the same operation used
    during training) rather than with a single matrix product, so that
    floating-point rounding near the decision threshold of 0 is identical.

    Args:
        X: 2-D array of shape (n_samples, n_features).
        weights: 1-D array of length n_features.
        bias: Scalar offset added to each score.

    Returns:
        List of ints (0 or 1), one per row of ``X``.
    """
    return [1 if np.dot(weights, row) + bias > 0 else 0 for row in X]


features = extract_keyword_features(data['text'], keywords)
X = np.array(features)
y = data['label'].values

# Evaluation protocol and hyperparameters. They are the same for every fold,
# so they are set once here instead of inside the fold loop.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
learning_rate = 0.1
epochs = 200
all_accuracies = []

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # A fresh model is trained from zero weights for every fold.
    weights, bias = train_perceptron(X_train, y_train, learning_rate, epochs)
    y_pred = predict_perceptron(X_test, weights, bias)

    accuracy = accuracy_score(y_test, y_pred)
    all_accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy}")

average_accuracy = np.mean(all_accuracies)
print("Average Accuracy:", average_accuracy)



Fold 1 Accuracy: 0.8869955156950673
Fold 2 Accuracy: 0.8816143497757848
Fold 3 Accuracy: 0.8931777378815081
Fold 4 Accuracy: 0.9201077199281867
Fold 5 Accuracy: 0.8761220825852782
Average Accuracy: 0.8916034811731649
