In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
from tabulate import tabulate

# Load the raw e-mail dataset. low_memory=False makes pandas parse the file in
# a single pass so each column gets one consistently inferred dtype.
df = pd.read_csv("emails.csv",  low_memory=False)

# Give the first (identifier) and last (target) columns fixed names so the
# rest of the pipeline does not depend on the CSV's own header text.
df = df.rename(columns={df.columns[0]: 'Email_Name', df.columns[-1]: 'Label'})

# Features are every column between the identifier and the label; cells that
# cannot be parsed as numbers become NaN and are dropped below.
X = df.iloc[:, 1:-1].apply(pd.to_numeric, errors='coerce')

# Keep the label NaN-capable for now: casting to int here would raise on the
# first unparseable label, before dropna() gets a chance to remove that row.
y = pd.to_numeric(df['Label'], errors='coerce')

# Drop rows with any missing feature or label, then cast the surviving labels
# to int (safe now that no NaN remains).
df_clean = pd.concat([X, y], axis=1).dropna()
X, y = df_clean.iloc[:, :-1], df_clean['Label'].astype(int)

# Stratified 70/30 split with a fixed seed so the class balance is preserved
# in both partitions and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the scaler on the training split only and reuse it for the test split,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# K-nearest-neighbours classifier (k=5, Euclidean distance, all CPU cores).
knn_model = KNeighborsClassifier(n_neighbors=5, metric='euclidean', n_jobs=-1)
knn_model.fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

# Linear-kernel support vector classifier trained on the same scaled features.
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)
y_pred_svm = svm_model.predict(X_test_scaled)

def metrics(y_true, y_pred, name):
    """Build one summary-table row of classification scores for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.
        name: Display name of the model; becomes the first cell of the row.

    Returns:
        A list ``[name, accuracy, error_rate, precision, recall]`` where
        ``error_rate`` is ``1 - accuracy``.
    """
    # Compute accuracy once and derive the error rate from it instead of
    # scoring the same predictions twice.
    accuracy = accuracy_score(y_true, y_pred)
    return [
        name,
        accuracy,
        1 - accuracy,
        # zero_division=0 reports 0 (instead of warning) when a score is undefined.
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0)
    ]

# One table row per fitted model, scored against the same held-out labels.
results = [
    metrics(y_test, predictions, model_name)
    for model_name, predictions in (
        ("K-Nearest Neighbors (KNN)", y_pred_knn),
        ("Support Vector Machine (SVM)", y_pred_svm),
    )
]

# Column titles, in the same order as the values returned by metrics().
headers = ["Model", "Accuracy", "Error Rate", "Precision", "Recall"]
print(
    tabulate(
        results,
        headers=headers,
        tablefmt="fancy_grid",
        floatfmt=".4f",
    )
)

╒══════════════════════════════╤════════════╤══════════════╤═════════════╤══════════╕
│ Model                        │   Accuracy │   Error Rate │   Precision │   Recall │
╞══════════════════════════════╪════════════╪══════════════╪═════════════╪══════════╡
│ K-Nearest Neighbors (KNN)    │     0.8215 │       0.1785 │      0.6301 │   0.9311 │
├──────────────────────────────┼────────────┼──────────────┼─────────────┼──────────┤
│ Support Vector Machine (SVM) │     0.9517 │       0.0483 │      0.9176 │   0.9156 │
╘══════════════════════════════╧════════════╧══════════════╧═════════════╧══════════╛


In [8]:
# Load the raw e-mail dataset. low_memory=False makes pandas parse the file in
# a single pass so each column gets one consistently inferred dtype.
df = pd.read_csv("emails.csv", low_memory=False)

In [12]:
# Give the first and last columns fixed names so later cells do not depend on
# the CSV's own header text.
df = df.rename(
    columns={
        df.columns[0]: 'Email_Name',
        df.columns[-1]: 'Label',
    }
)

In [13]:
# Preview the first rows to confirm the renamed columns and the value layout.
df.head()

Unnamed: 0,Email_Name,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Label
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# (rows, columns) of the loaded frame, including the identifier and label columns.
df.shape

(5172, 3002)

In [15]:
# Feature matrix: every column between the first and the last. Cells that
# cannot be parsed as numbers are turned into NaN rather than raising.
X = (
    df.iloc[:, 1:-1]
    .apply(pd.to_numeric, errors='coerce')
)

In [22]:
y = pd.to_numeric(df['Label'], errors='coerce').astype(int)

In [24]:
df_clean = pd.concat([X, y], axis=1).dropna()
X, y = df_clean.iloc[:, :-1], df_clean['Label']

In [27]:
# Stratified 70/30 split with a fixed seed: class proportions are preserved in
# both partitions and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [33]:
# Learn the scaling parameters from the training split only, then apply that
# same transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [34]:
# K-nearest-neighbours classifier: k=5, Euclidean distance, all CPU cores.
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    metric='euclidean',
    n_jobs=-1,
)
knn_model.fit(X_train_scaled, y_train)

# Predictions on the held-out split, scored later in the summary table.
y_pred_knn = knn_model.predict(X_test_scaled)

In [35]:
# Linear-kernel support vector classifier trained on the scaled features.
svm_model = SVC(
    kernel='linear',
    random_state=42,
)
svm_model.fit(X_train_scaled, y_train)

# Predictions on the held-out split, scored later in the summary table.
y_pred_svm = svm_model.predict(X_test_scaled)

In [36]:
def metrics(y_true, y_pred, name):
    """Build one summary-table row of classification scores for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.
        name: Display name of the model; becomes the first cell of the row.

    Returns:
        A list ``[name, accuracy, error_rate, precision, recall]`` where
        ``error_rate`` is ``1 - accuracy``.
    """
    # Compute accuracy once and derive the error rate from it instead of
    # scoring the same predictions twice.
    accuracy = accuracy_score(y_true, y_pred)
    return [
        name,
        accuracy,
        1 - accuracy,
        # zero_division=0 reports 0 (instead of warning) when a score is undefined.
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0)
    ]

In [40]:
# One table row per fitted model, scored against the same held-out labels.
results = [
    metrics(y_test, predictions, model_name)
    for model_name, predictions in (
        ("K-Nearest Neighbors (KNN)", y_pred_knn),
        ("Support Vector Machine (SVM)", y_pred_svm),
    )
]

In [43]:
# Column titles, in the same order as the values in each results row.
headers = ["Model", "Accuracy", "Error Rate", "Precision", "Recall"]
print(
    tabulate(
        results,
        headers=headers,
        tablefmt="fancy_grid",
        floatfmt=".4f",
    )
)

╒══════════════════════════════╤════════════╤══════════════╤═════════════╤══════════╕
│ Model                        │   Accuracy │   Error Rate │   Precision │   Recall │
╞══════════════════════════════╪════════════╪══════════════╪═════════════╪══════════╡
│ K-Nearest Neighbors (KNN)    │     0.8215 │       0.1785 │      0.6301 │   0.9311 │
├──────────────────────────────┼────────────┼──────────────┼─────────────┼──────────┤
│ Support Vector Machine (SVM) │     0.9517 │       0.0483 │      0.9176 │   0.9156 │
╘══════════════════════════════╧════════════╧══════════════╧═════════════╧══════════╛
