## 0. Import modules needed

In [1]:
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# Now import from data
from data.json import read_json

## 1. Implement the function to transform the ds into a dataframe

In [2]:
def build_dataframe(emails):
    rows = []
    for email in emails:
        full_text = f"{email.subject} {email.content}"  # Combine fields if needed
        rows.append({"text": full_text, "label": email.label})
    return pd.DataFrame(rows)

In [3]:
data = read_json("../dataset/emails.json", True)

# Step 1: Build DataFrame
df = build_dataframe(data)

In [4]:
# Step 2: Vectorize the text content
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"])
y = df["label"]

In [5]:
# Step 3: Split and run LazyPredict
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
clf = LazyClassifier(verbose=1)
models, predictions = clf.fit(X_train.toarray(), X_test.toarray(), y_train, y_test)

  0%|          | 0/32 [00:00<?, ?it/s]

{'Model': 'AdaBoostClassifier', 'Accuracy': 1.0, 'Balanced Accuracy': np.float64(1.0), 'ROC AUC': None, 'F1 Score': 1.0, 'Time taken': 0.20032930374145508}
{'Model': 'BaggingClassifier', 'Accuracy': 1.0, 'Balanced Accuracy': np.float64(1.0), 'ROC AUC': None, 'F1 Score': 1.0, 'Time taken': 0.06142568588256836}
{'Model': 'BernoulliNB', 'Accuracy': 1.0, 'Balanced Accuracy': np.float64(1.0), 'ROC AUC': None, 'F1 Score': 1.0, 'Time taken': 0.01731276512145996}
{'Model': 'CalibratedClassifierCV', 'Accuracy': 1.0, 'Balanced Accuracy': np.float64(1.0), 'ROC AUC': None, 'F1 Score': 1.0, 'Time taken': 6.959654808044434}
{'Model': 'DecisionTreeClassifier', 'Accuracy': 1.0, 'Balanced Accuracy': np.float64(1.0), 'ROC AUC': None, 'F1 Score': 1.0, 'Time taken': 0.023409605026245117}
{'Model': 'DummyClassifier', 'Accuracy': 0.22, 'Balanced Accuracy': np.float64(0.2), 'ROC AUC': None, 'F1 Score': 0.07934426229508197, 'Time taken': 0.018862009048461914}
{'Model': 'ExtraTreeClassifier', 'Accuracy': 1.0, 

In [7]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,1.0,1.0,,1.0,0.2
BaggingClassifier,1.0,1.0,,1.0,0.06
BernoulliNB,1.0,1.0,,1.0,0.02
CalibratedClassifierCV,1.0,1.0,,1.0,6.96
DecisionTreeClassifier,1.0,1.0,,1.0,0.02
ExtraTreeClassifier,1.0,1.0,,1.0,0.02
GaussianNB,1.0,1.0,,1.0,0.02
ExtraTreesClassifier,1.0,1.0,,1.0,0.11
KNeighborsClassifier,1.0,1.0,,1.0,0.04
LabelPropagation,1.0,1.0,,1.0,0.04
