# 2. Train Several Models and Generate a Report

In [9]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [11]:
# Load the data.
# The CSV location can be overridden through the EMAIL_DATA_PATH environment
# variable so the notebook is runnable on machines other than the one it was
# authored on. When the variable is unset, the original absolute path is used,
# so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'EMAIL_DATA_PATH',
    r'C:\Users\james\Documents\Capstone\Final\synthetic_email_data.csv',
)
df = pd.read_csv(DATA_PATH)


# Prepare for Scaling and Splitting

# Columns that are cast to integer before modelling.
# NOTE(review): 'spam_complaints' was commented out of this cast in the
# original cell; the reason is not visible here, so it is still left untouched.
INT_COLUMNS = [
    'subscriber_id',
    'email_frequency_per_week',
    'email_send_hour',
    'personalized',
    'time_in_business',
    'unsubscribe',
]

# Fail early with a message naming every absent column, rather than stopping
# at the first missing one part-way through the casts.
missing_columns = [col for col in INT_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"Expected columns missing from {DATA_PATH!r}: {missing_columns}")

# Ensure correct data types in a single pass (no column-by-column mutation).
df = df.astype({col: int for col in INT_COLUMNS})

# Separate features and target: the identifier carries no signal and the
# target must not appear among the features.
X = df.drop(['subscriber_id', 'unsubscribe'], axis=1)
y = df['unsubscribe']

# Split the data into training and testing sets (20% held out, fixed seed so
# the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (8000, 6)
X_test shape: (2000, 6)
y_train shape: (8000,)
y_test shape: (2000,)


In [12]:
# Candidate classifiers, keyed by the label shown in the report.
# Insertion order determines the row order of the final table.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Scoring functions and their report labels, kept in matching order.
scorers = (accuracy_score, precision_score, recall_score, f1_score)
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1 Score')

# Fit every classifier on the scaled training split and score it on the
# scaled test split.
results = {}

for model_name, model in models.items():
    # fit() returns the fitted estimator, so training and prediction chain.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Score the predictions against the held-out labels, in label order.
    accuracy, precision, recall, f1 = (
        score(y_test, y_pred) for score in scorers
    )

    # One row of the report per model.
    results[model_name] = dict(zip(metric_labels, (accuracy, precision, recall, f1)))

# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                        Accuracy  Precision    Recall  F1 Score
Logistic Regression       0.7925   0.794621  0.495427  0.610329
Decision Tree             0.6995   0.538354  0.588415  0.562272
Random Forest             0.8215   0.977636  0.466463  0.631579
Support Vector Machine    0.8095   0.936508  0.449695  0.607621
K-Nearest Neighbors       0.7650   0.723558  0.458841  0.561567
Gradient Boosting         0.8245   0.990354  0.469512  0.637022
