Import the libraries

In [84]:
import pandas as pd
import random
import os
import joblib
from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


Import, check, split, and scale the features

In [86]:
# Load the data.
# folder_path is the single place the project directory is spelled out: the
# CSV location is derived from it here, and the model/scaler save step later
# in the notebook joins its output file names onto the same folder.
folder_path = r'C:\Users\james\Documents\Capstone\Final'
df = pd.read_csv(os.path.join(folder_path, 'synthetic_email_data.csv'))

# Prepare for scaling and splitting

# Ensure correct data types: every column listed below is cast to int in a
# single astype call instead of one assignment per column.
int_columns = [
    'subscriber_id',
    'email_frequency_per_week',
    'email_send_hour',
    'personalized',
    # 'spam_complaints',  # cast intentionally left disabled, as before
    'time_in_business',
    'unsubscribe',
]
df = df.astype({column: int for column in int_columns})

# Separate features and target: the identifier column is not a feature and
# 'unsubscribe' is the label being predicted.
X = df.drop(columns=['subscriber_id', 'unsubscribe'])
y = df['unsubscribe']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: the scaler is fitted on the training split only and the
# same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (8000, 6)
X_test shape: (2000, 6)
y_train shape: (8000,)
y_test shape: (2000,)


Initialize and train the best models from the scratch work

In [89]:
# Baseline comparison: train a gradient boosting and a random forest
# classifier with default hyperparameters (fixed seed) on the scaled training
# data, then score both on the held-out test split.

# Initialize the models
GB_Classifier = GradientBoostingClassifier(random_state=42)
RF_Classifier = RandomForestClassifier(random_state=42)

# Fit the models
GB_Classifier.fit(X_train_scaled, y_train)
RF_Classifier.fit(X_train_scaled, y_train)

# Make predictions on the test split
gb_y_pred = GB_Classifier.predict(X_test_scaled)
rf_y_pred = RF_Classifier.predict(X_test_scaled)

# Evaluate the gradient boosting model
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_precision = precision_score(y_test, gb_y_pred)
gb_recall = recall_score(y_test, gb_y_pred)
gb_f1 = f1_score(y_test, gb_y_pred)

# Evaluate the random forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

print("\nGradient Boosting")
print(f"Accuracy: {gb_accuracy:.4f}")
print(f"Precision: {gb_precision:.4f}")
print(f"Recall: {gb_recall:.4f}")
print(f"F1 Score: {gb_f1:.4f}")

# Report the rf_* metrics computed above. The previous rf_*_gb names were
# never assigned in this notebook, so the cell failed with NameError on a
# fresh kernel.
print("\nRandom Forest")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")


Gradient Boosting
Accuracy: 0.8215
Precision: 0.9776
Recall: 0.4665
F1 Score: 0.6316

Random Forest
Accuracy: 0.8000
Precision: 0.8000
Recall: 0.4590
F1 Score: 0.5833


Test different parameters for the gradient boosting and random forest models

In [92]:
# Randomized hyperparameter search for both classifiers, scored on F1.

# The learning-rate candidates come from a dedicated, seeded generator so the
# candidate list is identical on every run. The module-level random.uniform
# used previously was unseeded, which made the search space itself change
# between runs even though the search has a fixed random_state.
learning_rate_rng = random.Random(42)

# Define parameter distributions for Gradient Boosting
param_grid_gb = {
    'n_estimators': randint(50, 500),
    # Ten candidate rates between 0.1 and 0.3, rounded to two decimals
    'learning_rate': [round(learning_rate_rng.uniform(0.1, 0.3), 2) for _ in range(10)],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# Define parameter distributions for Random Forest
param_grid_rf = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# Initialize RandomizedSearchCV for Random Forest:
# 100 sampled candidates, F1 scoring, 4 parallel jobs, fixed seed.
random_search_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                                      param_distributions=param_grid_rf,
                                      scoring='f1',
                                      n_jobs=4,
                                      n_iter=100,
                                      random_state=42)

# Initialize RandomizedSearchCV for Gradient Boosting (same search settings)
random_search_gb = RandomizedSearchCV(GradientBoostingClassifier(random_state=42),
                                      param_distributions=param_grid_gb,
                                      scoring='f1',
                                      n_jobs=4,
                                      n_iter=100,
                                      random_state=42)

# Fit RandomizedSearchCV for Gradient Boosting
random_search_gb.fit(X_train_scaled, y_train)
print("Best parameters for Gradient Boosting:", random_search_gb.best_params_)
print("Best score for Gradient Boosting:", random_search_gb.best_score_)

# Fit RandomizedSearchCV for Random Forest
random_search_rf.fit(X_train_scaled, y_train)
print("Best parameters for Random Forest:", random_search_rf.best_params_)
print("Best score for Random Forest:", random_search_rf.best_score_)

Best parameters for Gradient Boosting: {'learning_rate': 0.14, 'max_depth': 1, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 175}
Best score for Gradient Boosting: 0.6596143675784937
Best parameters for Random Forest: {'max_depth': 15, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 54}
Best score for Random Forest: 0.6603297152637864


Apply the best parameters to the gradient boosting and random forest models

In [97]:
# Retrain both classifiers with tuned hyperparameters, evaluate them on the
# test split, and persist the fitted models together with the scaler.

# Initialize the models. The hyperparameter values are hard-coded here (they
# match the best parameters printed by the randomized search) so this cell
# does not require re-running the search.
GB_Classifier2 = GradientBoostingClassifier(
    learning_rate=0.14,
    max_depth=1,
    min_samples_leaf=2,
    min_samples_split=9,
    n_estimators=175,
    random_state=42
)
RF_Classifier2 = RandomForestClassifier(
    max_depth=15,
    min_samples_leaf=3,
    min_samples_split=2,
    n_estimators=54,
    random_state=42
)

# Fit the models
GB_Classifier2.fit(X_train_scaled, y_train)
RF_Classifier2.fit(X_train_scaled, y_train)

# Make predictions on the test split
gb_y_pred2 = GB_Classifier2.predict(X_test_scaled)
rf_y_pred2 = RF_Classifier2.predict(X_test_scaled)

# Evaluate the gradient boosting model
gb_accuracy2 = accuracy_score(y_test, gb_y_pred2)
gb_precision2 = precision_score(y_test, gb_y_pred2)
gb_recall2 = recall_score(y_test, gb_y_pred2)
gb_f12 = f1_score(y_test, gb_y_pred2)

# Evaluate the random forest model
rf_accuracy2 = accuracy_score(y_test, rf_y_pred2)
rf_precision2 = precision_score(y_test, rf_y_pred2)
rf_recall2 = recall_score(y_test, rf_y_pred2)
rf_f12 = f1_score(y_test, rf_y_pred2)

print("\nGradient Boosting")
print(f"Accuracy: {gb_accuracy2:.4f}")
print(f"Precision: {gb_precision2:.4f}")
print(f"Recall: {gb_recall2:.4f}")
print(f"F1 Score: {gb_f12:.4f}")

# Heading typo fixed ("Radom" -> "Random")
print("\nRandom Forest")
print(f"Accuracy: {rf_accuracy2:.4f}")
print(f"Precision: {rf_precision2:.4f}")
print(f"Recall: {rf_recall2:.4f}")
print(f"F1 Score: {rf_f12:.4f}")

# Save the models; both artifacts are reported so the output reflects
# everything written to disk.
model_path = os.path.join(folder_path, 'gradient_boosting_model.pkl')
model_path2 = os.path.join(folder_path, 'Random_Forest.pkl')
joblib.dump(GB_Classifier2, model_path)
joblib.dump(RF_Classifier2, model_path2)
print("\nGradient Boosting model saved to:", model_path)
print("Random Forest model saved to:", model_path2)

# Save the scaler that was fitted on the training data alongside the models
scaler_path = os.path.join(folder_path, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print("Scaler saved to:", scaler_path)


Gradient Boosting
Accuracy: 0.8250
Precision: 0.9968
Recall: 0.4680
F1 Score: 0.6369

Radom Forest
Accuracy: 0.8240
Precision: 0.9935
Recall: 0.4665
F1 Score: 0.6349

Gradient Boosting model saved to: C:\Users\james\Documents\Capstone\Final\gradient_boosting_model.pkl
Scaler saved to: C:\Users\james\Documents\Capstone\Final\scaler.pkl
