# Wine Quality Analysis

## Process and Clean Data

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the semicolon-delimited red and white wine files and tag every row
# with a numeric 'color' flag: 1 = red, 0 = white
red_wine_df = pd.read_csv('resources/winequality-red.csv', sep=';').assign(color=1)
white_wine_df = pd.read_csv('resources/winequality-white.csv', sep=';').assign(color=0)

# Stack the red rows above the white rows and renumber the index from 0
wine_frames = [red_wine_df, white_wine_df]
wine_df = pd.concat(wine_frames, ignore_index=True)

# Display wine_df
wine_df

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
wine_df.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the combined dataframe
wine_df.info()

In [None]:
# Count the missing (null) values in each column
wine_df.isnull().sum()

In [None]:
# Remove every row that has at least one missing value, then renumber the index
wine_df = wine_df.dropna()
wine_df = wine_df.reset_index(drop=True)

wine_df

In [None]:
# Count the rows that exactly repeat an earlier row
wine_df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row, then renumber the index
wine_df = wine_df.drop_duplicates()
wine_df = wine_df.reset_index(drop=True)

wine_df

## Explore the Data

In [None]:
# Number of rows for each distinct quality score, most frequent first
wine_df['quality'].value_counts()

In [None]:
# Export the cleaned data (nulls and duplicates removed) to a new csv file;
# index=False keeps the row index out of the file
wine_df.to_csv('resources/winequality-cleaned.csv', index=False)

In [None]:
# Red wine scatter matrix: pairwise scatter plots with histograms on the diagonal.
# The 'color' flag is constant (1) within this frame, so it is excluded to
# avoid an uninformative zero-variance row and column in the grid.
pd.plotting.scatter_matrix(
    red_wine_df.drop(columns='color'),
    alpha=0.2,
    figsize=(20, 20),
    diagonal='hist',
)
plt.show()

In [None]:
# White wine scatter matrix: pairwise scatter plots with histograms on the diagonal.
# The 'color' flag is constant (0) within this frame, so it is excluded to
# avoid an uninformative zero-variance row and column in the grid.
pd.plotting.scatter_matrix(
    white_wine_df.drop(columns='color'),
    alpha=0.2,
    figsize=(20, 20),
    diagonal='hist',
)
plt.show()

In [None]:
import seaborn as sns

# Annotated correlation heatmap over all numeric columns of the combined dataset
corr_matrix = wine_df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Red wine correlation heatmap.
# 'color' is constant (1) for every red-wine row, so its correlation with any
# other column is undefined (NaN); drop it before computing the matrix.
red_corr = red_wine_df.drop(columns='color').corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(red_corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# Title names the subset so this plot can be told apart from the other heatmaps
plt.title("Feature Correlation Heatmap - Red Wine")
plt.show()

In [None]:
# White wine correlation heatmap.
# 'color' is constant (0) for every white-wine row, so its correlation with any
# other column is undefined (NaN); drop it before computing the matrix.
white_corr = white_wine_df.drop(columns='color').corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(white_corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
# Title names the subset so this plot can be told apart from the other heatmaps
plt.title("Feature Correlation Heatmap - White Wine")
plt.show()

In [None]:
# Bar chart of how many wines received each quality score, split by wine type
plt.figure(figsize=(10, 6))
sns.countplot(data=wine_df, x='quality', hue='color')
plt.title("Wine Quality Distribution by Type")
plt.xlabel("Quality")
plt.ylabel("Count")
# 'color' was encoded as 0 = white, 1 = red when the data was loaded.
# NOTE(review): the label order assumes the hue levels are drawn as 0 then 1 - confirm in the rendered legend
plt.legend(title="Wine Color", labels=["White", "Red"])
plt.show()

In [None]:
# First feature set: remove 'color' (judged not relevant to quality) and
# 'free sulfur dioxide' (highly correlated with total sulfur dioxide)
columns_to_drop = ['color', 'free sulfur dioxide']
cleaned_wine = wine_df.drop(columns=columns_to_drop)

In [None]:
# Second, smaller feature set ("cleaned 2"): additionally removes
# 'chlorides', 'citric acid' and 'fixed acidity'
cleaned_wine2 = wine_df.drop(
    columns=['color', 'free sulfur dioxide', 'chlorides', 'citric acid', 'fixed acidity']
)

In [None]:
# Count the rows per quality score in the first feature set
cleaned_wine['quality'].value_counts()

In [None]:
# Count the rows per quality score in the second (reduced) feature set
cleaned_wine2['quality'].value_counts()

In [None]:
# One box plot per selected feature, showing its distribution at each quality score
features_to_plot = ['alcohol', 'volatile acidity', 'citric acid']
for column_name in features_to_plot:
    plot_title = f"{column_name.title()} by Wine Quality"
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=cleaned_wine, x='quality', y=column_name)
    plt.title(plot_title)
    plt.show()

In [None]:
# Pairwise relationships between three selected features, coloured by quality score
top_features = ['alcohol', 'volatile acidity', 'density', 'quality']
pairplot_data = cleaned_wine[top_features]
sns.pairplot(pairplot_data, hue='quality', palette='coolwarm')
plt.suptitle("Pairplot of Selected Features", y=1.02)
plt.show()

In [None]:
# Violin plot of alcohol content for each quality score
plt.figure(figsize=(10, 6))
sns.violinplot(
    data=cleaned_wine,
    x='quality',
    y='alcohol',
)
plt.title("Alcohol Distribution by Wine Quality")
plt.show()

## Split the Test and Training Data

In [None]:
# Create bins for the quality column (turns the score into a binary target)


# Two bins: (0, 5] and (5, 10] - pd.cut intervals include the right edge by default
bins = (0, 5, 10)

# Name the bins 0 for low quality and 1 for high quality
group_names = [0, 1]

# Replace the values in the quality column with the bin labels
cleaned_wine['quality'] = pd.cut(cleaned_wine['quality'], bins=bins, labels=group_names)

# List unique values in the quality column
cleaned_wine['quality'].unique()

In [None]:
# Apply the same binning to the second feature set; reuses the `bins` and
# `group_names` defined in the previous binning cell
cleaned_wine2['quality'] = pd.cut(cleaned_wine2['quality'], bins=bins, labels=group_names)

# List unique values in the quality column
cleaned_wine2['quality'].unique()

In [None]:
# Number of rows in each quality class (0 = low, 1 = high) after binning
cleaned_wine['quality'].value_counts()

In [None]:
# Number of rows in each quality class (0 = low, 1 = high) after binning - cleaned 2
cleaned_wine2['quality'].value_counts()

In [None]:
# Target is the binned 'quality' column; every other column is a feature
y = cleaned_wine['quality']
X = cleaned_wine.drop(columns='quality')

In [None]:
# Same separation for the reduced feature set (cleaned 2)
y2 = cleaned_wine2['quality']
X2 = cleaned_wine2.drop(columns='quality')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [None]:
# Hold out 20% of the rows for testing - cleaned 2 (same test_size and seed as the first split)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=1,
)

In [None]:
import statsmodels.api as sm

In [None]:
# cleaned
# Fit ordinary least squares on the training split (X_train is passed as-is,
# so no intercept column is added) and list the p-values, smallest first
lr = sm.OLS(y_train, X_train).fit()
pvals = lr.pvalues.sort_values()
# NOTE(review): ':4f' means minimum width 4 with the default six decimals;
# if four decimal places were intended, the spec should be ':.4f'
for feature_name, p_value in pvals.items():
    print(f"{feature_name}: {p_value:4f}")

In [None]:
# cleaned 2
# Same OLS fit on the reduced feature set (no intercept column is added);
# p-values are listed smallest first
lr = sm.OLS(y_train2, X_train2).fit()
pvals = lr.pvalues.sort_values()
# NOTE(review): ':4f' means minimum width 4 with the default six decimals;
# if four decimal places were intended, the spec should be ':.4f'
for feature_name, p_value in pvals.items():
    print(f"{feature_name}: {p_value:4f}")

In [None]:
# Multinomial logit p-values per feature, smallest first.
# NOTE(review): this fit uses the full X / y, while the neighbouring
# statsmodels cells use the training split only - confirm that is intended
mnl = sm.MNLogit(y, X).fit()
pvals = mnl.pvalues
# The p-value table has one column per modelled class; take the first column
pvals_sorted = pvals.iloc[:, 0].sort_values()
for feature_name, p_value in pvals_sorted.items():
    print(f"{feature_name}: {p_value:4f}")

In [None]:
# Binary logit on the training split; list each feature's p-value, smallest first
logit = sm.Logit(y_train, X_train).fit()
pvals = logit.pvalues.sort_values()
for feature_name, p_value in pvals.items():
    print(f"{feature_name}: {p_value:4f}")
    

In [None]:
# Scale the features
# The scaler is fitted on the training split only and then reused on the test split
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames that keep the original column names and row index
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test.columns, index=X_test.index)

In [None]:
# Scale the features -  cleaned 2
# A separate scaler is fitted on the reduced-feature training split and reused on its test split
scaler2 = StandardScaler()
train_scaled_values2 = scaler2.fit_transform(X_train2)
test_scaled_values2 = scaler2.transform(X_test2)

# Wrap the scaled arrays in DataFrames that keep the original column names and row index
X_train_scaled2 = pd.DataFrame(train_scaled_values2, columns=X_train2.columns, index=X_train2.index)
X_test_scaled2 = pd.DataFrame(test_scaled_values2, columns=X_test2.columns, index=X_test2.index)

In [None]:
# Number of training rows in each quality class
class_counts = y_train.value_counts()
print(class_counts)


In [None]:
# Number of training rows in each quality class - cleaned 2
class_counts2 = y_train2.value_counts()
print(class_counts2)

# Balancing Data

In [None]:
from collections import Counter

from imblearn.over_sampling import SMOTE

# Apply SMOTE on the scaled training data only (the test split is never resampled)
smote_model = SMOTE(random_state=42)
smote_output = smote_model.fit_resample(X_train_scaled, y_train)
X_resampled_smote, y_resampled_smote = smote_output

# Count distinct values for the resampled target data
print(y_resampled_smote.value_counts())

In [None]:

# Apply SMOTE on the UNSCALED training data
# (a separate SMOTE instance with the same seed as the scaled variant)
smote_model_unscaled = SMOTE(random_state=42)
X_resampled_smote_unscaled, y_resampled_smote_unscaled = smote_model_unscaled.fit_resample(X_train, y_train)

# Count distinct values for the resampled target data
print(y_resampled_smote_unscaled.value_counts())

In [None]:
# Apply SMOTE on the scaled training data - cleaned 2 (reduced feature set)
smote_model2 = SMOTE(random_state=42)
X_resampled_smote2, y_resampled_smote2 = smote_model2.fit_resample(X_train_scaled2, y_train2)

# Count distinct values for the resampled target data
print(y_resampled_smote2.value_counts())

In [None]:

from imblearn.combine import SMOTEENN
# Apply SMOTEENN on the scaled training data
# (imblearn's combined resampler: SMOTE over-sampling followed by Edited Nearest Neighbours cleaning)
smoteenn_model = SMOTEENN(random_state=42)
X_resampled_smoteenn, y_resampled_smoteenn = smoteenn_model.fit_resample(X_train_scaled, y_train)
# Count distinct values for the resampled target data
print(y_resampled_smoteenn.value_counts())

In [None]:

# Apply SMOTEENN on the scaled training data - cleaned 2 (reduced feature set)
smoteenn_model2 = SMOTEENN(random_state=42)
X_resampled_smoteenn2, y_resampled_smoteenn2 = smoteenn_model2.fit_resample(X_train_scaled2, y_train2)
# Count distinct values for the resampled target data
print(y_resampled_smoteenn2.value_counts())

# Compare Different Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix



## Random Forest

In [None]:
# Baseline Random Forest on the original (unscaled, non-resampled) training split.
# random_state is fixed, like the other models in this notebook, so the
# reported metrics are reproducible between runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Predict labels for the held-out test features
rf_y_pred = rf_model.predict(X_test)

# Print classification reports
print("Classification Report - Original Data")
print(classification_report(y_test, rf_y_pred))

In [None]:
# Random Forest on the original (unscaled, non-resampled) training split - cleaned 2.
# random_state is fixed so the metrics, and the feature importances later read
# from this model, are reproducible between runs.
rf_model2 = RandomForestClassifier(random_state=42)
rf_model2.fit(X_train2, y_train2)
# Predict labels for the held-out test features
rf_y_pred2 = rf_model2.predict(X_test2)

# Print classification reports
print("Classification Report - Original Data, Cleaned 2")
print(classification_report(y_test2, rf_y_pred2))

In [None]:
# Random Forest on the standardised (non-resampled) training split.
# random_state is fixed so the reported metrics are reproducible between runs.
rf_model_scaled = RandomForestClassifier(random_state=42)
rf_model_scaled.fit(X_train_scaled, y_train)
# Predict labels for the scaled test features
rf_y_pred_scaled = rf_model_scaled.predict(X_test_scaled)

# Print classification reports
print("Classification Report - Scaled Data")
print(classification_report(y_test, rf_y_pred_scaled))

In [None]:
# Random Forest on the SMOTE-resampled, scaled training data of the reduced
# feature set (cleaned 2). random_state is fixed for reproducible metrics.
rf_smote_model = RandomForestClassifier(random_state=42)
rf_smote_model.fit(X_resampled_smote2, y_resampled_smote2)
# Predict on the scaled test features of the same feature set (the test split is never resampled)
rf_smote_y_pred = rf_smote_model.predict(X_test_scaled2)

# Print classification reports; the label names the feature set actually used
print("Classification Report - SMOTE Data, Cleaned 2")
print(classification_report(y_test2, rf_smote_y_pred))

In [None]:
# Random Forest on the SMOTEENN-resampled, scaled training data.
# random_state is fixed so the reported metrics are reproducible between runs.
rf_smoteenn_model = RandomForestClassifier(random_state=42)
rf_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# Predict on the scaled test features (the test split is never resampled)
rf_smoteenn_y_pred = rf_smoteenn_model.predict(X_test_scaled)

# Print classification reports
print("Classification Report - SMOTEENN Data")
print(classification_report(y_test, rf_smoteenn_y_pred))

In [None]:
# Side-by-side classification reports for the Random Forest variants.
# Each set of predictions is scored against the test labels of the split it
# was predicted from: rf_y_pred2 and rf_smote_y_pred were produced from the
# "2" (reduced feature set) test split, rf_smoteenn_y_pred from the first split.
print("Classification Report - Original Data")
print(classification_report(y_test2, rf_y_pred2))
print("---------")
print("Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test2, rf_smote_y_pred))
print("---------")
print("Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, rf_smoteenn_y_pred))

In [None]:
# Print the class distribution before SMOTE
print("Original training class distribution:", Counter(y_train))

# Print the class distribution after SMOTE
# (y_resampled_smote is the SMOTE output for the scaled training data; the
# previously referenced name `y_resampled` is never defined in this notebook)
print("Resampled training class distribution:", Counter(y_resampled_smote))

## XG Boost

In [None]:
# XGBoost baseline on the original (unscaled, non-resampled) training split
from xgboost import XGBClassifier

XGB_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="mlogloss",
    n_estimators=200,
    max_depth=6,
    random_state=42,
)
XGB_model.fit(X_train, y_train)
xgb_y_pred = XGB_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, xgb_y_pred))


In [None]:
# XGBoost trained on the SMOTE-resampled training data
from xgboost import XGBClassifier

xgb_smote_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smote_model.fit(X_resampled_smote, y_resampled_smote)
# X_resampled_smote was built from the scaled training features, so the test
# features must be scaled too; unscaled X_test would not match the value
# ranges the trees were trained on
xgb_smote_y_pred = xgb_smote_model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, xgb_smote_y_pred))


In [None]:
# XGBoost trained on the SMOTEENN-resampled training data
from xgboost import XGBClassifier

xgb_smoteeen_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=200, max_depth=6, random_state=42)
xgb_smoteeen_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# X_resampled_smoteenn was built from the scaled training features, so the
# test features must be scaled too; unscaled X_test would not match the value
# ranges the trees were trained on
xgb_smoteeen_y_pred = xgb_smoteeen_model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, xgb_smoteeen_y_pred))


In [None]:
# Side-by-side classification reports for the three XGBoost variants
xgb_reports = (
    ("Classification Report - Original Data", xgb_y_pred),
    ("Classification Report - Resampled Data - SMOTE", xgb_smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", xgb_smoteeen_y_pred),
)
for position, (heading, predictions) in enumerate(xgb_reports):
    # Separator line between reports, but not before the first one
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

## Logistic Regression

In [None]:
# Logistic regression on the standardised training split
from sklearn.linear_model import LogisticRegression

lr_model_scaled = LogisticRegression(max_iter=500, random_state=42).fit(X_train_scaled, y_train)
lr_y_pred_scaled = lr_model_scaled.predict(X_test_scaled)

print("Classification Report: Scaled Data")
print(classification_report(y_test, lr_y_pred_scaled))

In [None]:
# Logistic regression on the standardised training split - cleaned 2

lr_model_scaled2 = LogisticRegression(max_iter=500, random_state=42).fit(X_train_scaled2, y_train2)
lr_y_pred_scaled2 = lr_model_scaled2.predict(X_test_scaled2)

print("Classification Report: Scaled Data, Cleaned 2")
print(classification_report(y_test2, lr_y_pred_scaled2))

In [None]:
# Logistic regression on the raw (unscaled) training split
from sklearn.linear_model import LogisticRegression

lr_model_unscaled = LogisticRegression(max_iter=500, random_state=42).fit(X_train, y_train)
lr_y_pred_unscaled = lr_model_unscaled.predict(X_test)

print("Classification Report: Unscaled Data")
print(classification_report(y_test, lr_y_pred_unscaled))


In [None]:
# Logistic regression on the raw (unscaled) training split - cleaned 2

lr_model_unscaled2 = LogisticRegression(max_iter=500, random_state=42).fit(X_train2, y_train2)
lr_y_pred_unscaled2 = lr_model_unscaled2.predict(X_test2)

print("Classification Report: Unscaled Data, Cleaned 2")
print(classification_report(y_test2, lr_y_pred_unscaled2))

In [None]:
# Logistic regression on the SMOTE-resampled (scaled) training data,
# evaluated on the scaled test features
from sklearn.linear_model import LogisticRegression

lr_smote_model = LogisticRegression(max_iter=500, random_state=42).fit(X_resampled_smote, y_resampled_smote)
lr_smote_y_pred = lr_smote_model.predict(X_test_scaled)

print("Classification Report: SMOTE Data")
print(classification_report(y_test, lr_smote_y_pred))


In [None]:
# Logistic regression on the SMOTE-resampled (scaled) training data - cleaned 2,
# evaluated on the scaled test features of the same feature set

lr_smote_model2 = LogisticRegression(max_iter=500, random_state=42).fit(X_resampled_smote2, y_resampled_smote2)
lr_smote_y_pred2 = lr_smote_model2.predict(X_test_scaled2)

print("Classification Report: SMOTE Data, Cleaned 2")
print(classification_report(y_test2, lr_smote_y_pred2))

In [None]:
# Logistic regression on the SMOTEENN-resampled training data
from sklearn.linear_model import LogisticRegression

lr_smoteenn_model = LogisticRegression(max_iter=500, random_state=42)
lr_smoteenn_model.fit(X_resampled_smoteenn, y_resampled_smoteenn)
# X_resampled_smoteenn was built from the scaled training features, so the
# model must be evaluated on the scaled test features (as the cleaned-2
# variant of this cell already does), not on raw X_test
lr_smoteenn_y_pred = lr_smoteenn_model.predict(X_test_scaled)

print("Classification Report: SMOTEENN Data")
print(classification_report(y_test, lr_smoteenn_y_pred))


In [None]:
# Logistic regression on the SMOTEENN-resampled (scaled) training data - cleaned 2,
# evaluated on the scaled test features of the same feature set

lr_smoteenn_model2 = LogisticRegression(max_iter=500, random_state=42).fit(X_resampled_smoteenn2, y_resampled_smoteenn2)
lr_smoteenn_y_pred2 = lr_smoteenn_model2.predict(X_test_scaled2)

print("Classification Report: SMOTEENN Data, Cleaned 2")
print(classification_report(y_test2, lr_smoteenn_y_pred2))

In [None]:
# Side-by-side classification reports for the four Logistic Regression variants
lr_reports = (
    ("Classification Report - Original Data - Scaled", lr_y_pred_scaled),
    ("Classification Report - Original Data - Unscaled", lr_y_pred_unscaled),
    ("Classification Report - Resampled Data - SMOTE", lr_smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", lr_smoteenn_y_pred),
)
for position, (heading, predictions) in enumerate(lr_reports):
    # Separator line between reports, but not before the first one
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

## LGBMC

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier trained and evaluated on the unscaled features
model = LGBMClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred))

## SVC

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained and evaluated on the scaled features
model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("SVM Classification Report:")
print(classification_report(y_test, y_pred))

## KNC

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained and evaluated on the scaled features
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("KNN Classification Report:")
print(classification_report(y_test, y_pred))

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier trained and evaluated on the unscaled features
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Gradient Boosting Report:")
print(classification_report(y_test, y_pred))

## MLPC

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with one hidden layer of 100 units, trained and
# evaluated on the scaled features
model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    random_state=42,
).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

print("MLP (Neural Network) Report:")
print(classification_report(y_test, y_pred))

# Identify Most and Least Important Features

In [None]:
# Importance scores exposed by the Random Forest fitted on the reduced feature set
rf_feature_importances = rf_model2.feature_importances_

# Pair each feature name with its score and order the rows from most to least important
rf_importance_df = pd.DataFrame(
    {'feature': X2.columns, 'importance': rf_feature_importances}
).sort_values(by='importance', ascending=False)

# Display the entire list
print(rf_importance_df)

## Hyperparameter Optimization

### Logistic Regression 

In [None]:
# Start with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate values for each Logistic Regression hyperparameter
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],     # Regularization strength
    'penalty': ['l1', 'l2'],          # Regularization type
    'solver': ['liblinear', 'saga'],  # Solvers that support L1/L2 penalties
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
lr_estimator = LogisticRegression(max_iter=500, random_state=42)
grid_search = GridSearchCV(lr_estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled2, y_train2)

# Best parameters & cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the best estimator on the held-out test data
best_lr = grid_search.best_estimator_
y_pred_best = best_lr.predict(X_test_scaled2)
test_accuracy = accuracy_score(y_test2, y_pred_best)
print(classification_report(y_test2, y_pred_best))
print(f"Accuracy: {test_accuracy:.4f}")


In [None]:
# Use RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Search space: 10 log-spaced C values between 1e-3 and 1e3, both penalties, both solvers
param_dist = {
    'C': np.logspace(-3, 3, 10),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
}

# Sample 20 random combinations, each scored by 5-fold cross-validated accuracy
lr_estimator = LogisticRegression(max_iter=500, random_state=42)
random_search = RandomizedSearchCV(
    lr_estimator,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train_scaled2, y_train2)

# Best results
print("Best Hyperparameters:", random_search.best_params_)
print("Best Accuracy:", random_search.best_score_)

# Evaluate the best estimator on the held-out test data
best_lr_random = random_search.best_estimator_
y_pred_random = best_lr_random.predict(X_test_scaled2)
test_accuracy = accuracy_score(y_test2, y_pred_random)
print(classification_report(y_test2, y_pred_random))
print(f"Accuracy: {test_accuracy:.4f}")


In [None]:
# Use a manual For Loop, instead of GridSearchCV
from itertools import product

from sklearn.linear_model import LogisticRegression

# Define hyperparameter ranges
C_values = [0.01, 0.1, 1, 10, 100]  # Regularization strength
penalties = ['l1', 'l2']  # Regularization types
solvers = ['liblinear', 'saga']  # Solvers that support L1/L2 penalties

best_score = 0
best_params = {}
best_y_pred = None

# NOTE(review): every candidate is scored on the TEST split (there is no
# separate validation set here), so the "best" accuracy is optimistically
# biased compared with the cross-validated searches in the cells above.
# product() visits the combinations in the same order as three nested loops.
for C, penalty, solver in product(C_values, penalties, solvers):
    try:
        # Train a model with this combination and predict the test split
        model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=500, random_state=42)
        model.fit(X_train_scaled2, y_train2)
        y_pred = model.predict(X_test_scaled2)
    except Exception as e:
        # Report combinations the solver rejects and move on to the next one
        print(f"Skipping C={C}, penalty={penalty}, solver={solver} due to error: {e}")
        continue

    accuracy = accuracy_score(y_test2, y_pred)
    print(f"C={C}, penalty={penalty}, solver={solver} -> Accuracy: {accuracy:.4f}")

    # Track the best model (the first one wins on ties)
    if accuracy > best_score:
        best_score = accuracy
        best_params = {'C': C, 'penalty': penalty, 'solver': solver}
        best_y_pred = y_pred

# Print the best hyperparameters and accuracy
print("\nBest Hyperparameters:", best_params)
print("Best Accuracy:", best_score)
if best_y_pred is None:
    # Every combination failed or scored 0, so there is nothing to report
    print("No hyperparameter combination produced a usable model.")
else:
    print(classification_report(y_test2, best_y_pred))
    print(f"Accuracy: {accuracy_score(y_test2, best_y_pred):.4f}")


### Random Forest

In [None]:
# GridSearchCV with RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [None]:


# Candidate values for each Random Forest hyperparameter
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive 5-fold cross-validated search on the unscaled reduced feature set
rf_estimator = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_estimator, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train2, y_train2)

# Print the best hyperparameters and score
print("Best Parameters from GridSearchCV:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best estimator on the held-out test data
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test2)
test_accuracy = accuracy_score(y_test2, y_pred_best)
print(classification_report(y_test2, y_pred_best))
print(f"Accuracy: {test_accuracy:.4f}")


In [None]:
# RandomizedSearchCV with RandomForest

# Search space to sample from (n_estimators: 50, 100, 150, 200, 250)
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Sample 20 random combinations, each evaluated with 5-fold cross-validation
rf_estimator = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf_estimator,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train2, y_train2)

# Print the best hyperparameters and score
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Evaluate the best estimator on the held-out test data
best_rf_random = random_search.best_estimator_
y_pred_random = best_rf_random.predict(X_test2)
test_accuracy = accuracy_score(y_test2, y_pred_random)
print(classification_report(y_test2, y_pred_random))
print(f"Accuracy: {test_accuracy:.4f}")


In [None]:
# RandomizedSearchCV with RandomForest - TEST
# Same search as the previous cell, but fitted on the SMOTE-resampled, scaled
# reduced feature set and evaluated on the scaled test features

# Search space to sample from (n_estimators: 50, 100, 150, 200, 250)
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Sample 20 random combinations, each evaluated with 5-fold cross-validation
rf_estimator = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf_estimator,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_resampled_smote2, y_resampled_smote2)

# Print the best hyperparameters and score
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

# Evaluate the best estimator on the held-out test data
best_rf_random = random_search.best_estimator_
y_pred_random = best_rf_random.predict(X_test_scaled2)
test_accuracy = accuracy_score(y_test2, y_pred_random)
print(classification_report(y_test2, y_pred_random))
print(f"Accuracy: {test_accuracy:.4f}")

In [None]:
# Manual hyperparameter sweep with RandomForest (alternative to GridSearchCV)
from itertools import product

# Define hyperparameter values to iterate over
n_estimators_list = range(1, 101)
max_depth_list = [None, 10, 20]
min_samples_split_list = [2, 5, 10]
min_samples_leaf_list = [1, 2, 4]

best_score = 0
best_params = {}
best_y_pred = None

# NOTE(review): every candidate is scored on the TEST split, so the "best"
# accuracy is optimistically biased compared with the cross-validated searches.
# product() visits the 100 * 3 * 3 * 3 = 2700 combinations in the same order
# as four nested loops.
search_space = product(n_estimators_list, max_depth_list,
                       min_samples_split_list, min_samples_leaf_list)
for n_estimators, max_depth, min_samples_split, min_samples_leaf in search_space:
    # Use a dedicated name for the candidate so the baseline `rf_model`
    # fitted earlier in the notebook is not overwritten by this sweep
    candidate_rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                          min_samples_split=min_samples_split,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=42)

    # Fit on the training split and score on the test split
    candidate_rf.fit(X_train2, y_train2)
    y_pred = candidate_rf.predict(X_test2)
    accuracy = accuracy_score(y_test2, y_pred)

    # Track best model (the first one wins on ties)
    if accuracy > best_score:
        best_score = accuracy
        best_params = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }
        best_y_pred = y_pred

    print(f"n_estimators={n_estimators}, max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf} -> Accuracy: {accuracy:.4f}")

# Print best hyperparameters
print("\nBest Hyperparameters from Nested Loops:", best_params)
print("Best Accuracy:", best_score)
print(classification_report(y_test2, best_y_pred))
print(f"Accuracy: {accuracy_score(y_test2, best_y_pred):.4f}")


## Conclusions

### AUC ROC

In [None]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix
# Calculate a confusion matrix of true test labels against predicted labels.
# NOTE(review): `y_pred` is a notebook-global that many earlier cells reassign,
# so this matrix describes whichever model cell ran last - confirm which
# model's predictions are intended and reference them by their own name
conf_matrix = confusion_matrix(y_test, y_pred)
# Print the results of the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Import AUC ROC curve tools
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt


### RFC Model visualizations

In [None]:
# ROC curve for the Random Forest (reduced feature set), computed from the
# predicted class labels. rf_y_pred2 comes from .predict(), i.e. hard 0/1
# labels rather than probabilities, so the curve is coarse.
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test2, rf_y_pred2)
# Calculate the ROC AUC
rf_roc_auc = roc_auc_score(y_test2, rf_y_pred2)
# Plot the ROC curve; the label uses rf_roc_auc computed above
# (the previously referenced name `roc_auc` is never defined in this notebook)
plt.figure(figsize=(10, 6))
plt.plot(rf_fpr, rf_tpr, color='orange', label='ROC curve (area = %0.2f)' % rf_roc_auc)

In [None]:
# Probability of the positive class (column 1 of predict_proba) for each test row;
# probabilities, unlike hard labels, let the ROC curve sweep over thresholds
rf_y_proba = rf_model2.predict_proba(X_test2)[:, 1]

# ROC curve points and the area under the curve
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test2, rf_y_proba)
rf_roc_auc = roc_auc_score(y_test2, rf_y_proba)

# Plot the model's curve against the diagonal of a random classifier
rf_curve_label = 'ROC curve (AUC = %0.2f)' % rf_roc_auc
plt.figure(figsize=(10, 6))
plt.plot(rf_fpr, rf_tpr, color='orange', label=rf_curve_label)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()


In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the Random Forest fitted on the reduced feature
# set, measured on its test split with 10 shuffles per feature
rf_result = permutation_importance(
    rf_model2,
    X_test2,
    y_test2,
    n_repeats=10,
    random_state=42,
)

rf_importances = rf_result.importances_mean
rf_feature_names = X_test2.columns

# Sort ascending so the most important feature ends up at the top of the horizontal bar chart
rf_perm_imp_df = pd.DataFrame(
    {'Feature': rf_feature_names, 'Importance': rf_importances}
).sort_values('Importance', ascending=True)

plt.figure(figsize=(8, 6))
plt.barh(rf_perm_imp_df['Feature'], rf_perm_imp_df['Importance'])
plt.title("Permutation Importance")
plt.xlabel("Mean Decrease in Accuracy")
plt.ylabel("Feature")
plt.show()


## LR model visualizations 

In [None]:
# ROC curve for the scaled Logistic Regression (reduced feature set), computed
# from the predicted class labels. lr_y_pred_scaled2 comes from .predict(),
# i.e. hard 0/1 labels rather than probabilities, so the curve is coarse.
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test2, lr_y_pred_scaled2)
# Calculate the ROC AUC
lr_roc_auc = roc_auc_score(y_test2, lr_y_pred_scaled2)
# Plot the ROC curve; the label uses lr_roc_auc computed above
# (the previously referenced name `roc_auc` is never defined in this notebook)
plt.figure(figsize=(10, 6))
plt.plot(lr_fpr, lr_tpr, color='orange', label='ROC curve (area = %0.2f)' % lr_roc_auc)

In [None]:
# Get predicted probabilities for the positive class, not predicted labels.
# lr_model_scaled2 was fitted on standardised features, so it must be given
# the standardised test features (X_test_scaled2), not the raw X_test2.
lr_y_proba = lr_model_scaled2.predict_proba(X_test_scaled2)[:, 1]

# Calculate the ROC curve using predicted probabilities
lr_fpr, lr_tpr, lr_thresholds = roc_curve(y_test2, lr_y_proba)
lr_roc_auc = roc_auc_score(y_test2, lr_y_proba)

# Plot the model's curve against the diagonal of a random classifier
plt.figure(figsize=(10, 6))
plt.plot(lr_fpr, lr_tpr, color='orange',
         label='ROC curve (AUC = %0.2f)' % lr_roc_auc)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.show()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the scaled Logistic Regression model (cleaned 2).
# The model was fitted on standardised features, so the shuffling and scoring
# must also run on the standardised test features (X_test_scaled2); scoring it
# on raw X_test2 would measure the model on inputs it was never trained for.
lr_result = permutation_importance(lr_model_scaled2, X_test_scaled2, y_test2, n_repeats=10, random_state=42)

lr_importances = lr_result.importances_mean
lr_feature_names = X_test_scaled2.columns
lr_perm_imp_df = pd.DataFrame({
    'Feature': lr_feature_names,
    'Importance': lr_importances
})
# Sort ascending so the most important feature ends up at the top of the horizontal bar chart
lr_perm_imp_df.sort_values('Importance', ascending=True, inplace=True)

plt.figure(figsize=(8, 6))
plt.barh(lr_perm_imp_df['Feature'], lr_perm_imp_df['Importance'])
plt.title("Permutation Importance")
plt.xlabel("Mean Decrease in Accuracy")
plt.ylabel("Feature")
plt.show()