# Financial Well-Being Project: Data Analysis

In [None]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from pprint import pprint

# Preparing and scaling data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine learning models and optimizers
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Metrics for model evaluation
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, classification_report

## Load Preprocessed Data

In [None]:
# Path of the preprocessed CSV used throughout this notebook
path = Path('data/fwb_processed_data_2.csv')
# Alternate source kept for reference (not loaded by this notebook):
# path = Path('resources/NFWBS_PUF_2016_data.csv')

# Load the data, using the first CSV column as the row index
data_df = pd.read_csv(path, index_col=0)

# Print the (rows, columns) shape and preview the first rows
print(f'Dataframe shape: {data_df.shape}')
data_df.head()

## Model Definitions and Functions

In [None]:
# Feature set: every column except the target score and its weight column.
# drop() returns a new frame, so data_df itself is left untouched.
X = data_df.drop(columns=['FWBscore', 'FWBscore_wt'])
X.head()

In [None]:
# Continuous target for the regression: FWBscore as an (n_samples, 1) column vector
y = data_df['FWBscore'].to_numpy().reshape(-1, 1)
y[:5]

In [None]:
# Function to evaluate model using confusion matrix and classification report
def eval_model_cmcr():
    """Evaluate the current predictions and return the rounded accuracy.

    Reads the notebook-level globals ``y_test``, ``predictions`` and
    ``class_labels``, which are set by the cells that split the data and
    run a model. Displays the confusion matrix, the accuracy score and the
    classification report.

    Returns:
        The accuracy score rounded to 3 decimal places.
    """
    # Pin the label order explicitly. Without `labels=`, confusion_matrix
    # sorts the labels it finds (e.g. '<Avg', '>Avg', 'Avg'), so the rows and
    # columns would not line up with the captions built from class_labels.
    cm = confusion_matrix(y_test, predictions, labels=class_labels)
    cm_df = pd.DataFrame(
        cm,
        index=[f'Actual {label}' for label in class_labels],
        columns=[f'Predicted {label}' for label in class_labels],
    )

    # Calculating the accuracy score
    acc_score = accuracy_score(y_test, predictions)

    # Displaying results (report rows follow the same class order as the matrix)
    print("Confusion Matrix")
    display(cm_df)
    print(f"Accuracy Score : {acc_score}")
    print("Classification Report")
    print(classification_report(y_test, predictions, labels=class_labels))
    return round(acc_score, 3)

In [None]:
# Function to split up and classify target based on percentiles
def classify_y(num, labels):
    """Bin ``FWBscore`` into ``num`` equal-frequency classes.

    Reads the notebook-level ``data_df``.

    Args:
        num: Number of quantile bins passed to ``pd.qcut``.
        labels: Class names for the bins, lowest scores first.

    Returns:
        A 1-D NumPy array with the class label of every row of ``data_df``.
    """
    binned = pd.qcut(data_df['FWBscore'], num, labels=labels)
    # Series.ravel() is deprecated and its return type depends on the pandas
    # version; to_numpy() always yields a plain NumPy array of labels.
    y = binned.to_numpy()
    print(y[:5])
    return y

In [None]:
# Function to plot a decision tree
def plot_d_tree(model, acc_score, i, num_classes, figsize):
    """Plot a fitted decision tree and save the figure as a PNG.

    Reads the notebook-level ``feature_names``.

    Args:
        model: Fitted ``DecisionTreeClassifier``.
        acc_score: Accuracy score shown in the title.
        i: Zero-based model index (displayed and saved as ``i + 1``).
        num_classes: Number of target classes (used in title and file name).
        figsize: ``(width, height)`` of the figure in inches.
    """
    # Configure plot
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=figsize, dpi=600)

    # plot_tree expects class_names in the model's own (sorted) class order,
    # which is not necessarily the order of the global class_labels list
    # (e.g. sorted '<Avg', '>Avg', 'Avg' vs. listed '<Avg', 'Avg', '>Avg').
    class_names = [str(label) for label in model.classes_]

    # Plot tree
    plot_tree(model, feature_names=feature_names, class_names=class_names, filled=True, ax=ax)
    title = f'Decision Tree Model #{i+1} with {num_classes} Classes\nAccuracy Score:{acc_score}'
    ax.set_title(title, fontsize=11)

    # Save tree (create the output folder first so savefig cannot fail on it)
    out_path = Path(f'images/dataset_2/dt_{num_classes}c_{i+1}.png')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path)

In [None]:
# Function to plot first 4 trees in a random forest
# Code ref: https://stackoverflow.com/questions/40155128/plot-trees-for-a-random-forest-in-python-with-scikit-learn
def plot_r_forest(model, acc_score, i, num_classes, figsize):
    """Plot the first four trees of a fitted random forest and save a PNG.

    Reads the notebook-level ``feature_names``.

    Args:
        model: Fitted ``RandomForestClassifier`` with at least 4 estimators.
        acc_score: Accuracy score shown in the first subplot title.
        i: Zero-based model index (displayed and saved as ``i + 1``).
        num_classes: Number of target classes (used in title and file name).
        figsize: ``(width, height)`` of the figure in inches.
    """
    # Take the class names from the forest itself: plot_tree expects them in
    # the fitted (sorted) class order, which can differ from the order of the
    # global class_labels list (e.g. '<Avg', '>Avg', 'Avg').
    class_names = [str(label) for label in model.classes_]

    # Configure plot: one subplot per displayed estimator
    fig, axes = plt.subplots(nrows=4, ncols=1, figsize=figsize, dpi=600)
    for index, ax in enumerate(axes):
        plot_tree(model.estimators_[index],
                  feature_names=feature_names,
                  class_names=class_names,
                  filled=True,
                  ax=ax)
        if index == 0:
            title = f'Random Forest Model #{i+1} with {num_classes} Classes\nEstimator: {index}, Accuracy Score:{acc_score}'
        else:
            title = f'Estimator: {index}'
        ax.set_title(title, fontsize=11)

    # Save trees next to the decision-tree images of this dataset
    # (the original wrote to images/dataset_1, a copy-paste leftover).
    out_path = Path(f'images/dataset_2/rf_{num_classes}c_{i+1}.png')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path)

## Linear Regression Model

In [None]:
# Create an (unfitted) scikit-learn linear regression model
model_lr = LinearRegression()

### Fit the model and make predictions

In [None]:
# Fit the regression on the full feature set X and target y
# (no train/test split is applied to this baseline model)
model_lr.fit(X, y)

In [None]:
# Predict FWBscore for the same rows the model was fitted on
predicted_y_values = model_lr.predict(X)

# Copy of the original data with the predicted FWBscore appended as a column;
# the (n, 1) prediction array is flattened to one value per row
predicted_df = data_df.assign(predicted_FWBscore=predicted_y_values.ravel())

# Display the data with predictions
predicted_df

### Evaluate the model

In [None]:
# Regression metrics, all computed on the data the model was fitted on.
# score() is the model's own R^2; r2_score recomputes it from the predictions.
score = model_lr.score(X, y)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
# Spread of the target itself, as a yardstick for the RMSE
std = y.std()

# Report the metrics
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

## Decision Tree with 2 Target Classes

### Set up target classes and split data

In [None]:
# Display the 25th, 50th and 75th percentiles of FWBscore for reference when cutting
data_df['FWBscore'].quantile([.25, .5, .75])

In [None]:
# Two target classes, lowest scores first; the bin count follows the labels
class_labels = ['Bad', 'Good']
num_classes = len(class_labels)
y = classify_y(num_classes, class_labels)

In [None]:
# Split into train and test sets (scikit-learn's default test size is used);
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### Determine the best set of hyperparameters

In [None]:
# Candidate hyperparameter values sampled by the randomized search
parameters = dict(
    max_depth=[3, 5, 7, 9, 10, 20, 50],
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 4, 6, 10, 20, 30],
    min_samples_leaf=[1, 2, 5, 8],
)

In [None]:
# Randomized hyperparameter search over decision trees (20-fold CV).
# Both the search and the estimator are seeded so the selected
# hyperparameters are reproducible, like the train/test split above.
model_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=78),
    param_distributions=parameters,
    cv=20,
    verbose=True,
    random_state=78,
)

In [None]:
# Run the randomized search on the training set
model_dt = model_dt.fit(X_train, y_train)

In [None]:
# Show the estimator with the best hyperparameters found by the search
model_dt.best_estimator_

### Fit, Predict and Evaluate Model

In [None]:
# Candidate decision trees for the 2-class target.
# A fixed random_state keeps the results reproducible between runs
# (otherwise equally good splits are chosen at random), matching the
# seeding used for the random forests and the train/test split.
model_dt2c = [
    # DT Model #1: Default settings
    DecisionTreeClassifier(max_depth=None, max_features=None, min_samples_split=2,
                           min_samples_leaf=1, random_state=78),

    # DT Model #2: Simple tree
    DecisionTreeClassifier(max_depth=4, max_features=None, min_samples_split=5,
                           min_samples_leaf=3, random_state=78),

    # DT Model #3: Best results after testing
    DecisionTreeClassifier(max_depth=5, max_features=None, min_samples_split=10,
                           min_samples_leaf=5, random_state=78),

    # DT Model #4: One of the best estimators from RandomizedSearchCV
    DecisionTreeClassifier(max_depth=7, max_features=None, min_samples_split=4,
                           min_samples_leaf=5, random_state=78),
]

In [None]:
# Accuracy score of each 2-class decision tree, in model order
acc_score_dt2c = []

# Fit, predict and evaluate every candidate tree
for i, model in enumerate(model_dt2c):
    # fit() trains the estimator in place, so the list entry is fitted too
    model = model.fit(X_train, y_train)

    # eval_model_cmcr() reads the notebook-level `predictions`
    predictions = model.predict(X_test)

    # Evaluate the model and keep its accuracy score
    print(f'Evaluation of Model #{i+1}\n')
    acc_score = eval_model_cmcr()
    acc_score_dt2c.append(acc_score)

### Plot and Save Decision Trees

In [None]:
# Plot settings read by plot_d_tree: feature column names and figure size in inches
feature_names = X.columns.tolist()
figsize = (15, 5)

In [None]:
# Plot and save one figure per fitted decision tree
for i, fitted_tree in enumerate(model_dt2c):
    plot_d_tree(fitted_tree, acc_score_dt2c[i], i, num_classes, figsize)

In [None]:
# Plot one tree - use to adjust figsize or test a single model.
# NOTE: running this cell overwrites the notebook-level i, num_classes and figsize.
i = 0
num_classes = 2
figsize=(50, 8)

# Uncomment to plot only model i with the settings above
# plot_d_tree(model_dt2c[i], acc_score_dt2c[i], i, num_classes, figsize)

## Random Forest with 2 Target Classes

### Determine the best set of hyperparameters

In [None]:
# Candidate hyperparameter values for the random-forest randomized search
# Resource: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

parameters = dict(
    n_estimators=[200, 250, 300, 500, 800, 1000, 1500, 2000],
    max_depth=[5, 8, 10, 15, 20, 50],
    min_samples_split=[2, 3, 5, 7, 10, 20],
    min_samples_leaf=[1, 2, 4, 6],
    bootstrap=[True, False],
)

In [None]:
# Randomized hyperparameter search over random forests (5-fold CV).
# Both the search and the estimator are seeded so the selected
# hyperparameters are reproducible, like the train/test split above.
model_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=78),
    param_distributions=parameters,
    cv=5,
    verbose=True,
    random_state=78,
)

In [None]:
# Run the randomized search on the training set
model_rf = model_rf.fit(X_train, y_train)

In [None]:
# Show the estimator with the best hyperparameters found by the search
model_rf.best_estimator_

### Fit, Predict and Evaluate Model

In [None]:
# Candidate random forests for the 2-class target (all seeded with random_state=78)
model_rf2c = [
    # RF Model #1
    RandomForestClassifier(n_estimators=200, random_state=78, bootstrap=True,
                           max_depth=5, min_samples_split=2, min_samples_leaf=1),

    # RF Model #2
    RandomForestClassifier(n_estimators=500, random_state=78, bootstrap=False,
                           max_depth=10, min_samples_split=5, min_samples_leaf=3),

    # RF Model #3: One of the best estimators from RandomizedSearchCV
    RandomForestClassifier(n_estimators=250, random_state=78, bootstrap=False,
                           max_depth=20, min_samples_split=5, min_samples_leaf=4),
]

In [None]:
# Accuracy score of each 2-class random forest, in model order
acc_score_rf2c = []

# Fit, predict and evaluate every candidate forest
for i, model in enumerate(model_rf2c):
    # fit() trains the estimator in place, so the list entry is fitted too
    model = model.fit(X_train, y_train)

    # eval_model_cmcr() reads the notebook-level `predictions`
    predictions = model.predict(X_test)

    # Evaluate the model and keep its accuracy score
    print(f'Evaluation of Model #{i+1}\n')
    acc_score = eval_model_cmcr()
    acc_score_rf2c.append(acc_score)

### Plot and Save Images of Sample Trees

In [None]:
# Plot settings read by plot_r_forest: feature column names and figure size in inches
feature_names = X.columns.tolist()
figsize = (20, 20)

In [None]:
# Plot and save the sample trees of every fitted forest
for i, fitted_forest in enumerate(model_rf2c):
    plot_r_forest(fitted_forest, acc_score_rf2c[i], i, num_classes, figsize)

In [None]:
# Plot trees for one forest - use to adjust figsize or test a single model.
# NOTE: running this cell overwrites the notebook-level i, num_classes and figsize.
i = 1
num_classes = 2
figsize = (25, 25)

# Uncomment to plot only forest i with the settings above
# plot_r_forest(model_rf2c[i], acc_score_rf2c[i], i, num_classes, figsize)

In [None]:
# Random forests in sklearn compute feature importances automatically.
# Show the most important features of each 2-class forest, high to low.
top_n = 15
for i, model in enumerate(model_rf2c):
    importances = model.feature_importances_

    # Rank ALL features before truncating: slicing the first 15 columns
    # before sorting would hide important features located later in X.
    ranked = sorted(zip(importances, X.columns), reverse=True)

    pprint(f'Feature Importance for Model #{i+1}')
    pprint(ranked[:top_n])
    print('\n')

## Decision Tree with 3 Target Classes

### Set up target classes and split data

In [None]:
# Display the 25th, 50th and 75th percentiles of FWBscore for reference when cutting
data_df['FWBscore'].quantile([.25, .5, .75])

In [None]:
# Three target classes, lowest scores first; the bin count follows the labels
class_labels = ['<Avg', 'Avg', '>Avg']
num_classes = len(class_labels)
y = classify_y(num_classes, class_labels)

In [None]:
# Split into train and test sets (scikit-learn's default test size is used);
# the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### Determine the best set of hyperparameters

In [None]:
# Candidate hyperparameter values sampled by the randomized search
parameters = dict(
    max_depth=[3, 5, 7, 9, 10, 25],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 4, 6, 10, 20, 30],
    min_samples_leaf=[1, 2, 5, 8, 10, 20],
)

In [None]:
# Randomized hyperparameter search over decision trees (20-fold CV).
# Both the search and the estimator are seeded so the selected
# hyperparameters are reproducible, like the train/test split above.
model_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=78),
    param_distributions=parameters,
    cv=20,
    verbose=True,
    random_state=78,
)

In [None]:
# Run the randomized search on the training set
model_dt = model_dt.fit(X_train, y_train)

In [None]:
# Show the estimator with the best hyperparameters found by the search
model_dt.best_estimator_

### Fit, Predict and Evaluate Model

In [None]:
# Candidate decision trees for the 3-class target.
# A fixed random_state keeps the results reproducible between runs
# (otherwise equally good splits are chosen at random), matching the
# seeding used for the random forests and the train/test split.
model_dt3c = [
    # DT Model #1: Default settings
    DecisionTreeClassifier(max_depth=None, max_features=None, min_samples_split=2,
                           min_samples_leaf=1, random_state=78),

    # DT Model #2: Simple tree
    DecisionTreeClassifier(max_depth=4, max_features=None, min_samples_split=5,
                           min_samples_leaf=3, random_state=78),

    # DT Model #3: Best results after testing
    DecisionTreeClassifier(max_depth=5, max_features=None, min_samples_split=10,
                           min_samples_leaf=5, random_state=78),

    # DT Model #4: Best estimator from RandomizedSearchCV
    DecisionTreeClassifier(max_depth=7, max_features=None, min_samples_split=4,
                           min_samples_leaf=8, random_state=78),
]

In [None]:
# Accuracy score of each 3-class decision tree, in model order
acc_score_dt3c = []

# Fit, predict and evaluate every candidate tree
for i, model in enumerate(model_dt3c):
    # fit() trains the estimator in place, so the list entry is fitted too
    model = model.fit(X_train, y_train)

    # eval_model_cmcr() reads the notebook-level `predictions`
    predictions = model.predict(X_test)

    # Evaluate the model and keep its accuracy score
    print(f'Evaluation of Model #{i+1}\n')
    acc_score = eval_model_cmcr()
    acc_score_dt3c.append(acc_score)

### Plot and Save Decision Trees

In [None]:
# Plot settings read by plot_d_tree: feature column names and figure size in inches
feature_names = X.columns.tolist()
figsize = (15, 5)

In [None]:
# Plot and save one figure per fitted decision tree
for i, fitted_tree in enumerate(model_dt3c):
    plot_d_tree(fitted_tree, acc_score_dt3c[i], i, num_classes, figsize)

In [None]:
# Plot one tree - use to adjust figsize or test a single model.
# NOTE: running this cell overwrites the notebook-level i, num_classes and figsize.
i = 0
num_classes = 3
figsize=(50, 8)

# Uncomment to plot only model i with the settings above
# plot_d_tree(model_dt3c[i], acc_score_dt3c[i], i, num_classes, figsize)

## Random Forest with 3 Target Classes

### Determine the best set of hyperparameters

In [None]:
# Candidate hyperparameter values for the random-forest randomized search
# Resource: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

parameters = dict(
    n_estimators=[200, 250, 300, 500, 800, 1000, 1500, 2000],
    max_depth=[5, 8, 10, 15, 20, 50],
    min_samples_split=[2, 3, 5, 7, 10, 20],
    min_samples_leaf=[1, 2, 4, 6],
    bootstrap=[True, False],
)

In [None]:
# Randomized hyperparameter search over random forests (5-fold CV).
# Both the search and the estimator are seeded so the selected
# hyperparameters are reproducible, like the train/test split above.
model_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=78),
    param_distributions=parameters,
    cv=5,
    verbose=True,
    random_state=78,
)

In [77]:
# Run the randomized search on the training set
model_rf = model_rf.fit(X_train, y_train)

In [78]:
# Show the estimator with the best hyperparameters found by the search
model_rf.best_estimator_

### Fit, Predict and Evaluate Model

In [None]:
# Candidate random forests for the 3-class target (all seeded with random_state=78)
model_rf3c = [
    # RF Model #1
    RandomForestClassifier(n_estimators=200, random_state=78, bootstrap=True,
                           max_depth=5, min_samples_split=2, min_samples_leaf=1),

    # RF Model #2
    RandomForestClassifier(n_estimators=600, random_state=78, bootstrap=False,
                           max_depth=10, min_samples_split=5, min_samples_leaf=3),

    # RF Model #3: One of the best estimators from RandomizedSearchCV
    RandomForestClassifier(n_estimators=250, random_state=78, bootstrap=False,
                           max_depth=10, min_samples_split=7, min_samples_leaf=2),
]

In [None]:
# Accuracy score of each 3-class random forest, in model order
acc_score_rf3c = []

# Fit, predict and evaluate every candidate forest
for i, model in enumerate(model_rf3c):
    # fit() trains the estimator in place, so the list entry is fitted too
    model = model.fit(X_train, y_train)

    # eval_model_cmcr() reads the notebook-level `predictions`
    predictions = model.predict(X_test)

    # Evaluate the model and keep its accuracy score
    print(f'Evaluation of Model #{i+1}\n')
    acc_score = eval_model_cmcr()
    acc_score_rf3c.append(acc_score)

### Plot and Save Images of Sample Trees

In [None]:
# Plot settings read by plot_r_forest: feature column names and figure size in inches
feature_names = X.columns.tolist()
figsize = (20, 20)

In [None]:
# Plot and save the first 4 trees of every fitted forest
for i, fitted_forest in enumerate(model_rf3c):
    plot_r_forest(fitted_forest, acc_score_rf3c[i], i, num_classes, figsize)

In [None]:
# Plot 4 trees for one forest - use to adjust figsize or test a single model.
# NOTE: running this cell overwrites the notebook-level i, num_classes and figsize.
i = 2
num_classes = 3
figsize = (40, 40)

# Uncomment to plot only forest i with the settings above
# plot_r_forest(model_rf3c[i], acc_score_rf3c[i], i, num_classes, figsize)

In [None]:
# Random forests in sklearn compute feature importances automatically.
# Show the most important features of each 3-class forest, high to low.
# (The original looped over model_rf2c here, re-printing the 2-class
# models instead of the 3-class ones.)
top_n = 15
for i, model in enumerate(model_rf3c):
    importances = model.feature_importances_

    # Rank ALL features before truncating: slicing the first 15 columns
    # before sorting would hide important features located later in X.
    ranked = sorted(zip(importances, X.columns), reverse=True)

    pprint(f'Feature importance for model #{i+1}')
    pprint(ranked[:top_n])
    print('\n')