<a href="https://colab.research.google.com/github/katusabevictoria/katusabevictoria/blob/main/Regression_(Practice)_Victoria.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [3]:
# Add custom functions (from snippets if you have saved it)

def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [4]:
## Loading data from published google sheet
fpath ="/content/drive/MyDrive/Colab data uploads/Boston_Housing_from_Sklearn - Boston_Housing_from_Sklearn.csv"
df = pd.read_csv(fpath)
df

Unnamed: 0,CRIM,NOX,RM,AGE,PTRATIO,LSTAT,PRICE
0,0.00632,0.538,6.575,65.2,15.3,4.98,24.0
1,0.02731,0.469,6.421,78.9,17.8,9.14,21.6
2,0.02729,0.469,7.185,61.1,17.8,4.03,34.7
3,0.03237,0.458,6.998,45.8,18.7,2.94,33.4
4,0.06905,0.458,7.147,54.2,18.7,5.33,36.2
...,...,...,...,...,...,...,...
501,0.06263,0.573,6.593,69.1,21.0,9.67,22.4
502,0.04527,0.573,6.120,76.7,21.0,9.08,20.6
503,0.06076,0.573,6.976,91.0,21.0,5.64,23.9
504,0.10959,0.573,6.794,89.3,21.0,6.48,22.0


In [5]:
# Make X and y
target = 'PRICE'
X = df.drop(columns=target).copy()
y = df[target].copy()

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
# Instatiate Default model
default = DecisionTreeRegressor(random_state=42)

# Fit default model on training data only
default.fit(X_train, y_train)

# Evaluate with custom function
evaluate_regression(default, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 0.000
- MSE = 0.000
- RMSE = 0.000
- R^2 = 1.000

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 3.141
- MSE = 26.658
- RMSE = 5.163
- R^2 = 0.619


In [7]:
# Get depth of tree with default model
depth_used = default.get_depth()
depth_used

20

In [8]:
# Always helpful to see available paramters
default.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [9]:
# Define Param Grid
param_grid = {'max_depth' : (np.arange(5,depth_used,1)),
              'min_samples_leaf' : np.arange(1,11)}

In [10]:
# Instantiate GridSearchCV
grid_search = GridSearchCV(default, param_grid, n_jobs = -1, verbose = 1)

In [11]:
# Fit the Gridsearch on the training data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 150 candidates, totalling 750 fits


In [12]:
# Obtain best values
grid_search.best_params_

{'max_depth': 8, 'min_samples_leaf': 7}

In [13]:
# Define the best version of the model already refitted on entire training set
best_model = grid_search.best_estimator_

In [14]:
# Use custom function to evalute the model
evaluate_regression( best_model, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 2.221
- MSE = 11.803
- RMSE = 3.436
- R^2 = 0.867

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 2.613
- MSE = 16.046
- RMSE = 4.006
- R^2 = 0.771
