In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Models & evaluation metrics
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

#setting random state for reproducibility
SEED = 321

np.random.seed(SEED)
# set pandas to display more columns
pd.set_option('display.max_columns',50)

In [2]:
# Declare folder and file path
FOLDER = "Data"
file = "Data/us-covid-confirmed.csv"

# Load csv
df = pd.read_csv(file)
df


Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,24012023,25012023,26012023,27012023,28012023,29012023,30012023,31012023,01022023,02022023
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,19389,19471,19471,19471,19471,19471,19471,19471,19530,19530
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,68764,68983,68983,68983,68983,68983,68983,68983,69187,69187
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,7258,7299,7299,7299,7299,7299,7299,7299,7339,7339
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,7889,7919,7919,7919,7919,7919,7919,7919,7967,7967
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,18130,18255,18255,18255,18255,18255,18255,18255,18349,18349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,12082,12082,12082,12082,12082,12082,12082,12058,12058,12058
3338,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,6353,6353,6353,6353,6353,6353,6353,6317,6317,6317
3339,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
3340,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,2737,2737,2737,2737,2737,2737,2737,2729,2729,2729


# Functions

In [7]:
# Get a summary of the dataframe
def summarize_df(df_):
    df = df_.copy()
    report = pd.DataFrame({
                        'dtype':df.dtypes,
                        '# null': df.isna().sum(),
                        'null (%)': df.isna().sum()/len(df)*100,
                        'nunique':df.nunique(),
                        "min":df.min(),
                        'max':df.max()
             })
    report.index.name='Column'
    return report.reset_index()
summarize_df(df)



  "min":df.min(),
  'max':df.max()


Unnamed: 0,Column,dtype,# null,null (%),nunique,min,max
0,01012021,int64,0,0.0,2466,0,790582
1,01012022,int64,0,0.0,2928,0,1696582
2,01012023,int64,0,0.0,3063,0,3631736
3,01022020,int64,0,0.0,3,0,2
4,01022021,int64,0,0.0,2599,0,1121107
...,...,...,...,...,...,...,...
1114,Province_State,object,0,0.0,58,Alabama,Wyoming
1115,UID,int64,0,0.0,3342,16,84099999
1116,code3,int64,0,0.0,6,16,850
1117,iso2,object,0,0.0,6,AS,VI


In [None]:
# Note - you might need to import other model evaluation functions if you don't use a regression model!!!!

# Model evaluation function
def evaluate_regression(model, X_train,y_train, X_test, y_test):
    """Evaluates a scikit learn regression model using r-squared and RMSE"""

    ## Training Data
    y_pred_train = model.predict(X_train)
    r2_train = metrics.r2_score(y_train, y_pred_train)
    rmse_train = metrics.mean_squared_error(y_train, y_pred_train,
                                            squared=False)

    print(f"Training Data:\tR^2= {r2_train:.2f}\tRMSE= {rmse_train:.2f}")


    ## Test Data
    y_pred_test = model.predict(X_test)
    r2_test = metrics.r2_score(y_test, y_pred_test)
    rmse_test = metrics.mean_squared_error(y_test, y_pred_test,
                                            squared=False)

    print(f"Test Data:\tR^2= {r2_test:.2f}\tRMSE= {rmse_test:.2f}")

# Import joblib

In [4]:
loaded = joblib.load('saved_model.joblib')
loaded.keys()

dict_keys(['preprocessor', 'X_train', 'X_test', 'y_train', 'y_test', 'LinearRegression', 'RandomForestRegressor'])

In [6]:
# Load in each variable
X_train_df = loaded['X_train']
y_train = loaded['y_train']
X_test_df = loaded['X_test']
y_test = loaded['y_test']
preprocessor = loaded['preprocessor']
linreg = loaded['LinearRegression']
rfreg = loaded['RandomForestRegressor']

In [11]:
X_train_df.head()

Unnamed: 0,fueltype,enginelocation,carlength,carwidth,carheight,curbweight,cylindernumber,horsepower,peakrpm,citympg,highwaympg
64,gas,front,177.8,66.5,55.5,2425,4,84,4800,26,32
32,gas,front,150.0,64.0,52.6,1837,4,60,5500,38,42
165,gas,front,168.7,64.0,52.6,2265,4,112,6600,26,29
39,gas,front,175.4,65.2,54.1,2304,4,86,5800,27,33
202,gas,front,188.8,68.9,55.5,3012,6,134,5500,18,23


In [9]:
summarize_df(df)

  "min":df.min(),
  'max':df.max()


Unnamed: 0,Column,dtype,# null,null (%),nunique,min,max
0,01012021,int64,0,0.0,2466,0,790582
1,01012022,int64,0,0.0,2928,0,1696582
2,01012023,int64,0,0.0,3063,0,3631736
3,01022020,int64,0,0.0,3,0,2
4,01022021,int64,0,0.0,2599,0,1121107
...,...,...,...,...,...,...,...
1114,Province_State,object,0,0.0,58,Alabama,Wyoming
1115,UID,int64,0,0.0,3342,16,84099999
1116,code3,int64,0,0.0,6,16,850
1117,iso2,object,0,0.0,6,AS,VI


Next step - Use the preprocessor to transform the X_train/X_test data into processed dataframes. 

# Explain LinearRegession Model

- Extract the coefficients and save them as a Series with the correct feature names as the index.
- Change the pandas option for float format to display the coefficients with pandas in a readable form:
    - separator for thousands
    - 2 decimal places.
- Create a bar graph of the coefficients, sorted from largest to smallest. 
- Provide a quantitative interpretation (both magnitude and directionality) of the three largest positive coefficients explaining how that feature impacts the target.
- Provide a quantitative interpretation (both magnitude and directionality)  of the three largest negative coefficients explaining how that feature impacts the target.

# Explain RandomForestRegressor Model

- Extract and create a bar graph of the feature importances, sorted from largest to smallest. 
- What are the top 6 most important features? 
- Of the top 6 most important features, identify which of these features also appeared in the 3 largest or 3 smallest coefficients. 

## Apply Shap to explain Random Forest Model

- Sample 100 rows of the processed X_train data as X_shap, using random_state=321
- Save the corresponding y_train values as y_shap
- Create a shap model explainer for the random forest model
- Calculate the shap values for the sampled data
- Create a summary plot (plot_type =’dot’) of the most important features, according to shap.
- Interpret the top 6 most important features. According to shap, what effect does each feature have on the model’s prediction?