- Pipeline = sequence of steps to fit in the data automatically without having to manually write in the data

# These 6 lines Wrap in all the essentials on how to make better predictions...

# STEP 1: build a pipeline with data cleaning and an estimator
########################################################################################

# after this, I quickly run pipe_modelName.fit() and pipe_modelName.predict()  
# to make sure this works before going forward, but then delete those commands

pipe_modelName = make_pipeline(<a sequence of data steps, and the last step is a model>)  

########################################################################################
# STEP 2: optimize the pipeline
########################################################################################

# this is the GridSearchCV approach - manually set up the param&value combos to try
# doc + examples: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV 

param_grid = {'stepname__paramname':[val1,val2,...,valN]} # params to try
cv = ...                                                  # what folds to use
grid = GridSearchCV(pipe_modelName, param_grid,cv,...)    # set up optimizer
grid.fit(X,y)                              # fit grid like a "normal model obj"
optimal_vrs_of_model1 = grid.best_params_  # grid now has new features. save best model

In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn import metrics

# DL data
url = 'https://github.com/LeDataSciFi/lectures-spr2020/blob/master/assignment_data/Fannie_Mae_Plus_Data.gzip?raw=true'
fannie_mae = pd.read_csv(url,compression='gzip') 

# separate out y var
y = fannie_mae['Original_Interest_Rate']
fannie_mae.drop('Original_Interest_Rate',axis=1,inplace=True)

## Set up how each data type will get dealt with
Let's start with the continuous numeric variables. Here, I just try a few variables.

In [5]:
num_features = ['Original_UPB', 'Original_Loan_Term','Original_Debt_to_Income_Ratio']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

Now the categorical features 

In [6]:
cat_features = ['Property_type', 'Loan_purpose'] #for hw, just finish putting in the correct stuff for cat_features and num_features
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse=False))]) #turning variables into a list of dummys

### Combine the column-specific transformations with ColumnTransformer¶


In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features), #giving it the transformers that you want it to run
        ('cat', cat_transformer, cat_features)])

In [17]:
# copy and paste this, don't go over - they can figure out later
# get numerical col names (line 1) + post-transform categorical names (line 2)
cols  = preprocessor.transformers_[0][2] .copy()   #t_[0] is num trans,[2] item is col names, copy() so we don't change the underlying data structure!
cols += preprocessor.transformers_[1][1]['onehot']\
                     .get_feature_names(cat_features).tolist()  #t_[1] is cat trans,[1] is steps inside cat trans, get onehot, then pull the feature names

pd.DataFrame(preprocessor.transform(fannie_mae), columns=cols).head(5)

Unnamed: 0,Original_UPB,Original_Loan_Term,Original_Debt_to_Income_Ratio,Property_type_CO,Property_type_CP,Property_type_MH,Property_type_PU,Property_type_SF,Loan_purpose_C,Loan_purpose_P,Loan_purpose_R,Loan_purpose_U
0,-1.443151,0.642953,-0.990988,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.101791,0.642953,-0.639975,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.615504,-1.543334,-0.201208,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.121288,-1.543334,-1.429754,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-1.277621,-1.543334,-2.044026,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [18]:
preprocessor.fit(fannie_mae)
pd.DataFrame(preprocessor.transform(fannie_mae)).head(5) #can see that it itself is a column transformer
                            #can give it the whole dataset, unlike below with num_transformer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.443151,0.642953,-0.990988,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.101791,0.642953,-0.639975,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.615504,-1.543334,-0.201208,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.121288,-1.543334,-1.429754,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-1.277621,-1.543334,-2.044026,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [10]:
num_transformer.fit(fannie_mae[num_features])
num_transformer.transform(fannie_mae[num_features]) #shows an array with no missing values for the three variables 

array([[-1.4431508 ,  0.64295267, -0.99098751],
       [ 0.10179076,  0.64295267, -0.63997455],
       [-0.61550354, -1.5433343 , -0.20120835],
       ...,
       [ 0.46043791,  0.64295267, -1.16649399],
       [-0.88218988,  0.64295267, -1.25424723],
       [-0.44997408,  0.64295267,  0.23755786]])

### This is ready to include in a pipeline with an estimator¶
It's as easy as: make_pipeline(preprocessor, model_of_your_choice()).

For example:

In [14]:
# combine preprocessor with estimator
pipe_reg = make_pipeline(preprocessor,
                        LinearRegression())
pipe_reg # look at it

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                        

# Optimizing the overly simply model above¶
Three reasons for doing this:

- specifying param_grid is just a little different because the pipeline has steps with nested steps
- one more example of optimizing a model
- you'll see how I'll evaluate your final model

Optimizing this pipeline is just like the pseudo code above: set up the parameter grid, then the grid to search, then fit and save the optimized model to an object.

In [21]:

param_grid = {
             'columntransformer__num__imputer__strategy': ['mean', 'median','most_frequent']
             }
param_grid

{'columntransformer__num__imputer__strategy': ['mean',
  'median',
  'most_frequent']}

_Note how we accessed the column transformer, 2 underscores, then the num transformer inside it, 2 underscores, then the imputer step, then the strategy parameter. I wouldn't have known to do this without looking at the pipe_reg output above._

In [22]:
grid_search = GridSearchCV(pipe_reg, param_grid, cv=5,scoring='r2') #searches over the values in param_grid
grid_search.fit(fannie_mae, y)
# grid_search.best_params_                   # examined this
opt_model_reg = grid_search.best_estimator_  # save best model to an actual model object

#****use this for part 2 of assignment****

START ASIDE: you can quickly check the model object's R2 in-sample (all of your data) and on the CV folds

In [26]:
# how does this do insample?
print("In sample:          ",metrics.r2_score(y,
                                              opt_model_reg.predict(fannie_mae)
                                             ).round(3)) 

#prints r2 of 20... not very good

In sample:           0.204


In [25]:
# lol this model generates negative R2 in the CV folds
print("Validation fold avg:",cross_validate(opt_model_reg,
                                            fannie_mae, y,
                                            scoring=['neg_mean_squared_error','r2']
                                           )
                                           ['test_r2'].mean().round(3))



Validation fold avg: -3.792


### Exercise 1: Add 5 new continuous variables to your pipeline and see how R2 changes

In [35]:
fannie_mae.columns


Index(['Loan_Identifier', 'Origination_Channel', 'Seller_Name', 'Original_UPB',
       'Original_Loan_Term', 'Original_LTV_(OLTV)',
       'Original_Combined_LTV_(CLTV)', 'Number_of_Borrowers',
       'Original_Debt_to_Income_Ratio', 'Borrower_Credit_Score_at_Origination',
       'Loan_purpose', 'Property_type', 'Number_of_units', 'Occupancy_type',
       'Property_state', 'Zip_code_short',
       'Primary_mortgage_insurance_percent', 'Product_type',
       'Co-borrower_credit_score_at_origination', 'Mortgage_Insurance_type',
       'Origination_Date', 'First_payment_date',
       'First_time_home_buyer_indicator', 'UNRATE', 'CPIAUCSL', 'Qdate',
       'rGDP', 'TCMR', 'POILWTIUSDM', 'TTLCONS', 'DEXUSEU', 'BOPGSTB',
       'GOLDAMGBD228NLBM', 'CSUSHPISA', 'MSPUS'],
      dtype='object')

In [32]:
#Add new features to the list 
num_features = ['Original_UPB', 'Original_Loan_Term','Original_Debt_to_Income_Ratio',
                'UNRATE','rGDP','Original_LTV_(OLTV)','TCMR','CPIAUCSL']

#reload the preprocessor (num_features is the new now)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features), #giving it the transformers that you want it to run
        ('cat', cat_transformer, cat_features)
    ])

#reload the pipeline
pipe_reg = make_pipeline(preprocessor, LinearRegression())

#check OOS with Cvs
cross_validate(pipe_reg,fannie_mae,y,scoring='r2')['test_score'].mean()

0.648650615470285

### Exercise 2: Add 2 new categorical variables (on top of the new 5 continuous vars) and see how R2 changes 

In [36]:
cat_features = ['Property_type', 'Loan_purpose',
               'Origination_Channel','Product_type'] #for hw, just finish putting in the correct stuff for cat_features and num_features

#reload the preprocessor (num_features is the new now)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features), #giving it the transformers that you want it to run
        ('cat', cat_transformer, cat_features)
    ])

#reload the pipeline
pipe_reg = make_pipeline(preprocessor, LinearRegression())

#check OOS with Cvs
cross_validate(pipe_reg,fannie_mae,y,scoring='r2')['test_score'].mean()



0.6467630572519719

# Exercise 3: Add as many variables as possible 

In [39]:
ignore = ['Loan_Identifier','Origination_Date','QDate']

dumb_nums = fannie_mae.select_dtypes('number').columns.to_list()
dumb_cats = [ele for ele in fannie_mae.columns if ele not in dumb_nums]

#remove
dumb_nums = [ele for ele in dumb_nums if ele not in ignore]
dumb_cats = [ele for ele in dumb_cats if ele not in ignore]

#all_others
cat_features = dumb_cats
num_features = dumb_nums

#reload the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features), #giving it the transformers that you want it to run
        ('cat', cat_transformer, cat_features)
    ])

#reload the pipeline
pipe_reg = make_pipeline(preprocessor, LinearRegression())

#check OOS with Cvs
cross_validate(pipe_reg,fannie_mae,y,scoring='r2')['test_score'].mean()



-1.6516292080301497e+20

In [41]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

#add a selection preprocessing step
pipe_reg = make_pipeline(preprocessor,
                        SelectFromModel(LassoCV()),     # turn this on/off to see diff                    
                        LinearRegression())

cross_validate(pipe_reg,fannie_mae,y,scoring='r2')['test_score'].mean()



0.4654804337184797