# Algorithmic and Automated Feature and Parameter Selection

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import scipy.stats as stats
import modeling
import data_viz
import data_cleaning
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFECV
%matplotlib inline
sns.set(font_scale=1.5)


In [2]:
df = pd.read_csv('datasets/train_cleaned_dummied.csv')
df_test = pd.read_csv('datasets/test_cleaned_dummied.csv')
model_df = pd.read_csv('datasets/models.csv')
modeling.test_columns(df=df, df_test=df_test)

## Establish our Analysis and Hand-Chosen Features to iterate on.

In [3]:
features6 = ['bsmtfin_sf_1',
'heating_qc',
'fireplace_qu',
'year_built',
'bsmt_qual',
'1st_flr_sf',
'garage_area',
'kitchen_qual',
'gr_liv_area',
'exter_qual',
'overall_qual',
'porches',
'baths',
'open_porch_sf',
'full_bath',
'lot',
'lot_area',
'overall_cond',
'bsmtfin_sf_1',
'2nd_flr_sf',
'half_bath',
'total_bsmt_sf',
'fireplaces',
'year_remod/add',
'central_air_y',
'neighborhood_nridght',
'neighborhood_stonebr',
'neighborhood_noridge',
]









target = 'logsaleprice'

## First test to see if Scaling our data has an effect - which it does not.

In [4]:
model_df = modeling.new_test(df = df,
        test_df = df_test,
        features = features6,
        target = target,
         random_state = 1,
         model_type = LinearRegression,
         model_df = model_df,
        scaled=True,
        );

Training Data Score: 0.9089304794321711
Test Data Score: 0.8877166155378544
Cross Validation Score: 0.9034269992149262


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  X_test = ss.transform(X_test)


## Using Lasso and Ridge, see if there is a level of penalty that can be applied to create an an optimal model from our existing variables.

In [15]:
model_df = modeling.new_test(df = df,
        test_df = df_test,
        features = features6,
        target = target,
         random_state = 1,
         model_type = LassoCV,
         model_df = model_df,
        scaled=True,
        );

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  X_test = ss.transform(X_test)


Training Data Score: 0.9088181524350349
Test Data Score: 0.8874619043127558
Cross Validation Score: 0.9034823141780087


In [16]:
model_df = modeling.new_test(df = df,
        test_df = df_test,
        features = features6,
        target = target,
         random_state = 1,
         model_type = RidgeCV,
         model_df = model_df,
        scaled=True,
        );

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  X_test = ss.transform(X_test)


Training Data Score: 0.908803330610601
Test Data Score: 0.887586773285483
Cross Validation Score: 0.9035873195077297


In [6]:
features_6_interactions = [feature for feature in features6 if " " in feature]
features_6_transforms = [feature for feature in features6 if ("^" in feature) or ("log" in feature)]
print(f'{len(features_6_interactions)} interaction variables')
print(f'{len(features_6_transforms)} transformed variables')
print(f'{len(features6) - len(features_6_interactions) - len(features_6_transforms)} non-interaction variables')

0 interaction variables
0 transformed variables
28 non-interaction variables


## Random Forest allows us to iterate through multiple different combinations of features in a model. I did a similar thing as `model_maker` in `modeling.py` 🤦🏻‍♂️

In [7]:
rfecv_1 = RFECV(RidgeCV(),
     min_features_to_select = 1,
     cv=5,
      n_jobs = -1,
      verbose = 0,
     )

X = df.drop(axis = 1, columns = ['id','pid','saleprice','logsaleprice'])
X = X.select_dtypes(np.number)
y = df['logsaleprice']
X_train, X_test, y_train, y_test = train_test_split(X,y,
                     random_state=1,
                    )

rfecv_1.fit(X_train,y_train)
rfecv_1.transform(X_train)
rfecv_1.transform(X_test)
#rfecv_1.transform(y_test)
features7 = X_train.columns[rfecv_1.support_]

print(f'Training Score: {rfecv_1.score(X_train,y_train)}')
print(f'Test Score: {rfecv_1.score(X_test,y_test)}')
print(f'Cross Validation Score: {cross_val_score(rfecv_1,X_train,y_train, cv=5).mean()}')

Training Score: 0.9246938051411685
Test Score: -4.760575439913468
Cross Validation Score: -1474918909317527.0


In [8]:
model_df = modeling.new_test(df = df,
        test_df = df_test,
        features = features7,
        target = target,
         random_state = 1,
         model_type = LassoCV,
         model_df = model_df,
        scaled=True,
        );

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  X_test = ss.transform(X_test)


Training Data Score: 0.9203337806511276
Test Data Score: -3.9954689535067565
Cross Validation Score: 0.9057759836106827


In [9]:
rfecv2 = RFECV(LassoCV(),
     min_features_to_select = 1,
     cv=5,
      n_jobs = -1,
      verbose = 0,
     )

X = df.drop(axis = 1, columns = ['id','pid','saleprice','logsaleprice'])
X = X.select_dtypes(np.number)
y = df['logsaleprice']
X_train, X_test, y_train, y_test = train_test_split(X,y,
                     random_state=1,
                    )

rfecv2.fit_transform(X_train,y_train)
rfecv2.transform(X_test)

features8 = X_train.columns[rfecv2.support_]

print(f'Training Score: {rfecv2.score(X_train,y_train)}')
print(f'Test Score: {rfecv2.score(X_test,y_test)}')
print(f'Cross Validation Score: {cross_val_score(rfecv2,X_train,y_train, cv=5).mean()}')













Training Score: 0.005498785067210243
Test Score: 0.0032802030843889214
























































Cross Validation Score: 0.0001165233864085069




In [10]:
model_df = modeling.new_test(df = df,
        test_df = df_test,
        features = features8,
        target = target,
         random_state = 1,
         model_type = LassoCV,
         model_df = model_df,
        scaled=True,
        );

Training Data Score: 0.0052429451039893715
Test Data Score: 0.0023374835345864042
Cross Validation Score: -0.00029973663134583715


## Those models didn't look great though. So what about using Polynomial Transformation on our features to see how they interact and if it affects the accuracy of our model?

In [11]:
poly = PolynomialFeatures(include_bias = False)
df_to_mangle = df.drop(columns=['pid','id','saleprice','logsaleprice'],axis=1)
X_poly = poly.fit_transform(df_to_mangle[features6])
X_poly = pd.DataFrame(X_poly, columns = poly.get_feature_names(df.drop(columns=['pid','id','saleprice','logsaleprice'],axis=1).columns))
# X_poly['id'] = df['id']
# X_poly = X_poly.merge(df)

poly_rfecv = RFECV(RidgeCV(),
     min_features_to_select = 5,
     cv=5,
    n_jobs = -1,
    verbose = 0,
      
     )

X = X_poly
y = df['logsaleprice']
X_train, X_test, y_train, y_test = train_test_split(X,y,
                     random_state=1,
                    )
poly_rfecv.fit(X_train,y_train)

RFECV(cv=5,
   estimator=RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False),
   min_features_to_select=5, n_jobs=-1, scoring=None, step=1, verbose=0)

In [12]:
#X_train_trans = poly_rfecv.transform(X_train)
#X_test = poly_rfecv.transform(X_test)
# cribbed from https://stackoverflow.com/q/50387089
print(f'Optimal number of features : {poly_rfecv.n_features_}')
print(f'Best features : {X_poly.columns[poly_rfecv.support_]}')
print(f'Training Score: {poly_rfecv.score(X_train,y_train)}')
print(f'Test Score: {poly_rfecv.score(X_test,y_test)}')
print(f'Cross Validation Score: {cross_val_score(poly_rfecv,X_train,y_train, cv=5).mean()}')

Optimal number of features : 107
Best features : Index(['lot_frontage', 'lot_area', 'alley', 'year_built',
       'ms_zoning lot_shape', 'ms_zoning bldg_type', 'ms_zoning bsmtfin_sf_2',
       'lot_frontage^2', 'lot_frontage lot_shape', 'lot_frontage land_slope',
       ...
       'exter_qual bsmtfin_sf_2', 'foundation^2', 'foundation bsmt_cond',
       'foundation bsmtfin_sf_1', 'foundation bsmtfin_sf_2',
       'bsmt_cond bsmtfin_sf_1', 'bsmt_cond bsmtfin_sf_2',
       'bsmt_cond bsmt_unf_sf', 'bsmt_exposure bsmtfin_sf_2',
       'bsmt_exposure bsmt_unf_sf'],
      dtype='object', length=107)
Training Score: 0.9108574605546707
Test Score: 0.8661134361538989
Cross Validation Score: 0.8829006913445457


In [13]:
features_final = X_poly.columns[poly_rfecv.support_].values

In [14]:
interactions = [feature for feature in features_final if " " in feature]
transforms = [feature for feature in features_final if ("^" in feature) or ("log" in feature)]
print(f'{len(interactions)} interaction variables')
print(f'{len(transforms)} transformed variables')
print(f'{len(features_final) - len(interactions) - len(transforms)} non-interaction variables')

92 interaction variables
11 transformed variables
4 non-interaction variables


### This wouldn't finish in time but would an interesting model to explore in the future!