### Import Libraries

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# modeling imports
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNetCV, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn import decomposition

### Import raw training data 

In [2]:
#ames = pd.read_csv('../datasets/train.csv')

In [3]:
ames = pd.read_csv('../datasets/test.csv')

In [4]:
ames.shape

(878, 80)

In [5]:
#want to change columns names to lower/snake_case so a little easier to work with
ames.columns = ames.columns.str.lower().str.replace(' ', '_')
ames.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [6]:
#increase pandas default max_rows so I can view all columns at once
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

### First I want to get some more basic info, then i'll dive into the problem variables 

### Now, deeper dive into variables with nulls 

#### Lot Frontage 

In [7]:
#replace np.nan with average for lot_config of that type
ames.loc[(ames['lot_frontage'].isna()) & (ames['lot_config'] == 'Corner'), 'lot_frontage'] = 83.25
ames.loc[(ames['lot_frontage'].isna()) & (ames['lot_config'] == 'CulDSac'), 'lot_frontage'] = 54.734375        
ames.loc[(ames['lot_frontage'].isna()) & (ames['lot_config'] == 'FR2'), 'lot_frontage'] = 60.836735        
ames.loc[(ames['lot_frontage'].isna()) & (ames['lot_config'] == 'FR3'), 'lot_frontage'] = 87.000000        
ames.loc[(ames['lot_frontage'].isna()) & (ames['lot_config'] == 'Inside'), 'lot_frontage'] = 66.759571        

#### Alley

In [8]:
ames['alley'] = ames['alley'].replace(np.nan, 'NA') 

#### mas_vnr_type & mas_vnr_area.
#### Masonry veneer type and area

In [9]:
ames['mas_vnr_type'] = ames['mas_vnr_type'].replace(np.nan, 'None') #replace masonry np.nan with None
ames['mas_vnr_area'] = ames['mas_vnr_area'].replace(np.nan, 0) #replace masonry np.nan with 0

#### Basement related:  
* bsmt_qual (Evaluates the height of the basement), 
* bsmt_cond (Evaluates the general condition of the basement), 
* bsmt_exposure (Refers to walkout or garden level walls)
* bsmtfin_type_1
* bsmtfin_sf_1
* bsmtfin_type_2
* bsmtfin_sf_2
* bsmt_unf_sf
* total_bsmt_sf
* bsmt_full_bath
* bsmt_half_bath

In [10]:
#these all receive an NA if no basement
basement_to_na = ['bsmt_qual', 'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 
       'bsmtfin_type_2']

#these receive 0sqft if basement is NA
basement_to_zero = ['bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf', 'bsmtfin_sf_1', 'bsmt_full_bath',
       'bsmt_half_bath']

for i in basement_to_na:
    ames[i] = ames[i].replace(np.nan, 'NA') 

In [11]:
#set basement sqft to 0 IF basement is NA, else drop as incomplete basement data and only 1 null record for each    
for i in basement_to_zero:
    ames[i] = ames[i].replace(np.nan, 0) 

#### fireplace_qu

In [12]:
len(ames[(ames['fireplace_qu'].isna()) & (ames['fireplaces'] < 1)]) #number of houses w/o fireplace = fireplace_qu np.nan!

422

In [13]:
ames['fireplace_qu'] = ames['fireplace_qu'].replace(np.nan, 'NA') #NA for 'No Fireplace'

#### Garage related:
* garage_type
* garage_yr_blt
* garage_finish
* garage_cars
* garage_area
* garage_qual
* garage_cond


In [14]:
#create two lists, one for catagorical and one for numeric

garage_to_na = ['garage_type','garage_finish','garage_qual','garage_cond']
garage_to_0 = ['garage_yr_blt','garage_cars', 'garage_area']

for i in garage_to_na:
    ames[i] = ames[i].replace(np.nan, 'NA') 

In [15]:
ames[(ames['garage_yr_blt'].isna()) & (ames['garage_type'] =='NA')] #113 rows, whats up with the missing 1 to make 114?

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
29,1904,534451020,50,RL,51.0,3500,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Feedr,Norm,1Fam,1.5Fin,3,5,1945,1950,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,No,LwQ,144,Unf,0,226,370,GasA,TA,N,FuseA,442,228,0,670,1,0,1,0,2,1,Fa,4,Typ,0,,,,,0,0,,,N,0,21,0,0,0,0,,MnPrv,Shed,2000,7,2007,WD
45,979,923228150,160,RM,21.0,1533,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,Twnhs,2Story,4,6,1970,2008,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,546,546,GasA,TA,Y,SBrkr,798,546,0,1344,0,0,1,1,3,1,TA,6,Typ,1,TA,,,,0,0,,,Y,0,0,0,0,0,0,,,,0,5,2009,WD
66,2362,527403120,20,RL,83.25,8125,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,4,4,1971,1971,Gable,CompShg,HdBoard,HdBoard,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,614,Unf,0,244,858,GasA,TA,Y,SBrkr,858,0,0,858,0,0,1,0,3,1,TA,5,Typ,0,,,,,0,0,,,Y,0,0,0,0,0,0,,,,0,6,2006,WD
68,2188,908226180,30,RH,70.0,4270,Pave,,Reg,Bnk,AllPub,Inside,Mod,Edwards,Norm,Norm,1Fam,1Story,3,6,1931,2006,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,BrkTil,TA,TA,No,Rec,544,Unf,0,0,544,GasA,Ex,Y,SBrkr,774,0,0,774,0,0,1,0,3,1,Gd,6,Typ,0,,,,,0,0,,,Y,0,0,286,0,0,0,,,,0,5,2007,WD
105,1988,902207010,30,RM,40.0,3880,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,9,1945,1997,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,ALQ,329,Unf,0,357,686,GasA,Gd,Y,SBrkr,866,0,0,866,0,0,1,0,2,1,Gd,4,Typ,0,,,,,0,0,,,Y,58,42,0,0,0,0,,,,0,8,2007,WD
109,217,905101300,90,RL,72.0,10773,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,4,3,1967,1967,Gable,Tar&Grv,Plywood,Plywood,BrkFace,72.0,Fa,Fa,CBlock,TA,TA,No,ALQ,704,Unf,0,1128,1832,GasA,TA,N,SBrkr,1832,0,0,1832,2,0,2,0,4,2,TA,8,Typ,0,,,,,0,0,,,Y,0,58,0,0,0,0,,,,0,5,2010,WD
113,2908,923205120,20,RL,90.0,17217,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,1Story,5,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,1140,1140,GasA,Ex,Y,SBrkr,1140,0,0,1140,0,0,1,0,3,1,TA,6,Typ,0,,,,,0,0,,,Y,36,56,0,0,0,0,,,,0,7,2006,WD
144,1507,908250040,50,RL,57.0,8050,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1.5Fin,5,8,1947,1993,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,Gd,Slab,,,,,0,,0,0,0,GasA,Gd,Y,SBrkr,929,208,0,1137,0,0,1,1,4,1,TA,8,Min1,0,,,,,0,0,,,Y,0,0,0,0,0,0,,,,0,4,2008,WD
152,1368,903476110,50,RM,60.0,5586,Pave,,IR1,Bnk,AllPub,Inside,Gtl,OldTown,Feedr,Norm,1Fam,1.5Fin,6,7,1920,1998,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,BrkTil,TA,TA,No,Unf,0,Unf,0,901,901,GasA,Gd,Y,SBrkr,1088,110,0,1198,0,0,1,0,4,1,TA,7,Typ,0,,,,,0,0,,,N,0,98,0,0,0,0,,MnPrv,,0,9,2008,ConLD
156,332,923228270,160,RM,21.0,1900,Pave,,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,Norm,Norm,TwnhsE,2Story,4,4,1970,1970,Gable,CompShg,CemntBd,CmentBd,,0.0,TA,TA,CBlock,TA,TA,No,Unf,0,Unf,0,546,546,GasA,Ex,Y,SBrkr,546,546,0,1092,0,0,1,1,3,1,TA,5,Typ,0,,,,,0,0,,,Y,0,0,0,0,0,0,,,,0,6,2010,WD


In [16]:
for i in garage_to_0:
    ames[i] = ames[i].replace(np.nan, 0) 

#### pool_qc

In [17]:
for i in ames[(ames['pool_qc'].isna()) & (ames['pool_area'] == 0)]:
    ames['pool_qc'] = ames['pool_qc'].replace(np.nan, 'NA')     

#### fence

In [18]:
ames['fence'] = ames['fence'].replace(np.nan, 'NA') 

#### misc_feature

In [19]:
for i in ames[(ames['misc_feature'].isna()) & (ames['misc_val'] == 0)]:
    ames['misc_feature'] = ames['misc_feature'].replace(np.nan, 'NA')  

### Final Null Check

In [20]:
ames.isnull().sum()

id                 0
pid                0
ms_subclass        0
ms_zoning          0
lot_frontage       0
lot_area           0
street             0
alley              0
lot_shape          0
land_contour       0
utilities          0
lot_config         0
land_slope         0
neighborhood       0
condition_1        0
condition_2        0
bldg_type          0
house_style        0
overall_qual       0
overall_cond       0
year_built         0
year_remod/add     0
roof_style         0
roof_matl          0
exterior_1st       0
exterior_2nd       0
mas_vnr_type       0
mas_vnr_area       0
exter_qual         0
exter_cond         0
foundation         0
bsmt_qual          0
bsmt_cond          0
bsmt_exposure      0
bsmtfin_type_1     0
bsmtfin_sf_1       0
bsmtfin_type_2     0
bsmtfin_sf_2       0
bsmt_unf_sf        0
total_bsmt_sf      0
heating            0
heating_qc         0
central_air        0
electrical         1
1st_flr_sf         0
2nd_flr_sf         0
low_qual_fin_sf    0
gr_liv_area  

## Check datatypes against data dictionary

#### Ordinal Data Transformations

In [21]:
#ordinal data. Will want to convert these to ordinal lists.

#columns I want to replace values in
ames_ordinal_str_columns = ames[['lot_shape','utilities','land_slope','exter_qual', 
                 'exter_cond','bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1','bsmtfin_type_2','heating_qc','electrical','kitchen_qual',
                'functional','fireplace_qu','garage_finish','garage_qual','garage_cond','paved_drive','pool_qc','fence']]


#dictionary of values I want to replace
dict_ordinal = {
    #overall_qual, overall_cond,exter_qual,exter_cond,bsmt_qual,bsmt_cond,bsmt_exposure,heatingqc,kitchenqual
    #firplacequ,garage_qual,garage_cond,pool_qc,fence
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'NA': 0,
     #lot_shape
    'Reg':4,
    'IR1':3,
    'IR2':2,
    'IR3':1,
    #utilities
    'AllPub':4,
    'NoSewr':3,
    'NoSeWa':2,
    'ELO':1,
    #land_slope
    'Gtl':3,
    'Mod':2,
    'Sev':1,
    #bsmt_exposure
    'Av': 3,
    'Mn': 2,
    'No': 1,
    #bsmtfin_type_1, bsmtfin_type_1
    'GLQ':6,
    'ALQ':5,
    'BLQ':4,
    'Rec':3,
    'LwQ':2,
    'Unf':1,
    #Electrical
    'SBrkr':1,
    'FuseA':0,
    'FuseF':0,
    'FuseP':0,
    'Mix':0,
    #Functional
    'Typ':1,
    'Min1':0,
    'Min2':0,
    'Mod':0,
    'Maj1':0,
    'Maj2':0,
    'Sev':0,
    'Sal':0,
    #garage_finish    
    'Fin': 3,
    'RFn': 2,
    'Unf':1,
    #paved_drive
    'Y' : 2,
    'P' : 1,
    'N' : 0,
    #fence
    'GdPrv':4,
    'MnPrv':3,
    'GdWo':2,
    'MnWw':1   
}

#loop to get the job done
for i in ames_ordinal_str_columns.columns:
    ames.replace({i: dict_ordinal},inplace = True)

## Feature Creation

In [22]:
#I don't want to dummy neighboorhood, but I do want to bin it into oridinal columns based on mean price
ames['neighborhood_order'] = ames['neighborhood']
ames_ordinal_str_columns = ames[['neighborhood_order']]

dict_ordinal = {
    'MeadowV': 1,
    'IDOTRR' : 1,
    'BrDale' : 1,
    'OldTown': 1,
    'BrkSide': 1,
    'Edwards': 1,
    'SWISU': 1,
    'Landmrk':2,
    'Sawyer':2,
    'NPkVill':2,
    'Blueste':2,
    'NAmes':2,
    'Mitchel':2,
    'SawyerW':2,
    'Greens':3,
    'Gilbert':3,
    'NWAmes':3,
    'Blmngtn':3,
    'CollgCr':3,
    'Crawfor':3,
    'ClearCr':3,
    'Somerst':4,
    'Timber':4,
    'Veenker':4,
    'GrnHill':4,
    'StoneBr':4,
    'NoRidge':4,
    'NridgHt':4,
        }

#loop to get the job done
for i in ames_ordinal_str_columns.columns:
    ames.replace({i: dict_ordinal},inplace = True)

In [23]:
#I don't want to dummy neighboorhood, but I do want to bin it into oridinal columns based on mean price
ames['sale_type_order'] = ames['sale_type']
ames_ordinal_str_columns = ames[['sale_type_order']]

dict_ordinal = {
   'ConLw':1,
    'Oth':1,
    'ConLD':1,
    'COD':1,
    'WD':2,
    'WD ':2,
    'CWD':3,
    'ConLI':3,
    'Con':3,
    'New':3
        }

#loop to get the job done
for i in ames_ordinal_str_columns.columns:
    ames.replace({i: dict_ordinal},inplace = True)

In [24]:
ames['yr*rem'] = ames['year_remod/add'] * ames['year_built']

In [25]:
ames['basement_overall'] = ames['bsmt_qual'] *ames['bsmt_exposure']

In [26]:
ames['bsmt_type*sf_all'] = (ames['bsmtfin_type_1']*ames['bsmtfin_sf_1']) + (ames['bsmtfin_type_2']*ames['bsmtfin_sf_2']) #quality*sf interaction variable

In [27]:
ames[ames['bsmt_type*sf_all'] > 20000] #outliers maybe to drop

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,neighborhood_order,sale_type_order,yr*rem,basement_overall,bsmt_type*sf_all


In [28]:
ames['garage_qual*cond'] = ames['garage_qual']*ames['garage_cond']

In [29]:
ames['garage_fin*sqft'] = ames['garage_area']*ames['garage_finish'] #this is the man cave

In [30]:
ames['quality_above_sqft'] = (ames['1st_flr_sf']+ames['2nd_flr_sf']) - ames['low_qual_fin_sf']

In [31]:
ames['bsmt_baths'] = ames['bsmt_full_bath']+(ames['bsmt_half_bath']/2) #combine basment baths

In [32]:
ames['above_baths'] = ames['full_bath']+(ames['half_bath']/2) #combine upstairs baths

In [33]:
ames['all_baths'] = ames['above_baths'] + ames['bsmt_baths'] #I like this all bath feature, more how I think about baths in a home.

In [34]:
ames[ames['all_baths'] == 7] #outliers

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,neighborhood_order,sale_type_order,yr*rem,basement_overall,bsmt_type*sf_all,garage_qual*cond,garage_fin*sqft,quality_above_sqft,bsmt_baths,above_baths,all_baths
13,818,906230030,90,RL,66.759571,7976,Pave,,4,Lvl,4,Inside,3,SawyerW,Feedr,Norm,Duplex,2Story,7,5,2000,2000,Hip,CompShg,VinylSd,VinylSd,BrkFace,23.0,3,3,PConc,5,3,1,6,820,1,0,348,1168,GasA,5,Y,1.0,1168,1619,0,2787,2,0,4,2,6,2,3,8,1,2,3,BuiltIn,2000.0,3,4,820,3,3,2,312,0,0,0,0,0,0,0,,0,10,2009,WD,2,2,4000000,5,4920,9,2460,2787,2.0,5.0,7.0


In [35]:
ames['room_size'] = ames['gr_liv_area'] / (ames['totrms_abvgrd']) #sqft per room w/o baths. This performs better than with baths.

In [36]:
ames['kitchen_abvgr'].value_counts() #most every house has 1. Not worth including. 

1    835
2     41
3      1
0      1
Name: kitchen_abvgr, dtype: int64

In [37]:
#combine different porches. Trying to measure developed outside space for houses. 
ames['developed_outside_sf'] = ames['open_porch_sf']+ ames['screen_porch'] + ames['enclosed_porch'] +ames['3ssn_porch'] +ames['wood_deck_sf'] 

In [38]:
ames['mas_vnr_type_dummy'] = ames['mas_vnr_type']
ames_ordinal_str_columns = ames[['mas_vnr_type_dummy']]

dict_ordinal = {
   'BrkCmn':0,
    'None':0,
    'CBlock':0,
    'BrkFace':1,
    'Stone':1,   
        }

#loop to get the job done
for i in ames_ordinal_str_columns.columns:
    ames.replace({i: dict_ordinal},inplace = True)

In [39]:
ames['age'] = ames['yr_sold'] - ames['year_built'] #age of house at time of sale

In [40]:
ames['was_remodeled'] =  ames['year_remod/add'] - ames['year_built'] #dummy for if house was remodeled
ames['was_remodeled'] = [1 if i> 0 else 0 for i in ames['was_remodeled']];

In [41]:
ames['is_residential'] = ames['ms_zoning']
ames_ordinal_str_columns = ames[['is_residential']]

dict_ordinal = {
   'A (agr)':0,
    'I (all)':0,
    'C (all)':0,
    'RM':1,
    'RH':1,
    'RL':1,
    'FV':1,
        }

#loop to get the job done
for i in ames_ordinal_str_columns.columns:
    ames.replace({i: dict_ordinal},inplace = True)

#### Catagorical Data Transformations

In [42]:
ames_nominal_columns = ['ms_subclass', 'ms_zoning', 'street', 'alley', 'land_contour',
       'lot_config', 'neighborhood', 'condition_1', 'condition_2', 'bldg_type',
       'house_style', 'roof_style', 'roof_matl', 'exterior_1st',
       'exterior_2nd', 'mas_vnr_type', 'foundation', 'heating', 'central_air',
       'garage_type', 'misc_feature', 'sale_type','heating']

### DROP Outliers

In [43]:
ames[ames['id'].isin([1499, 2181])]

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,neighborhood_order,sale_type_order,yr*rem,basement_overall,bsmt_type*sf_all,garage_qual*cond,garage_fin*sqft,quality_above_sqft,bsmt_baths,above_baths,all_baths,room_size,developed_outside_sf,mas_vnr_type_dummy,age,was_remodeled,is_residential


In [44]:
#dropping sqft outliers for basement/frontage/first floor

ames = ames[ames['id'] != 1499]
ames = ames[ames['id'] != 2181]

In [45]:
ames.shape

(878, 97)

In [46]:
ames.dtypes

id                        int64
pid                       int64
ms_subclass               int64
ms_zoning                object
lot_frontage            float64
lot_area                  int64
street                   object
alley                    object
lot_shape                 int64
land_contour             object
utilities                 int64
lot_config               object
land_slope                int64
neighborhood             object
condition_1              object
condition_2              object
bldg_type                object
house_style              object
overall_qual              int64
overall_cond              int64
year_built                int64
year_remod/add            int64
roof_style               object
roof_matl                object
exterior_1st             object
exterior_2nd             object
mas_vnr_type             object
mas_vnr_area            float64
exter_qual                int64
exter_cond                int64
foundation               object
bsmt_qua

In [47]:
ames.corr()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,exter_qual,exter_cond,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating_qc,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_val,mo_sold,yr_sold,neighborhood_order,yr*rem,basement_overall,bsmt_type*sf_all,garage_qual*cond,garage_fin*sqft,quality_above_sqft,bsmt_baths,above_baths,all_baths,room_size,developed_outside_sf,mas_vnr_type_dummy,age,was_remodeled,is_residential
id,1.0,0.16812,-0.022195,0.040535,0.028352,-0.014167,0.017105,-0.043001,-0.021307,0.023618,-0.026613,-0.04205,-0.023498,-0.018423,0.011977,-0.027828,-0.011353,0.06867,-0.055185,-0.003672,-0.044132,-0.009195,-0.003255,-0.011222,0.00641,-0.052272,0.003436,0.016146,0.019628,0.017611,-0.064013,0.051464,-0.017217,-0.034426,0.027434,-0.0502,-0.022374,0.026685,-0.027398,0.021915,-0.004326,0.016719,-0.009006,-0.009492,-0.014802,0.001865,0.015857,0.027584,-0.017837,0.045393,0.018011,-0.030013,-0.009833,0.044071,0.03144,0.018282,0.009808,0.145787,-0.976564,-0.043654,-0.036697,0.059558,-0.018432,0.005374,-0.014428,0.015109,-0.053403,-0.028102,-0.057578,-0.016436,0.006191,-0.049368,-0.01601,0.065972,0.013468
pid,0.16812,1.0,0.003912,-0.111108,0.049926,0.123297,0.033814,-0.169524,-0.261138,0.099593,-0.336595,-0.111894,-0.195966,-0.224786,0.040077,-0.159985,-0.052351,0.079817,-0.150708,-0.133389,0.023867,0.024698,-0.030732,-0.162465,-0.063994,-0.108749,-0.140333,-0.001699,0.006596,-0.105007,-0.055896,0.021051,-0.156601,-0.175203,-0.000735,0.094267,-0.223537,-0.036274,-0.068307,-0.081229,-0.118562,-0.181048,-0.284462,-0.261336,-0.236341,-0.209565,-0.194244,-0.249839,-0.076886,-0.056141,0.18851,-0.025302,0.01603,-0.028212,-0.043322,0.091393,-0.038484,-0.095156,0.012456,-0.387943,-0.276749,0.026005,-0.126849,-0.216572,-0.275072,-0.106013,-0.051861,-0.20213,-0.197622,-0.124887,0.005554,-0.193308,0.336328,0.197497,-0.097384
ms_subclass,-0.022195,0.003912,1.0,-0.453635,-0.15201,0.058063,0.030614,-0.000555,0.049421,-0.060973,0.038803,0.039624,0.016716,0.017197,-0.050547,0.090853,0.01714,0.092476,0.047262,-0.059001,-0.031962,-0.077742,-0.10594,-0.20536,-0.000746,0.030378,-0.255449,0.301318,0.009775,0.068955,-0.013335,0.03348,0.119307,0.166807,-0.05715,0.270343,-0.010573,0.02563,-0.002483,-0.039311,-0.031173,-0.084692,-0.024807,-0.037282,-0.091309,-0.083402,-0.099519,-0.036447,-0.064931,-0.00261,0.00924,-0.056814,-0.080303,0.000311,-0.010827,-0.07537,-0.033519,-0.030822,0.016369,-0.054662,0.043998,0.084276,-0.048411,-0.087702,-0.063232,0.067799,-0.006142,0.166916,0.131282,0.127852,-0.082201,0.036004,-0.037997,-0.034083,-0.030983
lot_frontage,0.040535,-0.111108,-0.453635,1.0,0.335517,-0.199988,0.013301,-0.02442,0.228303,-0.105135,0.133036,0.09666,0.197006,0.166115,0.015169,0.123231,-0.003507,0.078877,0.023495,0.185617,0.021271,0.037753,0.086909,0.302741,0.078926,0.067371,0.390401,0.056759,0.026599,0.339499,0.086925,-0.018166,0.224338,0.045902,0.261967,-0.016048,0.168559,0.299175,-0.001909,0.227568,0.236166,0.113075,0.211027,0.302368,0.323577,0.144852,0.117257,0.116449,0.100902,0.091332,0.038744,0.006363,0.070333,0.218909,0.215165,-0.005803,-0.013971,0.073649,-0.054026,0.276121,0.132987,0.119659,0.184154,0.140659,0.308713,0.336579,0.08384,0.21004,0.224874,0.160585,0.158673,0.118562,-0.13507,-0.064147,-0.003946
lot_area,0.028352,0.049926,-0.15201,0.335517,1.0,-0.269882,-0.070639,-0.252109,0.080864,-0.058826,0.001246,-0.02825,0.057724,0.005498,-0.018964,0.052325,0.02891,0.168633,0.030264,0.152526,0.111556,0.146833,-0.004976,0.220063,-0.001748,0.036539,0.260506,0.030126,0.001833,0.218893,0.143519,0.025423,0.129551,0.009662,0.138673,-0.030849,0.013015,0.18392,-0.050618,0.216608,0.172706,0.055171,0.092335,0.128794,0.138601,0.063602,0.051252,0.013237,0.168038,0.035088,0.032907,0.012255,0.039328,0.065079,0.057234,-0.025813,0.033565,0.000416,-0.012974,0.177153,-0.011982,0.149433,0.166045,0.053825,0.128333,0.218984,0.150557,0.114763,0.19113,0.120924,0.173652,0.000377,-0.001808,-0.032498,-0.04298
lot_shape,-0.014167,0.123297,0.058063,-0.199988,-0.269882,1.0,0.035142,0.090317,-0.268165,0.102101,-0.274861,-0.169417,-0.053585,-0.178089,0.028842,-0.263174,-0.096601,-0.242032,-0.106582,-0.127598,-0.057354,-0.069238,-0.018861,-0.184673,-0.107336,-0.1171,-0.130615,-0.099543,0.057556,-0.178177,-0.065252,-0.038762,-0.202766,-0.127917,-0.085624,0.093997,-0.150289,-0.113917,-0.091288,-0.168044,-0.194089,-0.092444,-0.27406,-0.225162,-0.173779,-0.099473,-0.104453,-0.115843,-0.188245,-0.118814,0.094961,-0.000498,-0.000714,0.006767,0.013736,0.08795,-0.028208,0.013225,0.044744,-0.337668,-0.261208,-0.275014,-0.166572,-0.109896,-0.269035,-0.185857,-0.074407,-0.223355,-0.229519,-0.149434,-0.151286,-0.060225,0.276149,0.110304,-0.017309
utilities,0.017105,0.033814,0.030614,0.013301,-0.070639,0.035142,1.0,-0.007663,0.001237,0.106728,0.020555,0.053558,0.019164,0.022896,0.007706,0.016756,-0.005425,0.019904,0.040506,0.034008,0.009591,0.010194,-0.086642,-0.048839,0.040352,0.120421,-0.029551,0.027186,0.002831,0.001529,0.028029,0.00821,0.032627,0.026622,-0.00491,0.007216,0.025515,-0.011545,0.118442,-0.069714,-0.041593,-0.005921,0.026753,-0.01159,-0.003913,-0.009735,-0.009283,-0.01089,0.02605,0.023326,-0.055321,0.003516,0.009453,0.002129,0.002243,0.016185,-0.04308,0.015378,-0.004434,-0.019447,0.037819,0.022035,0.031908,-0.010256,0.020815,0.001166,0.030114,0.038281,0.050651,0.014493,0.00809,0.026204,-0.020699,0.031606,-0.003027
land_slope,-0.043001,-0.169524,-0.000555,-0.02442,-0.252109,0.090317,-0.007663,1.0,-0.007117,-0.03115,0.025564,0.016776,-0.020069,0.040652,-0.01966,-0.020936,-0.064028,-0.257833,-0.053257,-0.131715,-0.10885,-0.086359,0.10959,-0.064033,0.019493,-0.002801,-0.107324,0.024551,-0.049065,-0.061687,-0.110558,-0.10576,0.030358,-0.008963,0.036923,0.048497,-0.005601,0.018325,0.055396,-0.105729,-0.059552,-0.025478,0.020946,-0.000481,-0.00778,-0.042781,-0.032234,-0.034118,-0.081462,-0.015409,0.008347,-0.003869,-0.093875,-0.01111,-0.070424,0.048394,0.005619,0.015422,0.002016,-0.025749,0.025324,-0.213251,-0.139392,-0.040824,0.026205,-0.055459,-0.134846,0.022539,-0.069609,-0.130957,-0.09694,0.001762,-0.025415,-0.009735,-0.020344
overall_qual,-0.021307,-0.261138,0.049421,0.228303,0.080864,-0.268165,0.001237,-0.007117,1.0,-0.122401,0.582,0.532024,0.404529,0.716261,-0.030896,0.610964,0.20882,0.338058,0.268458,0.288637,-0.025778,-0.069156,0.252908,0.536911,0.449366,0.215525,0.471428,0.26829,-0.040658,0.576049,0.148762,-0.027035,0.535192,0.254276,0.088541,-0.134264,0.634998,0.372976,0.206175,0.405243,0.494288,0.284074,0.551865,0.627462,0.560325,0.302203,0.301321,0.23981,0.250373,0.263036,-0.113804,-0.015024,0.023043,0.104038,0.132421,-0.172637,-0.037613,0.056443,-0.039724,0.697368,0.631759,0.472358,0.357653,0.323217,0.674881,0.582146,0.144369,0.557216,0.545705,0.49414,0.257014,0.380607,-0.58233,-0.091007,0.115601
overall_cond,0.023618,0.099593,-0.060973,-0.105135,-0.058826,0.102101,0.106728,-0.03115,-0.122401,1.0,-0.363553,0.060317,-0.132937,-0.141667,0.385696,-0.201195,0.085876,-0.071017,-0.007736,-0.06157,0.072978,0.026855,-0.151703,-0.210064,-0.035731,0.016581,-0.171535,-0.00283,0.025977,-0.127846,-0.050505,0.045133,-0.202914,-0.076106,0.002435,-0.066631,-0.052708,-0.080252,0.07795,-0.086543,-0.128721,0.012752,-0.189305,-0.211422,-0.189795,-0.011271,0.0543,-0.077333,0.043534,-0.104338,0.00314,0.082982,0.035966,-0.048916,-0.049681,0.185995,0.081235,-0.016344,-0.006384,-0.248265,-0.220029,-0.137642,-0.074028,0.014424,-0.252955,-0.131383,-0.041143,-0.203392,-0.191661,-0.131227,0.014931,-0.172572,0.362399,0.268938,0.124434


In [48]:
ames.columns

Index(['id', 'pid', 'ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area',
       'street', 'alley', 'lot_shape', 'land_contour', 'utilities',
       'lot_config', 'land_slope', 'neighborhood', 'condition_1',
       'condition_2', 'bldg_type', 'house_style', 'overall_qual',
       'overall_cond', 'year_built', 'year_remod/add', 'roof_style',
       'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
       'mas_vnr_area', 'exter_qual', 'exter_cond', 'foundation', 'bsmt_qual',
       'bsmt_cond', 'bsmt_exposure', 'bsmtfin_type_1', 'bsmtfin_sf_1',
       'bsmtfin_type_2', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       'heating', 'heating_qc', 'central_air', 'electrical', '1st_flr_sf',
       '2nd_flr_sf', 'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath',
       'bsmt_half_bath', 'full_bath', 'half_bath', 'bedroom_abvgr',
       'kitchen_abvgr', 'kitchen_qual', 'totrms_abvgrd', 'functional',
       'fireplaces', 'fireplace_qu', 'garage_type', 'garage_yr_blt',
       'g

In [49]:
#print(dummy_options)

In [50]:
ames_eval = ames[['id','is_residential','neighborhood_order','age','was_remodeled','overall_qual','exter_qual','mas_vnr_type_dummy', 'mas_vnr_area',
                  'functional', 'lot_frontage', 'lot_area', 'exter_cond','developed_outside_sf'
                  ,'garage_qual','garage_fin*sqft','garage_cars', 'paved_drive',
                 'bsmt_qual','bsmt_type*sf_all', 'bsmt_cond', 'bsmt_exposure',
                 'heating_qc','electrical','kitchen_qual', 'fireplace_qu',
                 'quality_above_sqft', 'bedroom_abvgr','totrms_abvgrd', 'all_baths','room_size']]

In [51]:
#ames_eval.to_csv('../datasets/ames_clean_v2.csv', index = False)

In [52]:
ames_eval.dtypes

id                        int64
is_residential            int64
neighborhood_order        int64
age                       int64
was_remodeled             int64
overall_qual              int64
exter_qual                int64
mas_vnr_type_dummy        int64
mas_vnr_area            float64
functional                int64
lot_frontage            float64
lot_area                  int64
exter_cond                int64
developed_outside_sf      int64
garage_qual               int64
garage_fin*sqft           int64
garage_cars               int64
paved_drive               int64
bsmt_qual                 int64
bsmt_type*sf_all          int64
bsmt_cond                 int64
bsmt_exposure             int64
heating_qc                int64
electrical              float64
kitchen_qual              int64
fireplace_qu              int64
quality_above_sqft        int64
bedroom_abvgr             int64
totrms_abvgrd             int64
all_baths               float64
room_size               float64
dtype: o

In [53]:
#just a final check
ames_eval.fillna(0,inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


# LassoCV Model

In [54]:
ames_clean = pd.read_csv('../datasets/ames_clean_v4.csv')

In [87]:
X_train = ames_clean.drop(columns=['saleprice'])
y_train = ames_clean['saleprice']
X_test = ames_eval.drop(columns= ['id'])

In [88]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(2051, 30)
(2051,)
(878, 30)


In [147]:
pipe_power = Pipeline([
    ('poly', PolynomialFeatures()),
    ('sc', StandardScaler()),
    ('lassocv', Lasso())
])

In [151]:
lasso_params1 = {'lassocv__alpha':np.logspace(-1, 4, 50),
               'lassocv__max_iter':[10000],
                 'lassocv__tol':[1]
                }

In [152]:
pipe_gridsearch = GridSearchCV(
                                pipe_power, 
                                param_grid=lasso_params1,
                            )

In [153]:
pipe_gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('sc', StandardScaler()),
                                       ('lassocv', Lasso())]),
             param_grid={'lassocv__alpha': array([1.00000000e-01, 1.26485522e-01, 1.59985872e-01, 2.02358965e-01,
       2.55954792e-01, 3.23745754e-01, 4.09491506e-01, 5.17947468e-01,
       6.55128557e-01, 8.28642773e-01, 1.04811313e+00, 1.32571137e+00,
       1.67683294e+00, 2.12...
       7.19685673e+01, 9.10298178e+01, 1.15139540e+02, 1.45634848e+02,
       1.84206997e+02, 2.32995181e+02, 2.94705170e+02, 3.72759372e+02,
       4.71486636e+02, 5.96362332e+02, 7.54312006e+02, 9.54095476e+02,
       1.20679264e+03, 1.52641797e+03, 1.93069773e+03, 2.44205309e+03,
       3.08884360e+03, 3.90693994e+03, 4.94171336e+03, 6.25055193e+03,
       7.90604321e+03, 1.00000000e+04]),
                         'lassocv__max_iter': [10000], 'lassocv__tol': [1]})

In [154]:
pipe_gridsearch.best_score_

0.8731830058633532

In [155]:
pipe_gridsearch.best_estimator_

Pipeline(steps=[('poly', PolynomialFeatures()), ('sc', StandardScaler()),
                ('lassocv',
                 Lasso(alpha=22.229964825261955, max_iter=10000, tol=1))])

In [156]:
preds = pipe_gridsearch.predict(X_test)

In [157]:
ames_eval['saleprice'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ames_eval['saleprice'] = preds


In [158]:
submission = ames_eval[['id', 'saleprice']]

In [159]:
submission.head(3)

Unnamed: 0,id,saleprice
0,2658,154978.489695
1,2718,154716.771141
2,2414,222222.747033


In [160]:
submission.to_csv('../datasets/submission_poly_lasso_iteration_2.csv', index = False) #THIS IS THE BEST SCORE YET!!!!

In [None]:
#pd.DataFrame(list(zip(X.columns, lasso_cv.coef_)))

# Ridge Model

In [127]:
pipe_ridge = Pipeline([
    #('poly', PolynomialFeatures()),
    ('sc', StandardScaler()),
    ('ridge', Ridge())
])

In [128]:
ridge_params1 = {'ridge__alpha':np.logspace(-5, 5, 1000),
               'ridge__max_iter':[10000]}

In [130]:
pipe_ridge_gridsearch = GridSearchCV(
                                pipe_ridge, 
                                param_grid=ridge_params1,
                            )

In [131]:
pipe_ridge_gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('sc', StandardScaler()),
                                       ('ridge', Ridge())]),
             param_grid={'ridge__alpha': array([1.00000000e-05, 1.02331658e-05, 1.04717682e-05, 1.07159340e-05,
       1.09657929e-05, 1.12214777e-05, 1.14831241e-05, 1.17508713e-05,
       1.20248614e-05, 1.23052400e-05, 1.25921561e-05, 1.28857621e-05,
       1.31862140e-05, 1.34936714e-05, 1.38082977e-05, 1.413025...
       5.88531578e+04, 6.02254120e+04, 6.16296626e+04, 6.30666554e+04,
       6.45371540e+04, 6.60419396e+04, 6.75818117e+04, 6.91575883e+04,
       7.07701066e+04, 7.24202233e+04, 7.41088152e+04, 7.58367791e+04,
       7.76050334e+04, 7.94145172e+04, 8.12661920e+04, 8.31610415e+04,
       8.51000725e+04, 8.70843150e+04, 8.91148232e+04, 9.11926760e+04,
       9.33189772e+04, 9.54948564e+04, 9.77214697e+04, 1.00000000e+05]),
                         'ridge__max_iter': [10000]})

In [132]:
pipe_ridge_gridsearch.best_score_

0.8382442972749985

In [133]:
pipe_ridge_gridsearch.best_estimator_

Pipeline(steps=[('sc', StandardScaler()),
                ('ridge', Ridge(alpha=249.68784288843267, max_iter=10000))])

In [135]:
preds = pipe_ridge_gridsearch.predict(X_test)

In [137]:
ames_eval['saleprice'] = preds
ridge_submission = ames_eval[['id', 'saleprice']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ames_eval['saleprice'] = preds


In [138]:
ridge_submission.head(3)

Unnamed: 0,id,saleprice
0,2658,135638.468033
1,2718,169307.600039
2,2414,227345.540892


In [139]:
submission.to_csv('../datasets/submission_5ridge_iteration_2.csv', index = False)

# ElasticNetCV Model

In [112]:
std_slc = StandardScaler()

In [113]:
pca = decomposition.PCA()

In [114]:
elasticnet = linear_model.ElasticNet()

In [119]:
pipe_mario = Pipeline(steps=[('sc', StandardScaler()),
                           ('pca', decomposition.PCA()),
                           ('elasticnet', ElasticNet())])

In [120]:
n_components = list(range(1,X.shape[1]+1,1))
normalize = [True, False]
selection = ['cyclic', 'random']

In [124]:
parameters_mario = dict(pca__n_components=n_components,
                      elasticnet__normalize=normalize,
                      elasticnet__selection=selection)

In [125]:
clf_EN = GridSearchCV(pipe_mario, parameters)
clf_EN.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('sc', StandardScaler()), ('pca', PCA()),
                                       ('elasticnet', ElasticNet())]),
             param_grid={'elasticnet__normalize': [True, False],
                         'elasticnet__selection': ['cyclic', 'random'],
                         'pca__n_components': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                               11, 12, 13, 14, 15, 16, 17, 18,
                                               19, 20, 21, 22, 23, 24, 25, 26,
                                               27, 28, 29, 30]})

In [126]:
clf_EN.best_score_

0.8347419202004488

In [140]:
clf_EN.best_estimator_

Pipeline(steps=[('sc', StandardScaler()), ('pca', PCA(n_components=27)),
                ('elasticnet', ElasticNet())])

In [141]:
preds = clf_EN.predict(X_test)

In [142]:
ames_eval['saleprice'] = preds
ridge_submission = ames_eval[['id', 'saleprice']]
ridge_submission.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ames_eval['saleprice'] = preds


Unnamed: 0,id,saleprice
0,2658,131121.580357
1,2718,171335.803874
2,2414,227877.488009


In [143]:
submission.to_csv('../datasets/submission_elastic_iteration_2.csv', index = False)

# Experimental

In [None]:
('poly', PolynomialFeatures())

In [72]:
#https://alfurka.github.io/2018-11-18-grid-search/
def test(models, iterations = 1000):
    results = {}
    for i in models:
        r2_train = []
        r2_test = []
        for j in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(X, 
                                                                y, 
                                                                test_size= 0.01)
            r2_test.append(metrics.r2_score(y_test,
                                            models[i].fit(X_train, 
                                                         y_train).predict(X_test)))
            r2_train.append(metrics.r2_score(y_train, 
                                             models[i].fit(X_train, 
                                                          y_train).predict(X_train)))
        results[i] = [np.mean(r2_train), np.mean(r2_test)]
    return pd.DataFrame(results)

In [73]:
lasso_params = {'alpha':[0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha':[200, 230, 250,265, 270, 275, 290, 300, 500]}

models = {'OLS': linear_model.LinearRegression(),
           'Lasso': GridSearchCV(linear_model.Lasso(), 
                               param_grid=lasso_params).fit(X,y).best_estimator_,
           'Ridge': GridSearchCV(linear_model.Ridge(), 
                               param_grid=ridge_params).fit(X,y).best_estimator_,}

In [74]:
test(models)

Unnamed: 0,OLS,Lasso,Ridge
0,0.848125,0.848064,0.846194
1,0.786055,0.788354,0.787438


In [None]:
ames_eval['saleprice'] = preds

In [None]:
submission = ames_eval[['id', 'saleprice']]

In [None]:
print(submission.shape)
submission.head()

In [None]:
#submission.to_csv('../datasets/submission_3_iteration_2.csv', index = False)

In [None]:
lasso_cv.coef_

In [None]:
pd.DataFrame(list(zip(X.columns, lasso_cv.coef_)))