In [1]:
import pandas as pd
from statsmodels.formula.api import ols as sm_ols
import numpy as np
import seaborn as sns
from statsmodels.iolib.summary2 import summary_col # nicer tables
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split



Use your new skills to estimate the model you think will produce the most accurate out-of-sample predictions possible.
1. Model the **natural log of the sale price** (`np.log`) using the `housing_train.csv` dataset.
1. Your model should focus on optimizing the **$R^2$**.
1. When you settle on your final model, load the `housing_holdout.csv` dataset and predict the log sale price. 
1. Save your predictions as a CSV file named "MY_PREDICTIONS.csv" in the "submission" folder.

In [2]:
# Read the training sample into `house` and show it as the cell output.
train_path = 'input_data2/housing_train.csv'
house = pd.read_csv(train_path)
house

Unnamed: 0,parcel,v_MS_SubClass,v_MS_Zoning,v_Lot_Frontage,v_Lot_Area,v_Street,v_Alley,v_Lot_Shape,v_Land_Contour,v_Utilities,...,v_Pool_Area,v_Pool_QC,v_Fence,v_Misc_Feature,v_Misc_Val,v_Mo_Sold,v_Yr_Sold,v_Sale_Type,v_Sale_Condition,v_SalePrice
0,1056_528110080,20,RL,107.0,13891,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,New,Partial,372402
1,1055_528108150,20,RL,98.0,12704,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,New,Partial,317500
2,1053_528104050,20,RL,114.0,14803,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,New,Partial,385000
3,2213_909275160,20,RL,126.0,13108,Pave,,IR2,HLS,AllPub,...,0,,,,0,6,2007,WD,Normal,153500
4,1051_528102030,20,RL,96.0,12444,Pave,,Reg,Lvl,AllPub,...,0,,,,0,11,2008,New,Partial,394617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1936,2524_534125210,190,RL,79.0,13110,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,7,2006,WD,Normal,146500
1937,2846_909131125,190,RH,,7082,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,160000
1938,2605_535382020,190,RL,60.0,10800,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,ConLD,Normal,160000
1939,1516_909101180,190,RL,55.0,5687,Pave,Grvl,Reg,Bnk,AllPub,...,0,,,,0,3,2008,WD,Normal,135900


In [3]:
# Modelling target: the natural log of the recorded sale price, stored as a
# new `log_sale` column on the training frame and echoed as the cell output.
sale_price = house['v_SalePrice']
house['log_sale'] = np.log(sale_price)
house['log_sale']

0       12.827729
1       12.668233
2       12.860999
3       11.941456
4       12.885671
          ...    
1936    11.894781
1937    11.982929
1938    11.982929
1939    11.819675
1940    11.467311
Name: log_sale, Length: 1941, dtype: float64

In [4]:
# OLS of log sale price on pool area, garage area, garage car capacity and the
# overall quality / condition scores, estimated on the training frame.
model = sm_ols(
    ' log_sale ~ v_Pool_Area + v_Garage_Area + v_Garage_Cars + v_Overall_Qual + v_Overall_Cond ',
    data=house,
).fit()

# Pass the training frame explicitly instead of calling predict() with no
# arguments. The no-argument form returns a bare array covering only the rows
# kept for estimation (rows with a missing regressor are dropped by the formula
# interface), so it can be shorter than `log_sale` and pair up with the wrong
# rows downstream. With the frame passed in, the result is a Series on
# `house`'s index, with NaN where a regressor is missing.
pred = model.predict(house)
log_sale = house['log_sale']

# Kept as the final expression so the regression table is actually rendered;
# a bare expression in the middle of a cell is evaluated and then discarded.
model.summary()


In [5]:
# Pair each observed log sale price with the model's prediction for the same
# position, then label the two resulting columns.
paired_rows = [[actual, fitted] for actual, fitted in zip(log_sale, pred)]
column_labels = {0: 'log_sale', 1: 'pred'}
output = pd.DataFrame(paired_rows).rename(columns=column_labels)
# Export left disabled. NOTE(review): `output` holds training-sample values,
# not predictions for the holdout file.
#output.to_csv('submission/MY_PREDICTIONS.csv')

In [6]:
# Read the holdout sample into `housing` and show it as the cell output.
holdout_path = 'input_data2/housing_holdout.csv'
housing = pd.read_csv(holdout_path)
housing

Unnamed: 0,parcel,v_MS_SubClass,v_MS_Zoning,v_Lot_Frontage,v_Lot_Area,v_Street,v_Alley,v_Lot_Shape,v_Land_Contour,v_Utilities,...,v_Screen_Porch,v_Pool_Area,v_Pool_QC,v_Fence,v_Misc_Feature,v_Misc_Val,v_Mo_Sold,v_Yr_Sold,v_Sale_Type,v_Sale_Condition
0,1_526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
1,988_924100040,20,RL,,9819,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2009,WD,Normal
2,984_923275140,20,RL,,8780,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2009,WD,Normal
3,977_923227080,20,RL,83.0,13383,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2009,WD,Normal
4,803_906203120,20,RL,90.0,14684,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,208_903476030,190,RM,76.0,7630,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2010,WD,Normal
985,207_903454060,190,RM,70.0,5600,Pave,,Reg,Lvl,AllPub,...,0,0,,,Othr,3500,7,2010,WD,Normal
986,187_902401060,190,RM,100.0,9045,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Abnorml
987,190_902402250,190,RM,60.0,11340,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [7]:

# predictions
# The holdout file carries no sale price, so the model fitted on the training
# data is applied to the holdout features to predict the log sale price.
# Rows missing any regressor used in the formula come back as NaN.
holdout_pred = model.predict(housing)

# One row per holdout parcel.
# NOTE(review): the 'parcel' / 'prediction' column layout is an assumption --
# confirm against the required submission format. The "submission" folder is
# assumed to exist already; to_csv does not create it.
submission = pd.DataFrame({'parcel': housing['parcel'], 'prediction': holdout_pred})
submission.to_csv('submission/MY_PREDICTIONS.csv', index=False)
submission.head()

In [8]:
# List the holdout column labels to see which variables are available.
housing.columns
# NOTE(review): the disabled line below would take the log of the `parcel`
# column rather than of a sale price; consider deleting it.
#housing['log_sale'] = np.log(housing['parcel'])

Index(['parcel', 'v_MS_SubClass', 'v_MS_Zoning', 'v_Lot_Frontage',
       'v_Lot_Area', 'v_Street', 'v_Alley', 'v_Lot_Shape', 'v_Land_Contour',
       'v_Utilities', 'v_Lot_Config', 'v_Land_Slope', 'v_Neighborhood',
       'v_Condition_1', 'v_Condition_2', 'v_Bldg_Type', 'v_House_Style',
       'v_Overall_Qual', 'v_Overall_Cond', 'v_Year_Built', 'v_Year_Remod/Add',
       'v_Roof_Style', 'v_Roof_Matl', 'v_Exterior_1st', 'v_Exterior_2nd',
       'v_Mas_Vnr_Type', 'v_Mas_Vnr_Area', 'v_Exter_Qual', 'v_Exter_Cond',
       'v_Foundation', 'v_Bsmt_Qual', 'v_Bsmt_Cond', 'v_Bsmt_Exposure',
       'v_BsmtFin_Type_1', 'v_BsmtFin_SF_1', 'v_BsmtFin_Type_2',
       'v_BsmtFin_SF_2', 'v_Bsmt_Unf_SF', 'v_Total_Bsmt_SF', 'v_Heating',
       'v_Heating_QC', 'v_Central_Air', 'v_Electrical', 'v_1st_Flr_SF',
       'v_2nd_Flr_SF', 'v_Low_Qual_Fin_SF', 'v_Gr_Liv_Area',
       'v_Bsmt_Full_Bath', 'v_Bsmt_Half_Bath', 'v_Full_Bath', 'v_Half_Bath',
       'v_Bedroom_AbvGr', 'v_Kitchen_AbvGr', 'v_Kitchen_Qual',

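### I do not see a sale price column/variable for the holdout dataset

This is expected: the holdout set omits `v_SalePrice` because that is the value to be predicted. The model fitted on the training data should be applied to the holdout features (e.g. `model.predict(housing)`) to produce the predicted log sale prices.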

In [9]:
# NOTE(review): everything in this cell is commented out. It repeats the OLS
# specification and in-sample prediction steps that appear earlier in the
# notebook, so it can probably be deleted -- confirm before removing.
# model = sm_ols(' log_sale ~ v_Pool_Area + v_Garage_Area + v_Garage_Cars + v_Overall_Qual + v_Overall_Cond ', data=house).fit()
# model.summary()

# pred = model.predict()
# log_sale = house['log_sale']
