In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from scipy import stats
from sklearn.compose import ColumnTransformer

In [2]:
# Read the training CSV into a DataFrame and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../Data/train_clean.csv')
train.head(n=5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,0.0,13517,0,0,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,0,0,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,0,0,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,0,0,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,0,0,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [3]:
# (rows, columns) of the training frame.
train.shape

(2051, 81)

In [4]:
# Count of missing values in each column of the training frame.
train.isna().sum()

id              0
pid             0
ms_subclass     0
ms_zoning       0
lot_frontage    0
               ..
misc_val        0
mo_sold         0
yr_sold         0
sale_type       0
saleprice       0
Length: 81, dtype: int64

In [5]:
# Read the test CSV into a DataFrame and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../Data/test_clean.csv')
test.head(n=5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,0,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0.0,9662,0,0,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,0,0,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,0,0,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0.0,9500,0,0,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [6]:
# (rows, columns) of the test frame.
test.shape

(878, 80)

In [7]:
# Count of missing values in each column of the test frame.
test.isna().sum()

id              0
pid             0
ms_subclass     0
ms_zoning       0
lot_frontage    0
               ..
misc_feature    0
misc_val        0
mo_sold         0
yr_sold         0
sale_type       0
Length: 80, dtype: int64

In [8]:
# Predictor columns used by the model. The order is significant: it fixes the
# column order of X and of any frame later selected with this list.
xcols = [
    'overall_qual',
    'total_bsmt_sf',
    '1st_flr_sf',
    'gr_liv_area',
    'garage_cars',
    'garage_area',
    'utilities',
    'land_slope',
    'neighborhood',
    'house_style',
    'roof_style',
    'exterior_1st',
    'exter_qual',
    'bsmtfin_type_1',
    'heating',
    'heating_qc',
    'electrical',
    'kitchen_qual',
    'functional',
    'garage_type',
    'garage_finish',
    'garage_qual',
    'garage_cond',
    'pool_qc',
    'misc_feature',
]

# Feature matrix (selected predictors) and regression target.
X = train.loc[:, xcols]
y = train.loc[:, 'saleprice']

In [9]:
# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1452,
)

In [10]:
# Print feature/target shapes for the training split, then the validation split.
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

(1435, 25) (1435,)
(616, 25) (616,)


In [11]:
# Columns that are expanded into indicator (one-hot) columns.
onehot_cols = [
    'overall_qual', 'garage_cars', 'utilities', 'land_slope', 'neighborhood',
    'house_style', 'roof_style', 'exterior_1st', 'exter_qual', 'bsmtfin_type_1',
    'heating', 'heating_qc', 'electrical', 'kitchen_qual', 'functional',
    'garage_type', 'garage_finish', 'garage_qual', 'garage_cond', 'pool_qc',
    'misc_feature',
]

# drop='first' omits one indicator per feature; handle_unknown='ignore' lets
# transform() accept categories that were not present when the encoder was fit.
onehot = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Columns not listed in onehot_cols are passed through unchanged.
ct = ColumnTransformer(
    transformers=[('oh', onehot, onehot_cols)],
    remainder='passthrough',
)

In [12]:
# Fit the column transformer on the training split and transform it in one step,
# then apply the already-fitted transformer to the validation split.
X_ct_train = ct.fit_transform(X_train)
X_ct_val = ct.transform(X_val)



In [13]:
# Select the same predictor columns from the test frame.
# NOTE(review): despite its name, this holds input features, not predictions.
test_preds = test[xcols]

In [14]:
# Apply the fitted column transformer to the test features (no refitting).
test_ct_preds = ct.transform(test_preds)



In [15]:
# Fit a linear regression on the transformed training features.
lr = LinearRegression()
lr.fit(X=X_ct_train, y=y_train)

In [16]:
# Model score (R^2 for LinearRegression) on the training split.
lr.score(X_ct_train, y_train)

0.9000275971774883

In [17]:
# Model score (R^2 for LinearRegression) on the held-out validation split.
lr.score(X_ct_val, y_val)

0.8775586664970825

In [18]:
# Predict the target for the transformed test features and display the array.
preds = lr.predict(test_ct_preds)
preds

array([111910.05652926, 186267.64127228, 196769.85440157,  96596.37340401,
       163551.27543212,  85512.52413067,  95753.50182117, 165980.01008125,
       196028.32463059, 159311.45693964, 169799.7467173 , 113231.46299428,
       150872.28371534, 273606.36348867,  99972.98443219, 122251.7183922 ,
       139304.87256832, 118695.29929974, 164029.5195371 , 190214.31900794,
       141217.87956529, 128581.63620398, 189856.56997579, 208192.54243158,
       163108.78435405, 121562.33306719, 148462.53107086, 147663.93125297,
       162399.41451726,  57904.78604295, 114953.06972265, 106208.69681771,
       211154.23638151, 144408.08135506, 211292.92844525, 176516.6585639 ,
       115364.03699947,  53171.95271539, 114429.0867104 , 192044.75691458,
       151721.98409784, 204368.09354143, 145403.10361065, 131891.42636043,
       221158.13395404,  77518.25212889, 213604.5825683 , 121201.13890395,
       129989.15750517, 141611.46353538, 103313.29842343, 203242.34201414,
       246355.63272074, 1

In [19]:
# Attach the predictions to the test frame as a new 'SalePrice' column
# (this modifies `test` in place), then preview the result.
test['SalePrice'] = preds
test.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,SalePrice
0,2658,902301120,190,RM,69.0,9142,0,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,111910.056529
1,2718,905108090,90,RL,0.0,9662,0,0,IR1,Lvl,...,0,0,,,,0,8,2006,WD,186267.641272
2,2414,528218130,60,RL,58.0,17104,0,0,IR1,Lvl,...,0,0,,,,0,9,2006,New,196769.854402
3,1989,902207150,30,RM,60.0,8520,0,0,Reg,Lvl,...,0,0,,,,0,7,2007,WD,96596.373404
4,625,535105100,20,RL,0.0,9500,0,0,IR1,Lvl,...,185,0,,,,0,7,2009,WD,163551.275432


In [20]:
# Rename the identifier column from 'id' to 'Id'.
# The result is rebound rather than using inplace=True, so the rename is an
# explicit assignment instead of a hidden mutation of a frame that earlier
# cells have already displayed. Re-running the cell is harmless: a missing
# 'id' label is ignored by rename's default errors='ignore'.
test = test.rename(columns={'id': 'Id'})

In [21]:
# Build the submission frame: index by 'Id', keep only the predicted price column.
submission_3 = test.set_index('Id')[['SalePrice']]

In [22]:
# Preview the first rows of the submission frame.
submission_3.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,111910.056529
2718,186267.641272
2414,196769.854402
1989,96596.373404
625,163551.275432


In [23]:
# (rows, columns) of the submission frame.
submission_3.shape

(878, 1)

In [24]:
# Write the submission to CSV; the 'Id' index is written as the first column.
submission_3.to_csv(path_or_buf='juddy_submit_6.csv', index=True)