In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from scipy import stats

In [2]:
# Read the training CSV into a DataFrame and preview its first five rows.
house = pd.read_csv(filepath_or_buffer='../Data/train_clean.csv')
house.head(n=5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,533352170,60,RL,0.0,13517,0,0,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,0,0,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,0,0,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,0,0,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,0,0,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [3]:
# Read the test CSV into a DataFrame and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../Data/test_clean.csv')
test.head(n=5)

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,0,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0.0,9662,0,0,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,0,0,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,0,0,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0.0,9500,0,0,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
# (rows, columns) of the test frame.
test.shape

(878, 80)

In [5]:
# (rows, columns) of the training frame.
house.shape

(2051, 81)

In [6]:
# Numeric feature columns fed to the regression model.
xcols = [
    'overall_qual',
    # floor-area measurements
    'total_bsmt_sf',
    '1st_flr_sf',
    'gr_liv_area',
    # garage capacity / size
    'garage_cars',
    'garage_area',
]

In [7]:
# Restrict the training frame to the selected feature columns for inspection.
house_types = house.loc[:, xcols]
house_types.head(n=5)

Unnamed: 0,overall_qual,total_bsmt_sf,1st_flr_sf,gr_liv_area,garage_cars,garage_area
0,6,725.0,725,1479,2.0,475.0
1,7,913.0,913,2122,2.0,559.0
2,5,1057.0,1057,1057,1.0,246.0
3,5,384.0,744,1444,2.0,400.0
4,6,676.0,831,1445,2.0,484.0


In [8]:
# Check the dtype of each selected feature column.
house_types.dtypes

overall_qual       int64
total_bsmt_sf    float64
1st_flr_sf         int64
gr_liv_area        int64
garage_cars      float64
garage_area      float64
dtype: object

In [9]:
# Summary statistics for each selected feature column.
house_types.describe()

Unnamed: 0,overall_qual,total_bsmt_sf,1st_flr_sf,gr_liv_area,garage_cars,garage_area
count,2051.0,2051.0,2051.0,2051.0,2051.0,2051.0
mean,6.11214,1057.471965,1164.488055,1499.330083,1.775719,473.440761
std,1.426271,449.908003,396.446923,500.447829,0.765357,216.135102
min,1.0,0.0,334.0,334.0,0.0,0.0
25%,5.0,793.0,879.5,1129.0,1.0,319.0
50%,6.0,994.0,1093.0,1444.0,2.0,480.0
75%,7.0,1318.5,1405.0,1728.5,2.0,576.0
max,10.0,6110.0,5095.0,5642.0,5.0,1418.0


In [10]:
# Count missing values per selected feature column.
house_types.isna().sum()

overall_qual     0
total_bsmt_sf    0
1st_flr_sf       0
gr_liv_area      0
garage_cars      0
garage_area      0
dtype: int64

In [11]:
# Target vector (sale price) and feature matrix (the columns listed in xcols).
y = house['saleprice']
X = house.loc[:, xcols]

In [13]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1222,
)

In [14]:
# Shapes of the training features and training target.
print(X_train.shape, y_train.shape)

(1435, 6) (1435,)


In [15]:
# Shapes of the validation features and validation target.
print(X_val.shape, y_val.shape)

(616, 6) (616,)


In [16]:
# Linear regression estimator with default settings.
lr = LinearRegression()

In [17]:
# Fit the model on the training split only.
lr.fit(X_train, y_train)

In [18]:
# Score (R^2 for LinearRegression) on the training split.
lr.score(X_train, y_train)

0.7729453689039859

In [19]:
# Score (R^2 for LinearRegression) on the held-out validation split.
# It is close to the training score, so the model does not appear to be
# overfitting with these features.
lr.score(X_val, y_val)

0.787933961442823

In [None]:
# Inspect the dtypes of the feature matrix used for modelling.
# (The previous `X_dummies` name is never defined in this notebook, so the
# cell raised NameError on a fresh kernel.)
X.dtypes

In [25]:
# Predict on the test frame using the same feature columns the model was fit on.
preds = lr.predict(test[xcols])

In [26]:
# Rename the identifier column to 'Id' for the submission file.
# Rebinding the result (instead of inplace=True) keeps the cell free of
# in-place mutation of a frame that earlier cells already displayed.
test = test.rename(columns={'id': 'Id'})

In [27]:
# Attach the model predictions as a 'SalePrice' column and preview the result.
test = test.assign(SalePrice=preds)
test.head(n=5)

Unnamed: 0,Id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,SalePrice
0,2658,902301120,190,RM,69.0,9142,0,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,184605.383214
1,2718,905108090,90,RL,0.0,9662,0,0,IR1,Lvl,...,0,0,,,,0,8,2006,WD,211149.830198
2,2414,528218130,60,RL,58.0,17104,0,0,IR1,Lvl,...,0,0,,,,0,9,2006,New,185882.495675
3,1989,902207150,30,RM,60.0,8520,0,0,Reg,Lvl,...,0,0,,,,0,7,2007,WD,129986.943957
4,625,535105100,20,RL,0.0,9500,0,0,IR1,Lvl,...,185,0,,,,0,7,2009,WD,187952.037724


In [28]:
# Build the submission frame: one 'SalePrice' column indexed by 'Id'.
submission_columns = ['Id', 'SalePrice']
submission_2 = test.loc[:, submission_columns].set_index(keys='Id')

In [29]:
# Preview the first rows of the submission frame.
submission_2.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,184605.383214
2718,211149.830198
2414,185882.495675
1989,129986.943957
625,187952.037724


In [30]:
# (rows, columns) of the submission frame.
submission_2.shape

(878, 1)

In [None]:
# Export is currently disabled: uncomment the line below to write the
# submission CSV. 'Id' is the index, so to_csv writes it as the first column.
#submission_2.to_csv('juddy_submit_2.csv')