In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
pd.set_option('display.max_rows', None)

In [4]:
df = pd.read_csv('./datasets/train_cleaned_fe3.csv')
df_test = pd.read_csv('./datasets/test_cleaned_fe3.csv')

df.drop(columns='Unnamed: 0', inplace=True)

In [5]:
df.head()

Unnamed: 0,lot_area,street,land_cont,neighborhood,cond_1,cond_2,bldg_type,style,overall_cond,yr_built,yr_remodeled,roof_style,exter_cond,foundation,bsmt_cond,bsmt_fin_1,bsmt_fin_2,bsmt_sf,heat,cent_air,gr_liv_area,full_bath,half_bath,bedrooms_gr,kitchen,kitch_qual,fireplaces,garage_type,garage_car_size,paved_drive,sale_price,year_sold,fe_bed_full_bath,has_pool
0,13517,Pave,Lvl,Sawyer,RRA,Norm,1Fam,2Story,8,1976,2005,Gable,TA,CBlock,TA,GLQ,Unf,725.0,GasA,Y,1479,2,1,3,1,Gd,0,Attchd,2,Y,130500,2010,6,0
1,11492,Pave,Lvl,SawyerW,Norm,Norm,1Fam,2Story,5,1996,1997,Gable,TA,PConc,TA,GLQ,Unf,913.0,GasA,Y,2122,2,1,4,1,Gd,1,Attchd,2,Y,220000,2009,8,0
2,7922,Pave,Lvl,NAmes,Norm,Norm,1Fam,1Story,7,1953,2007,Gable,Gd,CBlock,TA,GLQ,Unf,1057.0,GasA,Y,1057,1,0,3,1,Gd,0,Detchd,1,Y,109000,2010,3,0
3,9802,Pave,Lvl,Timber,Norm,Norm,1Fam,2Story,5,2006,2007,Gable,TA,PConc,TA,Unf,Unf,384.0,GasA,Y,1444,2,1,3,1,TA,0,BuiltIn,2,Y,174000,2010,6,0
4,14235,Pave,Lvl,SawyerW,Norm,Norm,1Fam,Fin,8,1900,1993,Gable,TA,PConc,Gd,Unf,Unf,676.0,GasA,Y,1445,2,0,3,1,TA,0,Detchd,2,N,138500,2010,6,0


In [6]:
# df.dtypes

In [7]:
X1_features = ['lot_area', 
         'land_cont', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'yr_remodeled', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type', 
         'has_pool', 
         'year_sold'
        ]

X4_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'yr_built', 
         'exter_cond', 
         'bsmt_sf', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type',
       'fe_ov_cond_gr_liv_area'
        ]



X5_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'overall_cond',
         'exter_cond', 
         'bsmt_sf', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type',
       'fe_yr_blt_gr_liv_area',
               'year_sold'
        ]

X6_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type', 
         'year_sold',
         'cent_air'
        ]


X7_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'yr_remodeled', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type', 
         'year_sold',
         'cent_air'
        ]

X8_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'yr_remodeled', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type', 
         'cent_air'
        ]

X9_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'yr_remodeled', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
         'full_bath', 
         'half_bath', 
         'bedrooms_gr', 
         'kitch_qual', 
         'garage_type', 
         'cent_air',
               'fe_bed_full_bath'
        ]


X10_features = ['lot_area', 
         'neighborhood', 
         'cond_1', 
         'bldg_type', 
         'style', 
         'overall_cond', 
         'yr_built', 
         'yr_remodeled', 
         'exter_cond', 
         'bsmt_sf', 
         'gr_liv_area', 
 
         'half_bath', 

         'kitch_qual', 
         'garage_type', 
         'cent_air',
               'fe_bed_full_bath'
        ]

X10 = df[X10_features]
X10 = pd.get_dummies(data=X10, columns=['neighborhood', 
                                      'cond_1', 'bldg_type', 'style',
                                      'exter_cond', 'kitch_qual', 
                                      'garage_type', 'cent_air'], drop_first=True)

X10_test = df_test[X10_features]
X10_test = pd.get_dummies(data=X10_test, columns=['neighborhood', 
                                      'cond_1', 'bldg_type', 'style',
                                      'exter_cond', 'kitch_qual', 
                                      'garage_type', 'cent_air'], drop_first=True)

y = df['sale_price']

In [8]:
X10.shape

(2051, 55)

In [9]:
# X6.columns

In [10]:
X10_test.shape

(878, 55)

In [11]:
# X6_test.columns

In [12]:
# print(X1_test)
# X1_test.dtypes

In [13]:
# X1.dtypes

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X10, y, random_state=42)

In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [16]:
train_pred = lr.predict(X_train)

In [17]:
lr.score(X_train, y_train)

0.8356172057831173

In [18]:
lr.score(X_test, y_test)

0.8678744709894287

In [19]:
SalePrice = lr.predict(X10_test)

In [20]:
df_test['SalePrice'] = SalePrice

In [21]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,lot_area,street,land_cont,neighborhood,cond_1,cond_2,bldg_type,style,overall_cond,yr_built,yr_remodeled,roof_style,exter_cond,foundation,bsmt_cond,bsmt_fin_1,bsmt_fin_2,bsmt_sf,heat,cent_air,gr_liv_area,full_bath,half_bath,bedrooms_gr,kitchen,kitch_qual,fireplaces,garage_type,garage_car_size,paved_drive,Id,year_sold,fe_bed_full_bath,has_pool,SalePrice
0,0,9142,Pave,Lvl,OldTown,Norm,Norm,2fmCon,2Story,8,1910,1950,Gable,Fa,Other,TA,Unf,Unf,1020,GasA,N,1928,2,0,4,2,Fa,0,Detchd,1,Y,2658,2006,8,0,130915.402492
1,1,9662,Pave,Lvl,Sawyer,Norm,Norm,Duplex,1Story,4,1977,1977,Gable,TA,CBlock,TA,Unf,Unf,1967,GasA,Y,1967,2,0,6,2,TA,0,Attchd,2,Y,2718,2006,12,0,183822.956114
2,2,17104,Pave,Lvl,Gilbert,Norm,Norm,1Fam,2Story,5,2006,2006,Gable,TA,PConc,Gd,GLQ,Unf,654,GasA,Y,1496,2,1,3,1,Gd,1,Attchd,2,Y,2414,2006,6,0,189738.831787
3,3,8520,Pave,Lvl,OldTown,Norm,Norm,1Fam,1Story,6,1923,2006,Gable,TA,CBlock,TA,Unf,Unf,968,GasA,Y,968,1,0,2,1,TA,0,Detchd,2,N,1989,2007,2,0,109486.038431
4,4,9500,Pave,Lvl,NAmes,Norm,Norm,1Fam,1Story,5,1963,1963,Gable,TA,CBlock,TA,BLQ,Unf,1394,GasA,Y,1394,1,1,3,1,TA,2,Attchd,2,Y,625,2009,3,0,159330.425302


In [22]:
jsalisbury_10 = df_test[['Id', 'SalePrice']]

In [23]:
jsalisbury_10.head()

Unnamed: 0,Id,SalePrice
0,2658,130915.402492
1,2718,183822.956114
2,2414,189738.831787
3,1989,109486.038431
4,625,159330.425302


In [24]:
# lr = LinearRegression()

In [25]:
# lr.fit(X1, y)

In [26]:
# preds = lr.predict(X1)

In [27]:
# metrics.r2_score(y, preds)

In [28]:
# mse = metrics.mean_squared_error(y, preds)
# np.sqrt(mse)

In [29]:
jsalisbury_10.to_csv('./datasets/jsalisbury_10.csv', index=False)