In [19]:
import numpy as np
import pandas as pd
pd.options.mode.copy_on_write = True

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

import plotly.express as px

In [20]:
train = pd.read_csv('https://marksmath.org/data/reconstructed_train.csv')
test = pd.read_csv('https://marksmath.org/data/reconstructed_test.csv')

In [21]:
train

Unnamed: 0,Id,SalePrice,GrLivArea,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,208500,1710,60,RL,65.0,8450,Pave,,Reg,...,0,0,,,,0,2,2008,WD,Normal
1,2,181500,1262,20,RL,80.0,9600,Pave,,Reg,...,0,0,,,,0,5,2007,WD,Normal
2,3,223500,1786,60,RL,68.0,11250,Pave,,IR1,...,0,0,,,,0,9,2008,WD,Normal
3,4,140000,1717,70,RL,60.0,9550,Pave,,IR1,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,250000,2198,60,RL,84.0,14260,Pave,,IR1,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,175000,1647,60,RL,62.0,7917,Pave,,Reg,...,0,0,,,,0,8,2007,WD,Normal
1456,1457,210000,2073,20,RL,85.0,13175,Pave,,Reg,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1457,1458,266500,2340,70,RL,66.0,9042,Pave,,Reg,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1458,1459,142125,1078,20,RL,68.0,9717,Pave,,Reg,...,0,0,,,,0,4,2010,WD,Normal


In [22]:
px.scatter(train, x = 'GrLivArea', y = 'SalePrice',
    width=800, height=500,
    trendline='ols', trendline_color_override='black')

In [23]:
regress = LinearRegression()
X = train[['GrLivArea']]
Y = train['SalePrice']
regress.fit(X,Y)

In [24]:
y_predicted = regress.predict(train[['GrLivArea']])
log_y_predicted = [np.log(y) for y in y_predicted]
y_true = train['SalePrice']
log_y_true = y_true.apply(np.log)
mean_squared_error(log_y_predicted, log_y_true)**0.5

0.27557677046374385

In [25]:
y_predicted = regress.predict(test[['GrLivArea']])
submission = pd.DataFrame()
submission['Id'] = range(len(train) + 1, len(train) + len(test) + 1)
submission['SalePrice'] = y_predicted
submission

Unnamed: 0,Id,SalePrice
0,1461,114557.827490
1,1462,160945.272922
2,1463,193084.380612
3,1464,190406.121638
4,1465,155695.885333
...,...,...
1451,2912,135555.377847
1452,2913,135555.377847
1453,2914,149696.585231
1454,2915,122485.474053


In [26]:
submission.to_csv('submission.csv', index=False)

In [27]:
train.columns

Index(['Id', 'SalePrice', 'GrLivArea', 'MSSubClass', 'MSZoning', 'LotFrontage',
       'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
     

In [28]:
train.Fence

0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
1455      NaN
1456    MnPrv
1457    GdPrv
1458      NaN
1459      NaN
Name: Fence, Length: 1460, dtype: object

In [29]:
test_columns = set(test.dropna(axis=1).columns)
train_columns = set(train.dropna(axis=1).columns)
full_columns = list(test_columns.intersection(train_columns))
full_columns.sort()
full_columns

['1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'BedroomAbvGr',
 'BldgType',
 'CentralAir',
 'Condition1',
 'Condition2',
 'EnclosedPorch',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fireplaces',
 'Foundation',
 'FullBath',
 'Functional',
 'GrLivArea',
 'HalfBath',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'Id',
 'KitchenAbvGr',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotArea',
 'LotConfig',
 'LotShape',
 'LowQualFinSF',
 'MSSubClass',
 'MSZoning',
 'MiscVal',
 'MoSold',
 'Neighborhood',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PavedDrive',
 'PoolArea',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'ScreenPorch',
 'Street',
 'TotRmsAbvGrd',
 'Utilities',
 'WoodDeckSF',
 'YearBuilt',
 'YearRemodAdd',
 'YrSold']

In [30]:
predictors = ['GrLivArea','LotArea', 'ExterQual','ExterCond', 'Neighborhood','KitchenQual','YearBuilt','YearRemodAdd','Street','SaleCondition','PoolArea','OverallQual','Utilities','3SsnPorch']
X = train[predictors]
X

Unnamed: 0,GrLivArea,LotArea,ExterQual,ExterCond,Neighborhood,KitchenQual,YearBuilt,YearRemodAdd,Street,SaleCondition,PoolArea,OverallQual,Utilities,3SsnPorch
0,1710,8450,Gd,TA,CollgCr,Gd,2003,2003,Pave,Normal,0,7,AllPub,0
1,1262,9600,TA,TA,Veenker,TA,1976,1976,Pave,Normal,0,6,AllPub,0
2,1786,11250,Gd,TA,CollgCr,Gd,2001,2002,Pave,Normal,0,7,AllPub,0
3,1717,9550,TA,TA,Crawfor,Gd,1915,1970,Pave,Abnorml,0,7,AllPub,0
4,2198,14260,Gd,TA,NoRidge,Gd,2000,2000,Pave,Normal,0,8,AllPub,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1647,7917,TA,TA,Gilbert,TA,1999,2000,Pave,Normal,0,6,AllPub,0
1456,2073,13175,TA,TA,NWAmes,TA,1978,1988,Pave,Normal,0,6,AllPub,0
1457,2340,9042,Ex,Gd,Crawfor,Gd,1941,2006,Pave,Normal,0,7,AllPub,0
1458,1078,9717,TA,TA,NAmes,Gd,1950,1996,Pave,Normal,0,5,AllPub,0


In [31]:
quality_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
quality_encoder.fit([['Ex'], ['Gd'], ['TA'], ['Fa'], ['Po'], ['NA']])

In [32]:
numeric = ['GrLivArea','YearRemodAdd','YearBuilt','PoolArea','LotArea']
quality = ['ExterQual','KitchenQual','SaleCondition','OverallQual','ExterCond']
nominal = ['Neighborhood','Street']

encode = ColumnTransformer(
    transformers=[
        ('numeric', 'passthrough', numeric),
        ('quality', quality_encoder, quality),
        ('nominal', OneHotEncoder(handle_unknown='ignore'), nominal)
    ]
)

In [33]:
regression = LinearRegression()
pipe = Pipeline(steps = [
        ("encode", encode),
        ('regression', regression)
    ]
)

In [34]:
Y = train['SalePrice']
Y = Y.apply(np.log)
pipe.fit(X,Y)

In [35]:
test['SalePrice'] = pipe.predict(test)
test['SalePrice'] = test['SalePrice'].apply(np.exp)
submit = test[['Id', 'SalePrice']]
submit

Unnamed: 0,Id,SalePrice
0,1461,123312.270962
1,1462,154412.732475
2,1463,161920.742684
3,1464,179528.579355
4,1465,232094.790586
...,...,...
1451,2912,91037.951812
1452,2913,82283.083012
1453,2914,129013.504912
1454,2915,137812.607464


In [36]:
submit.to_csv('submission1.csv', index=False)