In [1]:
import pandas as pd
import pandasql as pdsql
import seaborn as sns
import signac
import produce_data
import random_forest

from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error

sns.set_theme(style="whitegrid")

### Dataset with minimum R-squared value 0.1

In [2]:
training_dataset, test_X, ids_test = produce_data.produce_init("10rsq_final_data.csv")

In [3]:
X = training_dataset.drop(['SalePrice'], axis=1)
y = training_dataset['SalePrice']
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2)

In [4]:
display(X.head())
display(X.describe())

Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,GarageYrBlt,MasVnrArea,Fireplaces,HeatingQC,BsmtFinSF1,Foundation,GarageType,LotFrontage,WoodDeckSF,2ndFlrSF
0,7,1710,2,2,548,856,2,856,2,2,...,2003.0,196.0,0,0,706,2,1,65.0,0,854
1,6,1262,2,3,460,1262,2,1262,3,2,...,1976.0,0.0,1,0,978,1,1,80.0,298,0
2,7,1786,2,2,608,920,2,920,2,2,...,2001.0,162.0,1,0,486,2,1,68.0,0,866
3,7,1717,3,3,642,756,4,961,2,1,...,1998.0,0.0,1,2,216,0,5,60.0,0,756
4,8,2198,3,2,836,1145,2,1145,2,2,...,2000.0,350.0,1,0,655,2,1,84.0,192,1053


Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,GarageYrBlt,MasVnrArea,Fireplaces,HeatingQC,BsmtFinSF1,Foundation,GarageType,LotFrontage,WoodDeckSF,2ndFlrSF
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,6.099315,1515.463699,1.767123,2.539726,472.980137,1057.429452,2.724658,1162.626712,2.339726,1.565068,...,1978.478082,103.681507,0.613014,1.538356,443.639726,1.396575,2.485616,70.041096,94.244521,346.992466
std,1.382997,525.480383,0.747315,0.693995,213.804841,438.705324,1.27863,386.587738,0.830161,0.550916,...,23.994863,180.56912,0.644666,1.739524,456.098091,0.722394,1.933206,22.024031,125.338794,436.528436
min,1.0,334.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,...,1900.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0
25%,5.0,1129.5,1.0,2.0,334.5,795.75,2.0,882.0,2.0,1.0,...,1962.0,0.0,0.0,0.0,0.0,1.0,1.0,60.0,0.0,0.0
50%,6.0,1464.0,2.0,3.0,480.0,991.5,2.0,1087.0,3.0,2.0,...,1978.0,0.0,1.0,0.0,383.5,1.0,1.0,70.0,0.0,0.0
75%,7.0,1776.75,2.0,3.0,576.0,1298.25,4.0,1391.25,3.0,2.0,...,2001.0,164.25,1.0,4.0,712.25,2.0,5.0,79.0,168.0,728.0
max,10.0,5642.0,4.0,3.0,1418.0,6110.0,4.0,4692.0,3.0,3.0,...,2010.0,1600.0,3.0,4.0,5644.0,5.0,6.0,313.0,857.0,2065.0


In [5]:
display(y.head())
display(y.describe())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [6]:
y_pred, r2, mse, hyperparams = random_forest.validate_model(train_X, train_y, val_X, val_y)

Generating Model... Please wait
Model Rendered


#### R-squared 

In [7]:
r2

0.7273836347063488

#### Mean Squared Log Error

In [8]:
mse

0.028783836057227737

#### Submission

In [9]:
final_predict, _, _ = random_forest.model(X, y, test_X, None, hyperparams=hyperparams)
produce_data.produce_submission(ids_test, final_predict, "10_rsq_randomforest_submission.csv")

#### Store data to jobs

In [10]:
job = produce_data.get_job('10%')
job.sp['val_r_squared'] = 0.72
job.sp['val_mean_squared_log_error'] = 0.028
job.sp['submit_mean_squared_log_error'] = 0.14915

### Dataset with minimum R-squared value 0.05

In [11]:
training_dataset, test_X, ids_test = produce_data.produce_init("5rsq_final_data.csv")

In [12]:
X = training_dataset.drop(['SalePrice'], axis=1)
y = training_dataset['SalePrice']
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2)

In [13]:
display(X.head())
display(X.describe())

Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,2ndFlrSF,OpenPorchSF,BsmtExposure,HalfBath,LotArea,LotShape,CentralAir,Electrical,PavedDrive,BsmtFullBath
0,7,1710,2,2,548,856,2,856,2,2,...,854,61,4,1,8450,3,1,5,2,1
1,6,1262,2,3,460,1262,2,1262,3,2,...,0,0,1,0,9600,3,1,5,2,0
2,7,1786,2,2,608,920,2,920,2,2,...,866,42,3,1,11250,0,1,5,2,1
3,7,1717,3,3,642,756,4,961,2,1,...,756,35,4,0,9550,0,1,5,2,1
4,8,2198,3,2,836,1145,2,1145,2,2,...,1053,84,0,1,14260,0,1,5,2,1


Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,2ndFlrSF,OpenPorchSF,BsmtExposure,HalfBath,LotArea,LotShape,CentralAir,Electrical,PavedDrive,BsmtFullBath
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,6.099315,1515.463699,1.767123,2.539726,472.980137,1057.429452,2.724658,1162.626712,2.339726,1.565068,...,346.992466,46.660274,2.989041,0.382877,10516.828082,1.942466,0.934932,4.59589,1.856164,0.425342
std,1.382997,525.480383,0.747315,0.693995,213.804841,438.705324,1.27863,386.587738,0.830161,0.550916,...,436.528436,66.256028,1.552337,0.502885,9981.264932,1.409156,0.246731,1.328647,0.496592,0.518911
min,1.0,334.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1300.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,1129.5,1.0,2.0,334.5,795.75,2.0,882.0,2.0,1.0,...,0.0,0.0,2.0,0.0,7553.5,0.0,1.0,5.0,2.0,0.0
50%,6.0,1464.0,2.0,3.0,480.0,991.5,2.0,1087.0,3.0,2.0,...,0.0,25.0,4.0,0.0,9478.5,3.0,1.0,5.0,2.0,0.0
75%,7.0,1776.75,2.0,3.0,576.0,1298.25,4.0,1391.25,3.0,2.0,...,728.0,68.0,4.0,1.0,11601.5,3.0,1.0,5.0,2.0,1.0
max,10.0,5642.0,4.0,3.0,1418.0,6110.0,4.0,4692.0,3.0,3.0,...,2065.0,547.0,4.0,2.0,215245.0,3.0,1.0,5.0,2.0,3.0


In [14]:
display(y.head())
display(y.describe())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [15]:
y_pred, r2, mse, hyperparams = random_forest.validate_model(train_X, train_y, val_X, val_y)

Generating Model... Please wait
Model Rendered


#### R-squared 

In [16]:
r2

0.8771162019964527

#### Mean Squared Log Error

In [17]:
mse

0.019531040019506406

#### Submission

In [18]:
final_predict, _, _ = random_forest.model(X, y, test_X, None, hyperparams=hyperparams)
produce_data.produce_submission(ids_test, final_predict, "5_rsq_randomforest_submission.csv")

#### Store data to jobs

In [20]:
job = produce_data.get_job('5%')
job.sp['val_r_squared'] = 0.88
job.sp['val_mean_squared_log_error'] = 0.019
job.sp['submit_mean_squared_log_error'] = 0.14725

### Dataset with minimum R-squared value 0.02

In [21]:
training_dataset, test_X, ids_test = produce_data.produce_init("2rsq_final_data.csv")

In [22]:
X = training_dataset.drop(['SalePrice'], axis=1)
y = training_dataset['SalePrice']
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2)

In [23]:
display(X.head())
display(X.describe())

Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,PavedDrive,BsmtFullBath,RoofStyle,BsmtUnfSF,SaleCondition,Neighborhood,HouseStyle,BedroomAbvGr,MSZoning,GarageCond
0,7,1710,2,2,548,856,2,856,2,2,...,2,1,1,150,4,5,5,3,3,5
1,6,1262,2,3,460,1262,2,1262,3,2,...,2,0,1,284,4,24,2,3,3,5
2,7,1786,2,2,608,920,2,920,2,2,...,2,1,1,434,4,5,5,3,3,5
3,7,1717,3,3,642,756,4,961,2,1,...,2,1,1,540,0,6,5,3,3,5
4,8,2198,3,2,836,1145,2,1145,2,2,...,2,1,1,490,4,15,5,4,3,5


Unnamed: 0,OverallQual,GrLivArea,GarageCars,ExterQual,GarageArea,TotalBsmtSF,BsmtQual,1stFlrSF,KitchenQual,FullBath,...,PavedDrive,BsmtFullBath,RoofStyle,BsmtUnfSF,SaleCondition,Neighborhood,HouseStyle,BedroomAbvGr,MSZoning,GarageCond
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,6.099315,1515.463699,1.767123,2.539726,472.980137,1057.429452,2.724658,1162.626712,2.339726,1.565068,...,1.856164,0.425342,1.410274,567.240411,3.770548,12.25137,3.038356,2.866438,3.028767,4.763014
std,1.382997,525.480383,0.747315,0.693995,213.804841,438.705324,1.27863,386.587738,0.830161,0.550916,...,0.496592,0.518911,0.834998,441.866955,1.100854,6.013735,1.911305,0.815778,0.632017,0.80267
min,1.0,334.0,0.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,1129.5,1.0,2.0,334.5,795.75,2.0,882.0,2.0,1.0,...,2.0,0.0,1.0,223.0,4.0,7.0,2.0,2.0,3.0,5.0
50%,6.0,1464.0,2.0,3.0,480.0,991.5,2.0,1087.0,3.0,2.0,...,2.0,0.0,1.0,477.5,4.0,12.0,2.0,3.0,3.0,5.0
75%,7.0,1776.75,2.0,3.0,576.0,1298.25,4.0,1391.25,3.0,2.0,...,2.0,1.0,1.0,808.0,4.0,17.0,5.0,3.0,3.0,5.0
max,10.0,5642.0,4.0,3.0,1418.0,6110.0,4.0,4692.0,3.0,3.0,...,2.0,3.0,5.0,2336.0,5.0,24.0,7.0,8.0,4.0,5.0


In [24]:
display(y.head())
display(y.describe())

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [25]:
y_pred, r2, mse, hyperparams = random_forest.validate_model(train_X, train_y, val_X, val_y)

Generating Model... Please wait
Model Rendered


#### R-squared 

In [26]:
r2

0.844047845342431

#### Mean Squared Log Error

In [27]:
mse

0.01695501226098883

#### Submission

In [28]:
final_predict, _, _ = random_forest.model(X, y, test_X, None, hyperparams=hyperparams)
produce_data.produce_submission(ids_test, final_predict, "2_rsq_randomforest_submission.csv")

#### Store data to jobs

In [10]:
job = produce_data.get_job('2%')
job.sp['val_r_squared'] = 0.72
job.sp['val_mean_squared_log_error'] = 0.028
job.sp['submit_mean_squared_log_error'] = 0.14915