In [98]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [99]:
# Load the merged listings dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='cl_merged.csv')

In [100]:
# Display the loaded frame (pandas shows a truncated head/tail view for large frames).
df

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price,index
0,3,3.0,2850.0,sqft,4200.0,sqft,98119,1175000.0,1
1,3,2.0,2360.0,sqft,12196.8,sqft,98188,565000.0,4
2,3,3.5,1942.0,sqft,1603.0,sqft,98107,1187000.0,5
3,4,2.0,2060.0,sqft,4206.0,sqft,98144,1025000.0,9
4,3,2.5,1760.0,sqft,3630.0,sqft,98122,1135000.0,11
...,...,...,...,...,...,...,...,...,...
1012,3,2.0,1482.0,sqft,1085.0,sqft,98117,919000.0,2009
1013,3,3.5,1680.0,sqft,1486.0,sqft,98126,675000.0,2010
1014,3,2.0,1370.0,sqft,21780.0,sqft,98112,910000.0,2012
1015,4,2.0,2140.0,sqft,6250.0,sqft,98199,1150000.0,2014


In [101]:
# Per-column count of missing values; the recorded output shows 0 nulls everywhere.
df.isna().sum()

beds              0
baths             0
size              0
size_units        0
lot_size          0
lot_size_units    0
zip_code          0
price             0
index             0
dtype: int64

In [102]:
# (rows, columns) of the raw frame.
df.shape

(1017, 9)

In [103]:
# Keep only the feature columns plus the `price` target; the extra `index`
# column present in `df` is left out.
df2 = df.loc[
    :,
    [
        'beds',
        'baths',
        'size',
        'size_units',
        'lot_size',
        'lot_size_units',
        'zip_code',
        'price',
    ],
]
df2

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.0,sqft,98119,1175000.0
1,3,2.0,2360.0,sqft,12196.8,sqft,98188,565000.0
2,3,3.5,1942.0,sqft,1603.0,sqft,98107,1187000.0
3,4,2.0,2060.0,sqft,4206.0,sqft,98144,1025000.0
4,3,2.5,1760.0,sqft,3630.0,sqft,98122,1135000.0
...,...,...,...,...,...,...,...,...
1012,3,2.0,1482.0,sqft,1085.0,sqft,98117,919000.0
1013,3,3.5,1680.0,sqft,1486.0,sqft,98126,675000.0
1014,3,2.0,1370.0,sqft,21780.0,sqft,98112,910000.0
1015,4,2.0,2140.0,sqft,6250.0,sqft,98199,1150000.0


In [None]:
## Holding out unseen data (5% of the rows are set aside for final predictions)

In [104]:

# Modelling split: a reproducible 95% random sample of the rows (seed 28).
data = df2.sample(random_state=28, frac=0.95)
# Everything that was not sampled becomes the held-out "unseen" set.
data_unseen = df2.drop(index=data.index)

print(f'Data for model: {data.shape},\nData for unseen predictions: { data_unseen.shape}')

Data for model: (966, 8),
Data for unseen predictions: (51, 8)


In [None]:
## Data Prep 

In [105]:
# Initialise the PyCaret regression experiment on the modelling split.
# The star import is kept because later cells call compare_models, pull,
# create_model, tune_model and finalize_model unqualified.
from pycaret.regression import *

# session_id pins the experiment's random seed so the notebook reproduces on
# Restart & Run All. Without it PyCaret draws a new seed on every run; 8930 is
# the seed it drew for the recorded run, so the saved outputs stay meaningful.
# NOTE(review): zip_code appears to be treated as a numeric feature here
# (the setup summary lists 5 numeric features) — confirm that is intended.
reg = setup(data = data, target='price', normalize=True, session_id=8930)
reg

Unnamed: 0,Description,Value
0,Session id,8930
1,Target,price
2,Target type,Regression
3,Original data shape,"(966, 8)"
4,Transformed data shape,"(966, 14)"
5,Transformed train set shape,"(676, 14)"
6,Transformed test set shape,"(290, 14)"
7,Numeric features,5
8,Categorical features,2
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x1e515ee2ec0>

In [None]:
### Data is clean: confirming that there are no nulls

In [106]:
# Re-check the per-column missing-value counts on the raw frame `df`.
df.isna().sum()

beds              0
baths             0
size              0
size_units        0
lot_size          0
lot_size_units    0
zip_code          0
price             0
index             0
dtype: int64

In [None]:
## Comparing just the best and most efficient models.

In [107]:
# Restrict the comparison to four candidates: gradient boosting, random
# forest, Huber and ridge regression.
best_model = compare_models(
    include=[
        'gbr',
        'rf',
        'huber',
        'ridge',
    ]
)
best_model

In [108]:
# Capture the score grid that PyCaret produced for the model comparison above
# and display it.
res = pull()
res

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,189366.9665,74690690000.0,271487.1062,0.4684,0.232,0.1861,2.772
rf,Random Forest Regressor,196635.0081,79095250000.0,279617.8534,0.4345,0.2449,0.1968,1.995
ridge,Ridge Regression,214470.6745,85165670000.0,290605.518,0.3909,0.2622,0.2182,2.035
huber,Huber Regressor,215218.8237,85652750000.0,291350.6863,0.3893,0.262,0.2167,1.986


In [None]:
## CREATE gbr MODEL

In [109]:
# Build the gradient boosting regressor, the top-ranked model in the comparison.
gbr = create_model(estimator='gbr')

In [None]:
# Tuning the model so I can compare it with the untuned one

In [110]:
# Hyper-parameter search for the gradient boosting model using the default
# fold count (the recorded log shows 10 folds x 10 candidates).
gbr_tuned_model = tune_model(estimator=gbr)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,178881.5453,67159768478.7915,259152.0181,0.4412,0.2212,0.1605
1,196310.792,98118925285.9519,313239.4057,0.4248,0.251,0.1985
2,221878.471,88760414740.9999,297926.8614,0.2783,0.2601,0.2243
3,174675.0923,48771077513.1894,220841.7477,0.522,0.2404,0.2028
4,217449.9653,83256145567.5434,288541.4105,0.3818,0.2588,0.2101
5,183732.4392,55957001980.2896,236552.324,0.4588,0.2591,0.2164
6,188470.2657,97452128033.7487,312173.234,0.5404,0.2113,0.1535
7,191262.3551,76400254905.7192,276405.9603,0.6123,0.2257,0.1747
8,217317.4523,81793820208.371,285996.1892,0.4251,0.2876,0.2512
9,199610.93,79483805049.0542,281928.7233,0.3328,0.2314,0.1822


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
## Mean R2 across the 10 folds above is 0.4418

In [None]:
## Increasing the folds

In [113]:
# Repeat the hyper-parameter search with 15 cross-validation folds.
gbr_tuned_model_new = tune_model(estimator=gbr, fold=15)


Fitting 15 folds for each of 10 candidates, totalling 150 fits


In [114]:
# Capture the per-fold score grid from the 15-fold tuning run above and display it.
gb = pull()
gb

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,164462.3347,60996140000.0,246973.9717,0.551,0.1964,0.1407
1,215175.8003,129698100000.0,360136.2325,0.2993,0.2635,0.1939
2,171814.3579,56008470000.0,236661.0892,0.4438,0.2341,0.1963
3,207428.0621,71936060000.0,268208.9868,0.45,0.2422,0.2124
4,189909.1484,70188060000.0,264930.2971,0.247,0.2676,0.2241
5,189110.7594,54556620000.0,233573.5953,0.5185,0.2341,0.2007
6,237198.8975,101030900000.0,317853.6519,0.3106,0.2817,0.2249
7,198364.8219,58762530000.0,242409.8296,0.4065,0.2551,0.2221
8,158544.9793,51079880000.0,226008.5758,0.5226,0.2422,0.1863
9,154307.3559,53400070000.0,231084.5619,0.4426,0.21,0.1499


In [115]:
# Show the estimator's configuration as plain text.
print(gbr)

GradientBoostingRegressor(random_state=8930)


In [116]:
# Pass the estimator to PyCaret's finalize_model and display the result.
# NOTE(review): this finalizes the untuned `gbr`, not `gbr_tuned_model` or
# `gbr_tuned_model_new` — confirm that discarding the tuned models is intended.
final_model = finalize_model(gbr)
final_model

In [None]:
## predict_model(gbr)

In [None]:
## gbr,rf,huber,ridge

In [None]:
### WORK BELOW IS INVALID. THIS WAS PART OF MY ANALYSIS AND COMPARISON... I HAVE DECIDED TO LEAVE IT HERE JUST IN CASE I WANT TO CONTINUE OR REVIEW MY ANALYSIS

In [None]:
### checking method 2 

In [67]:
# Load the "test" split of the listings data from the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [68]:
# Load the "train" split of the listings data from the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')

In [69]:
# Stack the two files (test rows first, then train rows) into one frame with
# a fresh 0..n-1 index.
merged_df2 = pd.concat(
    objs=[test, train],
    ignore_index=True,
)
merged_df2


Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.00,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.00,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.00,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.00,sqft,98107,1187000.0
...,...,...,...,...,...,...,...,...
2516,3,2.0,1370.0,sqft,0.50,acre,98112,910000.0
2517,1,1.0,889.0,sqft,,,98121,550000.0
2518,4,2.0,2140.0,sqft,6250.00,sqft,98199,1150000.0
2519,2,2.0,795.0,sqft,,,98103,590000.0


In [81]:
# Display the merged frame again.
# NOTE(review): the recorded output already shows lot_size 3896.00 / sqft on
# rows 2517 and 2519, which were empty in the In [69] output, even though the
# fillna cells sit further down — this suggests the cells were run out of
# order and the notebook may not reproduce top to bottom.
merged_df2

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,3.0,2850.0,sqft,4200.00,sqft,98119,1175000.0
1,4,5.0,3040.0,sqft,5002.00,sqft,98106,1057500.0
2,3,1.0,1290.0,sqft,6048.00,sqft,98125,799000.0
3,3,2.0,2360.0,sqft,0.28,acre,98188,565000.0
4,3,3.5,1942.0,sqft,1603.00,sqft,98107,1187000.0
...,...,...,...,...,...,...,...,...
2516,3,2.0,1370.0,sqft,0.50,acre,98112,910000.0
2517,1,1.0,889.0,sqft,3896.00,sqft,98121,550000.0
2518,4,2.0,2140.0,sqft,6250.00,sqft,98199,1150000.0
2519,2,2.0,795.0,sqft,3896.00,sqft,98103,590000.0


In [82]:
# Per-column count of missing values in the merged frame.
merged_df2.isna().sum()

beds              0
baths             0
size              0
size_units        0
lot_size          0
lot_size_units    0
zip_code          0
price             0
dtype: int64

In [83]:
# Mean of the lot_size column.
# NOTE(review): lot_size_units holds both 'sqft' and 'acre' rows (e.g. 0.28 acre
# on row 3), so this mean blends two different units — verify before relying on it.
merged_df2['lot_size'].mean()

3896.204454581515

In [84]:
# Inspect the unit labels that accompany lot_size.
merged_df2['lot_size_units']

0       sqft
1       sqft
2       sqft
3       acre
4       sqft
        ... 
2516    acre
2517    sqft
2518    sqft
2519    sqft
2520    sqft
Name: lot_size_units, Length: 2521, dtype: object

In [85]:
# Impute missing lot sizes with the constant 3896.
# NOTE(review): presumably the rounded column mean shown earlier — confirm.
merged_df2['lot_size'] = merged_df2['lot_size'].fillna(value=3896)

In [86]:
# Label rows with a missing lot-size unit as 'sqft'.
merged_df2['lot_size_units'] = merged_df2['lot_size_units'].fillna(value='sqft')

In [87]:
# Confirm that no missing values remain after the two fills above.
merged_df2.isna().sum()

beds              0
baths             0
size              0
size_units        0
lot_size          0
lot_size_units    0
zip_code          0
price             0
dtype: int64

In [89]:
# (rows, columns) of the merged frame.
merged_df2.shape

(2521, 8)

In [90]:
# Set aside 127 randomly chosen rows (seed 28) as the unseen set; this rebinds
# the `data_unseen` name used by the first method.
data_unseen = merged_df2.sample(random_state=28, n=127)
# The modelling frame is rebound to the remaining rows.
# NOTE(review): re-running this cell removes another 127 rows from merged_df2.
merged_df2 = merged_df2.drop(index=data_unseen.index)

print(f'Data for model: {merged_df2.shape},\nData for unseen predictions: { data_unseen.shape}')

Data for model: (2394, 8),
Data for unseen predictions: (127, 8)


In [None]:

## data = df2.sample(frac=0.95,random_state=28)
## data_unseen = df2.drop(data.index)

## print(f'Data for model: {data.shape},\nData for unseen predictions: { data_unseen.shape}')

In [91]:
# Initialise a second PyCaret regression experiment on the merged data.
# The star import is kept because the following cells call compare_models and
# pull unqualified.
from pycaret.regression import *

# session_id pins the experiment's random seed so the notebook reproduces on
# Restart & Run All. Without it PyCaret draws a new seed on every run; 5565 is
# the seed it drew for the recorded run, so the saved outputs stay meaningful.
s = setup(data = merged_df2, target='price', normalize=True, session_id=5565)
s

Unnamed: 0,Description,Value
0,Session id,5565
1,Target,price
2,Target type,Regression
3,Original data shape,"(2394, 8)"
4,Transformed data shape,"(2394, 11)"
5,Transformed train set shape,"(1675, 11)"
6,Transformed test set shape,"(719, 11)"
7,Ordinal features,1
8,Numeric features,5
9,Categorical features,2


<pycaret.regression.oop.RegressionExperiment at 0x1e51302d480>

In [92]:
# Compare models with no `include` filter (unlike the first method, which
# limited the comparison to four candidates) and display the returned model.
best2 = compare_models()
best2

In [94]:
# Capture the score grid from the comparison above and display it.
results = pull()
results

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,251012.3106,838699800000.0,673131.6,0.4672,0.3353,0.2432,1.563
ridge,Ridge Regression,265019.7337,836596400000.0,674273.5,0.4632,0.3592,0.2869,1.409
lr,Linear Regression,265067.4239,836607700000.0,674306.7,0.4631,0.3593,0.287,2.362
lasso,Lasso Regression,265067.094,836607500000.0,674306.4,0.4631,0.3593,0.287,1.536
lar,Least Angle Regression,265067.4239,836607700000.0,674306.7,0.4631,0.3593,0.287,1.513
llar,Lasso Least Angle Regression,265067.0955,836607500000.0,674306.4,0.4631,0.3593,0.287,1.529
omp,Orthogonal Matching Pursuit,264922.176,842066400000.0,679551.9,0.4513,0.3462,0.2808,1.431
en,Elastic Net,282615.6113,856402300000.0,694053.5,0.4178,0.3666,0.3123,1.396
par,Passive Aggressive Regressor,266147.2874,872947800000.0,706118.5,0.3914,0.3471,0.24,1.616
knn,K Neighbors Regressor,266495.4984,900396600000.0,750822.6,0.2511,0.3435,0.2685,1.534
