In [37]:
# Math 
from math import sqrt
from scipy import stats
import statistics
from scipy.stats import wilcoxon
import os

# General
import numpy as np
import pandas as pd
from pydataset import data

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer

# Sklearn Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score


# Sklearn Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.cluster import KMeans


# Visuals
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Graph
from tabulate import tabulate
from sklearn.tree import export_graphviz

# Custom Module's
from wrangle import wrangle
from functions import metric, select_rfe
import warnings
warnings.filterwarnings("ignore")

In [2]:
train, validate, test = wrangle()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28210 entries, 17275 to 63558
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   bath                   28210 non-null  float64
 1   bed                    28210 non-null  float64
 2   area                   28210 non-null  float64
 3   lat                    28210 non-null  float64
 4   long                   28210 non-null  float64
 5   lot_size               28210 non-null  float64
 6   year                   28210 non-null  float64
 7   tax_value              28210 non-null  float64
 8   tax_amount             28210 non-null  float64
 9   logerror               28210 non-null  float64
 10  heating_type           28210 non-null  object 
 11  county                 28210 non-null  object 
 12  price_per_sqft         28210 non-null  float64
 13  bath_scaled            28210 non-null  float64
 14  bed_scaled             28210 non-null  float64
 15

In [39]:
cols = ['bath_scaled', 'bed_scaled', 'area_scaled', 'year_scaled', 'tax_amount_scaled', 'lat_scaled', 'long_scaled','cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']
cluster = ['cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']

X_train = train[['bath_scaled', 'bed_scaled', 'area_scaled']]

X_validate = validate[['bath_scaled', 'bed_scaled', 'area_scaled']]

X_test = test[['bath_scaled', 'bed_scaled', 'area_scaled']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

In [32]:
y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_bed_bath_area = pd.DataFrame()
results_bed_bath_area = metric(X_train, y_train, X_validate, y_validate, results)

In [33]:
results_bed_bath_area = metric(X_train, y_train, X_validate, y_validate, results_bed_bath_area)

In [34]:
results_bed_bath_area.RMSE_validate.sort_values().head(1)

1    0.172981
Name: RMSE_validate, dtype: float64

In [35]:
# only clusters

X_train = train[['cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]

X_validate = validate[['cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]

X_test = test[['cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_clusters = pd.DataFrame()
results_only_clusters = metric(X_train, y_train, X_validate, y_validate, results_only_clusters)

In [36]:
results_only_clusters.RMSE_validate.sort_values().head(1)

7    0.173039
Name: RMSE_validate, dtype: float64

# Things to try

- keep trying new features
- Look for the biggest cluster
- try features i haven't used before without clusters
- try only two clusters with starting features and new features
- ask if clusters need to be scaled.
- 
- Also look into feature engineering

In [85]:
cols = ['bath_scaled', 'tax_value', 'price_per_sqft', 'lot_size', 'bed_scaled', 'area_scaled', 'year_scaled', 'lat_scaled', 'long_scaled','cluster_0','cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']
X_train = train[cols]

In [82]:
cols = ['bath_scaled', 'bed_scaled', 'area_scaled', 'year_scaled']
X_train = train[cols]

In [101]:
cols = ['bath_scaled', 'bed_scaled', 'year_scaled']
X_train = train[cols]

In [102]:
select_rfe(X_train, y_train, 2)

(['bath_scaled', 'bed_scaled'],
            Var  Rank
 0  bath_scaled     1
 1   bed_scaled     1
 2  year_scaled     2)

In [90]:
cols = ['bath_scaled', 'tax_value', 'price_per_sqft', 'lot_size', 'bed_scaled', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'area_scaled', 'year_scaled', 'lat_scaled', 'long_scaled']
X_train = train[cols]

In [91]:
select_rfe(X_train, y_train, 3)

(['cluster_0', 'cluster_2', 'cluster_3'],
                Var  Rank
 5        cluster_0     1
 7        cluster_2     1
 8        cluster_3     1
 6        cluster_1     2
 9        cluster_4     3
 10     area_scaled     4
 2   price_per_sqft     5
 4       bed_scaled     6
 12      lat_scaled     7
 0      bath_scaled     8
 13     long_scaled     9
 11     year_scaled    10
 3         lot_size    11
 1        tax_value    12)

In [43]:
# only top rfe

X_train = train[['tax_amount_scaled', 'cluster_0', 'cluster_2', 'cluster_3', 'cluster_4']]

X_validate = validate[['tax_amount_scaled', 'cluster_0', 'cluster_2', 'cluster_3', 'cluster_4']]

X_test = test[['tax_amount_scaled', 'cluster_0', 'cluster_2', 'cluster_3', 'cluster_4']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_rfe = pd.DataFrame()
results_only_rfe = metric(X_train, y_train, X_validate, y_validate, results_only_clusters)

In [45]:
results_only_rfe.RMSE_validate.sort_values().head(1)

16    0.173
Name: RMSE_validate, dtype: float64

In [50]:
# area, 0, 2, 3

X_train = train[['area', 'cluster_0', 'cluster_2', 'cluster_3']]

X_validate = validate[['area', 'cluster_0', 'cluster_2', 'cluster_3']]

X_test = test[['area', 'cluster_0', 'cluster_2', 'cluster_3']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area023 = pd.DataFrame()
results_only_area023 = metric(X_train, y_train, X_validate, y_validate, results_only_area023)

In [56]:
results_only_area023

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.172893,0.003087
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.172936,0.002597
7,poly_degree_3,0.17289,0.003126
8,poly_degree_4,0.17288,0.00324
9,poly_degree_5,0.172894,0.003087


In [53]:
results_only_area023.RMSE_validate.sort_values().head(1)

8    0.17288
Name: RMSE_validate, dtype: float64

In [54]:
target = y_train.columns[0]
lm = LinearRegression(normalize=True)
lm.fit(X_train, y_train[target])
y_train[target+'_pred_lm'] = lm.predict(X_train)
# evaluate: rmse
rmse_train = mean_squared_error(y_train[target], y_train[target+'_pred_lm']) ** (1/2)

In [55]:
rmse_train

0.17236327633377443

In [74]:
# best so far
# year, 0 

X_train = train[['price_per_sqft','area', 'cluster_0', 'cluster_1','cluster_2', 'cluster_3', 'cluster_4']]

X_validate = validate[['price_per_sqft', 'area','cluster_0', 'cluster_1','cluster_2', 'cluster_3', 'cluster_4']]
X_test = test[['area', 'cluster_0', 'cluster_1','cluster_2', 'cluster_3', 'cluster_4']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area13 = pd.DataFrame()
results_only_area13 = metric(X_train, y_train, X_validate, y_validate, results_only_area13)

In [75]:
results_only_area13

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.17275,0.004728
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.172988,0.001998
7,poly_degree_3,0.173035,0.001476
8,poly_degree_4,0.173468,-0.003563
9,poly_degree_5,0.175903,-0.031933


In [76]:
#
# year, 0 

X_train = train[['price_per_sqft','area', 'cluster_0','cluster_2', 'cluster_3']]

X_validate = validate[['price_per_sqft','area', 'cluster_0','cluster_2', 'cluster_3']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area13 = pd.DataFrame()
results_only_area13 = metric(X_train, y_train, X_validate, y_validate, results_only_area13)



In [77]:
results_only_area13

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.17275,0.004741
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.17298,0.002091
7,poly_degree_3,0.173013,0.001724
8,poly_degree_4,0.173414,-0.002921
9,poly_degree_5,0.175733,-0.02995


In [79]:
target = y_train.columns[0]
lm = LinearRegression(normalize=True)
lm.fit(X_train, y_train[target])
y_train[target+'_pred_lm'] = lm.predict(X_train)
# evaluate: rmse
rmse_train = mean_squared_error(y_train[target], y_train[target+'_pred_lm']) ** (1/2)
rmse_train

0.17226222437180053

In [88]:
# BEST 
# price, bed, area023

X_train = train[['price_per_sqft','bed','area', 'cluster_0','cluster_2', 'cluster_3']]

X_validate = validate[['price_per_sqft','bed','area', 'cluster_0','cluster_2', 'cluster_3']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area13 = pd.DataFrame()
results_only_area13 = metric(X_train, y_train, X_validate, y_validate, results_only_area13)




In [89]:
results_only_area13

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.172696,0.005362
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.172983,0.002046
7,poly_degree_3,0.173378,-0.00251
8,poly_degree_4,0.173914,-0.008714
9,poly_degree_5,0.176558,-0.039636


In [98]:

# price, bed, area01234

X_train = train[['bed', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]

X_validate = validate[['bed', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area13 = pd.DataFrame()
results_only_area13 = metric(X_train, y_train, X_validate, y_validate, results_only_area13)

results_only_area13

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.172869,0.003367
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.172896,0.003068
7,poly_degree_3,0.172905,0.002959
8,poly_degree_4,0.172998,0.001881
9,poly_degree_5,0.173207,-0.00052


In [115]:
# best and final bed, bath, tax amount scaled, cluster 0, 2, 3
X_train = train[['bed_scaled','bath_scaled','tax_amount_scaled','cluster_0', 'cluster_2', 'cluster_3']]

X_validate = validate[['bed_scaled', 'bath_scaled','tax_amount_scaled','cluster_0', 'cluster_2', 'cluster_3']]


y_train = train['logerror']
y_validate = validate['logerror']
y_test = test['logerror']

y_train  = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

results_only_area13 = pd.DataFrame()
results_only_area13 = metric(X_train, y_train, X_validate, y_validate, results_only_area13)

results_only_area13

Unnamed: 0,model,RMSE_validate,r^2_validate
0,mean_baseline,0.173162,0.0
1,OLS Regressor,0.172681,0.005534
2,lasso_alpha_2,0.173162,0.0
3,lasso_alpha_3,0.173162,0.0
4,lasso_alpha_4,0.173162,0.0
5,lasso_alpha_5,0.173162,0.0
6,poly_degree_2,0.172736,0.004906
7,poly_degree_3,0.172778,0.00443
8,poly_degree_4,0.173654,-0.005695
9,poly_degree_5,0.178197,-0.058992


In [116]:
target = y_train.columns[0]
lm = LinearRegression(normalize=True)
lm.fit(X_train, y_train[target])
y_train[target+'_pred_lm'] = lm.predict(X_train)
# evaluate: rmse
rmse_train = mean_squared_error(y_train[target], y_train[target+'_pred_lm']) ** (1/2)
rmse_train

0.17224335972016272