In [None]:
#https://towardsdatascience.com/geopandas-101-plot-any-data-with-a-latitude-and-longitude-on-a-map-98e01944b972

In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

#libraries
import pandas as pd
import numpy as np
from datetime import date

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
#Visual format
pd.options.display.float_format = '{:20,.4f}'.format

#my libraries
from wrangle import get_zillow_data, wrangle_zillow, remove_outliers, train_validate_test_split, get_hist, get_box
from explore import plot_against_target, inertia
import evaluate
import model
import env

#library imports
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import learning_curve

# Statistical Tests
import scipy.stats as stats

#alpha
alpha = .05

In [None]:
df = get_zillow_data()
df.head(2)

In [None]:
df = wrangle_zillow(df)
df.head(2)

In [None]:
def create_features(df):
    '''
    Return a copy of df with three binned columns added: age_bin, sqft_bin and
    ppsqft_bin (binned from the age, sqft and price_per_sqft columns).

    Each bin is labelled with a float between 0 and 1 and the new columns are
    returned as float64. Values that fall outside the outermost bin edges
    become NaN. The lowest edge (0) is included in the first bin, so a value of
    exactly 0 is binned rather than turned into NaN.

    The input dataframe is not modified.
    '''
    age_edges = [0, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140]
    age_labels = [0, .066, .133, .20, .266, .333, .40, .466, .533,
                  .60, .666, .733, .8, .866, .933]

    sqft_edges = [0, 800, 1000, 1250, 1500, 2000, 2500, 3000, 4000, 7000, 12000]
    ppsqft_edges = [0, 25, 50, 75, 100, 150, 200, 300, 500, 1000, 1500]
    # sqft and price-per-sqft both use ten bins labelled 0, .1, ... .9
    tenth_labels = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]

    # assign() builds a new frame, so the caller's dataframe is left untouched
    binned = df.assign(
        age_bin=pd.cut(df.age, bins=age_edges, labels=age_labels, include_lowest=True),
        sqft_bin=pd.cut(df.sqft, bins=sqft_edges, labels=tenth_labels, include_lowest=True),
        ppsqft_bin=pd.cut(df.price_per_sqft, bins=ppsqft_edges, labels=tenth_labels,
                          include_lowest=True),
    )

    # pd.cut returns categoricals; convert the binned values to float
    return binned.astype({'age_bin': 'float64', 'sqft_bin': 'float64', 'ppsqft_bin': 'float64'})

In [None]:
df = create_features(df)
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.describe().T

In [None]:
# Fill the missing values in the three binned columns with fixed constants.
# NOTE(review): the constants look like the column means from df.describe() -- confirm.
# Reassigning df (instead of calling fillna(inplace=True) on a column accessor)
# guarantees the filled values actually end up in df.
df = df.fillna({'age_bin': 0.3749, 'sqft_bin': 0.3551, 'ppsqft_bin': 0.5427})

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
def split(df, target_var):
    '''
    Split df into train (56%), validate (24%) and test (20%) and separate the
    target column from the features.

    Returns a list in this order:
    [train, X_train, X_validate, X_test, y_train, y_validate, y_test]
    where train still contains the target (for exploration), each X_* has the
    target column dropped and each y_* is a one-column dataframe holding only
    the target.
    '''
    # 20% of the rows are held out for test; the remaining 80% is split again
    remainder, test = train_test_split(df, test_size=.20, random_state=13)
    # 30% of the remaining 80% -> validate (24% overall), 70% -> train (56% overall)
    train, validate = train_test_split(remainder, test_size=.3, random_state=13)

    def _features(frame):
        # everything except the target column
        return frame.drop(columns=[target_var])

    def _target(frame):
        # only the target column, kept as a dataframe
        return frame[[target_var]]

    subsets = (train, validate, test)
    X_train, X_validate, X_test = [_features(subset) for subset in subsets]
    y_train, y_validate, y_test = [_target(subset) for subset in subsets]

    return [train, X_train, X_validate, X_test, y_train, y_validate, y_test]

In [None]:
partitions = split(df, target_var='logerror')

In [None]:
# Bucket logerror into five ranges for use as a plotting hue.
# assign() builds a new frame rather than writing a column into the slice that
# train_test_split returned, and re-running the cell gives the same result.
train = partitions[0].assign(
    logerror_bins=pd.cut(partitions[0].logerror, [-5, -.2, -.05, .05, .2, 4])
)
partitions[0] = train

In [None]:
train.logerror_bins.value_counts()

In [None]:
sns.pairplot(data = train, hue = 'logerror_bins', 
             x_vars = ['logerror', 'age_bin', 'sqft_bin', 'ppsqft_bin'],
             y_vars = ['logerror', 'age', 'sqft', 'price_per_sqft'])

In [None]:
# SCALE

# columns that get a min-max scaled copy
scaled_vars = ['latitude', 'longitude', 'lot_size', 'tax_value', 'age', 'sqft']

# each scaled copy is named 'scaled_<original name>'
scaled_column_names = [f'scaled_{name}' for name in scaled_vars]

# the X partitions: [X_train, X_validate, X_test]
X = partitions[1:4]

# the scaler learns its min/max from X_train only
X_train = X[0]
scaler = MinMaxScaler(copy=True)
scaler.fit(X_train[scaled_vars])


def scale_and_concat(df):
    '''
    Return df with the scaled versions of scaled_vars appended as new columns.

    Uses the notebook-level scaler, scaled_vars and scaled_column_names.
    '''
    scaled_df = pd.DataFrame(scaler.transform(df[scaled_vars]),
                             columns=scaled_column_names,
                             index=df.index.values)
    return pd.concat((df, scaled_df), axis=1)

# replace every X partition with its scaled-and-concatenated version
for position, frame in enumerate(X):
    X[position] = scale_and_concat(frame)

In [None]:
# recall: X[0] is X_train, X[1] is X_validate and X[2] is X_test
X[0].info()

# Cluster 1: Age, Long and Lat

In [None]:
# list of variables I will cluster on. 
cluster_vars = ['scaled_latitude', 'scaled_longitude', 'age_bin']
cluster_name = 'area_cluster'
k = 3

In [None]:
def create_clusters(X_train, k, cluster_vars):
    '''
    Fit a KMeans model with k clusters on the cluster_vars columns of X_train
    and return the fitted estimator. The seed is fixed (random_state=13).
    '''
    features = X_train[cluster_vars]
    # fit() returns the estimator itself
    return KMeans(n_clusters=k, random_state = 13).fit(features)

In [None]:
kmeans = create_clusters(X[0], k, cluster_vars)

In [None]:
def get_centroids(kmeans, cluster_vars, cluster_name):
    '''
    Build a dataframe with one row per cluster of the fitted kmeans model.

    The first column is named cluster_name and holds the cluster id
    (0, 1, 2, ...); the remaining columns hold the centroid coordinates and
    are named 'centroid_<variable>' for every variable in cluster_vars.
    '''
    centroid_col_names = [f'centroid_{var}' for var in cluster_vars]
    centers = pd.DataFrame(kmeans.cluster_centers_, columns=centroid_col_names)

    # the row position is the cluster id: name the index, then move it into a column
    return centers.rename_axis(cluster_name).reset_index()

In [None]:
centroid_df = get_centroids(kmeans, cluster_vars, cluster_name)

In [None]:
# label cluster for each observation in X_train (X[0] in our X list of dataframes), 
# X_validate (X[1]), & X_test (X[2])

def assign_clusters(kmeans, cluster_vars, cluster_name, centroid_df):
    '''
    Add the predicted cluster id and that cluster's centroid values to every
    dataframe in the notebook-level list X (X_train, X_validate, X_test).

    kmeans       : fitted model used to predict a cluster from cluster_vars
    cluster_vars : columns passed to kmeans.predict
    cluster_name : name of the new cluster id column
    centroid_df  : one row per cluster, containing a cluster_name column plus
                   the centroid columns to attach

    Returns the same list X, with each element replaced by the widened frame.
    '''
    # index the centroid table by cluster id so it can be looked up per row
    centroid_lookup = centroid_df.set_index(cluster_name)

    for i in range(len(X)):
        clusters = pd.DataFrame(kmeans.predict(X[i][cluster_vars]),
                                columns=[cluster_name], index=X[i].index)

        # A left join keyed on the cluster id keeps exactly one row per
        # observation, in the original order and with the original index.
        # An inner merge followed by re-attaching the index depends on merge
        # preserving row order, which older pandas releases did not guarantee,
        # so labels could end up on the wrong rows.
        clusters_centroids = clusters.join(centroid_lookup, on=cluster_name)

        X[i] = pd.concat([X[i], clusters_centroids], axis=1)
    return X

In [None]:
X = assign_clusters(kmeans, cluster_vars, cluster_name, centroid_df)

In [None]:
pd.DataFrame(X[0].groupby(['area_cluster', 'centroid_scaled_latitude', 'centroid_scaled_longitude', 
                           'centroid_age_bin'])['area_cluster'].count())

In [None]:
X[0].info()

In [None]:
plt.figure(figsize=(12,6))
# plt.scatter(y=X_train.latitude, x=X_train.longitude, c=X_train.area_cluster, alpha=.4)
plt.scatter(y=X[0].age, x=X[0].longitude, c=X[0].area_cluster, alpha=.4)

In [None]:
y_train = partitions[-3]

plt.scatter(y=y_train.logerror, x=X[0].age, c=X[0].area_cluster, alpha=.7)
plt.ylim(-1,1)
plt.xlabel('Age of Property')
plt.ylabel('Log Error of Zestimate')
plt.title("Do clusters reveal differences in age and error?")

In [None]:
sns.boxplot(y=y_train.logerror, x=X[0].area_cluster)
plt.ylim(-1, 1)
# sns.swarmplot(X_train.age_bin, y_train.logerror, hue=X_train.area_cluster)

# Cluster 2: Bathrooms, Bedrooms and Price per Sqft

In [None]:
X[0].info()

In [None]:
cluster_vars = ['bathrooms', 'bedrooms', 'ppsqft_bin']
cluster_name = 'aged_cluster'
k = 3

In [None]:
kmeans = create_clusters(X[0], k, cluster_vars)

In [None]:
centroid_df = get_centroids(kmeans, cluster_vars, cluster_name)

In [None]:
X = assign_clusters(kmeans, cluster_vars, cluster_name, centroid_df)

In [None]:
pd.DataFrame(X[0].groupby(['aged_cluster', 'centroid_bathrooms', 'centroid_bedrooms', 
                           'centroid_ppsqft_bin'])['aged_cluster'].count())

In [None]:
plt.figure(figsize=(12,6))
# plt.scatter(y=X_train.latitude, x=X_train.longitude, c=X_train.area_cluster, alpha=.4)
plt.scatter(y=X[0].price_per_sqft, x=X[0].bedrooms, c=X[0].aged_cluster, alpha=.4)

In [None]:
# partitions[-3] is y_train in the list returned by split()
y_train = partitions[-3]

plt.scatter(y=y_train.logerror, x=X[0].price_per_sqft, c=X[0].aged_cluster, alpha=.7)
plt.ylim(-1,1)
# the x-axis shows price_per_sqft, so the label and title describe that variable
plt.xlabel('Price per Square Foot')
plt.ylabel('Log Error of Zestimate')
plt.title("Do clusters reveal differences in price per square foot and error?")

In [None]:
sns.boxplot(y=y_train.logerror, x=X[0].aged_cluster)
plt.ylim(-1, 1)
# sns.swarmplot(X_train.age_bin, y_train.logerror, hue=X_train.area_cluster)

In [None]:
plt.figure(figsize=(12,6))
# plt.scatter(y=X_train.latitude, x=X_train.longitude, c=X_train.area_cluster, alpha=.4)
plt.scatter(y=y_train.logerror, x=X[0].sqft, c=X[0].aged_cluster, alpha=.7)
plt.yscale('symlog')
plt.xlabel('Finished Square Feet')
plt.ylabel('Log Error of Zestimate')
plt.title('Is there distinction between clusters when visualizing size of the home by the error in zestimate?')

plt.show()

In [None]:
plt_df = X[0][['bathrooms', 'bedrooms', 'sqft', 'aged_cluster']]

sns.pairplot(data=plt_df, hue='aged_cluster')

In [None]:
sns.scatterplot(x='bedrooms', y='sqft', 
                data=X[0], hue='aged_cluster')

In [None]:
# cast the two cluster id columns of X[0] to the 'category' dtype
X_train = X[0].astype({'aged_cluster': 'category', 'area_cluster': 'category'})

In [None]:
# One-hot encode the cluster ids. get_dummies only encodes object/string/category
# columns by default, so the integer ids are cast to category first; otherwise
# they pass through unchanged and the concat below duplicates the columns.
# drop_first takes a single bool.
cluster_labels = X[0][['aged_cluster','area_cluster']].astype('category')
dummy_df = pd.get_dummies(cluster_labels, dummy_na=False, drop_first=True)

# append dummy df cols to the original df. 
X_train = pd.concat([X_train, dummy_df], axis=1)

In [None]:
X_train.head()

In [None]:
X_train.info()

In [None]:
# Select the two cluster id columns as features from each X partition.
# The targets come from the y partitions returned by split()
# (partitions[4], [5], [6]); the X frames no longer contain logerror.
cluster_features = ['aged_cluster', 'area_cluster']

X_train = X[0][cluster_features]#features
y_train = partitions[4].logerror
X_validate = X[1][cluster_features]#features
y_validate = partitions[5].logerror
X_test = X[2][cluster_features]#features
y_test = partitions[6].logerror

In [None]:
evaluate.rfe(X_train,y_train,1)

In [None]:
plt.figure(figsize=(12,6))
# Plot straight from the split partitions: partitions[4] is y_train and X[0] is
# X_train with the cluster columns attached. Colour by aged_cluster, the
# cluster column that exists on X[0].
plt.scatter(y=partitions[4].logerror, x=X[0].sqft, c=X[0].aged_cluster, alpha=.7)
plt.yscale('symlog')
plt.xlabel('Finished Square Feet')
plt.ylabel('Log Error of Zestimate')
plt.title('Is there distinction between clusters when visualizing size of the home by the error in zestimate?')

plt.show()

In [None]:
df.describe().T

In [None]:
train, validate, test = train_validate_test_split(df)
print("train observations: ", train.shape)
print("validate observations: ", validate.shape)
print("test observations: ", test.shape)

In [None]:
train.describe().T

# Explore
- We are not going to explore the scaled data at this time, but it is important that the data is scaled before moving into clustering.

- Target Variable: 'logerror'

In [None]:
#What are the distributions of each variable (train)
for col in train.columns:
    plt.figure(figsize=(4,2))
    plt.hist(train[col])
    plt.title(col)
    plt.show()

##### Takeaways
- right skewed tax_value, square_feet, and tax_rate
- bit of a left skew on age
- log_error normally distributed

In [None]:
variables = ['bathrooms', 'bedrooms', 'sqft', 'latitude', 'longitude', 'lot_size', 'tax_value', 'age', 
            'tax_rate', 'price_per_sqft', 'county_code']

In [None]:
plot_against_target(df = train, target = 'logerror', var_list = variables)

In [None]:
cols = ['age', 'latitude', 'longitude', 'logerror']

sns.pairplot(data = train[cols], corner=True)

plt.suptitle('Amount of error is to see with Logerror', fontsize = 15)

plt.show()

In [None]:
sns.barplot(x="county", y="logerror", data=train)

In [None]:
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally
sns.boxplot(x=train.county, y=train.logerror)
plt.title('Potential difference in logerror across counties')
# zoom in on the boxes; the y-range is limited to +/- 0.16
plt.ylim(-.16, .16)
plt.show()

In [None]:
cols = ['age', 'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft', 'lot_size', 'tax_value', 'logerror']

sns.pairplot(data = train[cols], corner=True)

plt.suptitle('Amount of error is to see with Logerror', fontsize = 15)

plt.show()

1. Higher log errors occur in homes with these features:
    - Homes <60 yrs 
    - <= 4 bathrooms
    - <= 5 bedrooms
    - <= 2500sqft
    - < 1,000 dollars per sqft
    - ?? Lot size
    - ?? Home value

    
    

# Clustering

## Scaled Data

In [None]:
#scale
# Start w/ copies to retain the original splits
train_scaled = train.copy()
validate_scaled = validate.copy()
test_scaled = test.copy()


scaler = MinMaxScaler()
# every column except county gets scaled
# NOTE(review): cols still includes logerror, so the target is scaled in the *_scaled frames too
cols = train.drop(columns=["county"]).columns.tolist()


# Learn the min/max from train only and reuse them for validate and test.
# Re-fitting on each split would leak information from validate/test and put
# the three splits on different scales.
train_scaled[cols] = scaler.fit_transform(train[cols])
validate_scaled[cols] = scaler.transform(validate[cols])
test_scaled[cols] = scaler.transform(test[cols])

# county needs no special handling: it is carried over unchanged by .copy()
# and is not part of cols.

## cluster 1

In [None]:
X = train_scaled[['latitude', 'longitude']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=4, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['latitude', 'longitude']].mean()

In [None]:
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.longitude, subset.latitude, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='latitude', x='longitude', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('longitude')
plt.ylabel('latitude')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## cluster 2 (logerror - age)

In [None]:
X = train_scaled[['logerror', 'age']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=3, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'age']].mean()

In [None]:
plt.figure(figsize=(14, 9))

# one scatter series per cluster so each gets its own colour and legend entry
for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.age, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='age', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
# age is plotted on the x-axis and logerror on the y-axis
plt.xlabel('age')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 3 (logerror - bathrooms)

In [None]:
X = train_scaled[['logerror', 'bathrooms']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=3, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'bathrooms']].mean()

In [None]:
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.bathrooms, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='bathrooms', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('bathrooms')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 5

In [None]:
X = train_scaled[['logerror', 'bedrooms']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=2, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'bedrooms']].mean()

In [None]:
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.bedrooms, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='bedrooms', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('bedrooms')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 6

In [None]:
X = train_scaled[['logerror', 'sqft']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=5, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'sqft']].mean()

In [None]:
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.sqft, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='sqft', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('sqft')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 7

In [None]:
X = train_scaled[['logerror', 'price_per_sqft']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=4, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'price_per_sqft']].mean()

In [None]:
plt.figure(figsize=(14, 9))

# one scatter series per cluster so each gets its own colour and legend entry
for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.price_per_sqft, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='price_per_sqft', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('price_per_sqft')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 8

In [None]:
X = train_scaled[['logerror', 'lot_size']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=4, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'lot_size']].mean()

In [None]:
plt.figure(figsize=(14, 9))

# one scatter series per cluster so each gets its own colour and legend entry
for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.lot_size, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='lot_size', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('lot_size')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

## Cluster 9 (logerror - home value)

In [None]:
X = train_scaled[['logerror', 'tax_value']]
# fixed seed (13, the seed used elsewhere in this notebook) so cluster ids are reproducible
kmeans = KMeans(n_clusters=4, random_state=13)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# one row per cluster centre, same column names as X
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# mean of each clustered variable per cluster; several columns must be selected with a list
train_scaled.groupby('cluster')[['logerror', 'tax_value']].mean()

In [None]:
plt.figure(figsize=(14, 9))

# one scatter series per cluster so each gets its own colour and legend entry
for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.tax_value, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='tax_value', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('tax_value')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# The whitegrid style is called 'seaborn-v0_8-whitegrid' in newer matplotlib and
# 'seaborn-whitegrid' in older releases; use whichever is installed.
grid_style = next((s for s in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                   if s in plt.style.available), 'default')
with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # inertia for k = 2..11; fixed seed so the elbow curve is reproducible
    pd.Series({k: KMeans(k, random_state=13).fit(X).inertia_ for k in range(2, 12)}).plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

### Takeaways 
- Is a higher log error dependent on homes over 50 years old? (Cluster - 2)
- Is a higher log error dependent on homes under 1,000 sqft? (Cluster - 6)
- Is a higher log error dependent on homes whose ppsqft is less than 200? (Cluster - 7)
- Is a higher log error dependent on homes with a smaller lot size? (Cluster - 8)
- Is a higher log error dependent on less expensive homes? (Cluster - 9)

In [None]:
# heatmap of how each numeric column in train correlates with the absolute logerror
plt.figure(figsize=(8,12))
# abs_logerror is derived here so the cell does not depend on the column already
# existing in train (NOTE(review): if wrangle already adds it with a different
# definition, drop the assign). select_dtypes keeps only numeric columns, because
# corr() raises on non-numeric columns in recent pandas.
corr_with_abs_error = (
    train.assign(abs_logerror=train.logerror.abs())
         .select_dtypes(include='number')
         .corr()[['abs_logerror']]
         .sort_values(by='abs_logerror', ascending=True)
)
value_heatmap = sns.heatmap(corr_with_abs_error,
                            cmap='PuOr', vmin=-.5, vmax=.5, annot=True)
value_heatmap.set_title('Features Correlating with Absolute Logerror')
plt.show()

# Stats testing

##### Is a higher log error dependent on homes over 50 years old? (Cluster - 2)

In [None]:
# Chi-square test of independence: sign of logerror vs. home age over 50
Null = 'Is independent'
Alternate = 'Is dependent'

# contingency table of (logerror > 0) against (age > 50)
observed = pd.crosstab(train.logerror > 0, train.age > 50)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    # a non-significant result is not evidence for the null, so only report failing to reject it
    print(f'We fail to reject the null: {Null}')

##### Is a higher log error dependent on homes under 1,000 sqft? (Cluster - 6)

In [None]:
# Chi-square test of independence: sign of logerror vs. sqft over 1000
Null = 'Is independent'
Alternate = 'Is dependent'

# contingency table of (logerror > 0) against (sqft > 1000)
observed = pd.crosstab(train.logerror > 0, train.sqft > 1000)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    # a non-significant result is not evidence for the null, so only report failing to reject it
    print(f'We fail to reject the null: {Null}')

##### Is a higher log error dependent on homes whose ppsqft is less than 500? (Cluster - 7)

In [None]:
# Chi-square test of independence: sign of logerror vs. price_per_sqft under 500
Null = 'Is independent'
Alternate = 'Is dependent'

# contingency table of (logerror > 0) against (price_per_sqft < 500)
observed = pd.crosstab(train.logerror > 0, train.price_per_sqft < 500)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    # a non-significant result is not evidence for the null, so only report failing to reject it
    print(f'We fail to reject the null: {Null}')

##### Is a higher log error dependent on homes with a smaller lot size? (Cluster - 8)

In [None]:
# Chi-square test of independence: sign of logerror vs. lot_size under 236
# NOTE(review): the 236 threshold is hard-coded; its origin is not shown here
Null = 'Is independent'
Alternate = 'Is dependent'

# contingency table of (logerror > 0) against (lot_size < 236)
observed = pd.crosstab(train.logerror > 0, train.lot_size < 236)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    # a non-significant result is not evidence for the null, so only report failing to reject it
    print(f'We fail to reject the null: {Null}')

##### Is a higher log error dependent on less expensive homes? (Cluster - 9)

In [None]:
# Chi-square test of independence: sign of logerror vs. tax_value under 205000
# NOTE(review): the 205000 threshold is hard-coded; its origin is not shown here
Null = 'Is independent'
Alternate = 'Is dependent'

# contingency table of (logerror > 0) against (tax_value < 205000)
observed = pd.crosstab(train.logerror > 0, train.tax_value < 205000)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    # a non-significant result is not evidence for the null, so only report failing to reject it
    print(f'We fail to reject the null: {Null}')

# Modeling

### Feature Engineering

In [None]:
X_train = train_scaled[['age', 'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft', 'lot_size', 'tax_value']]#features
y_train = train.logerror
X_validate = validate_scaled[['age', 'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft', 'lot_size', 'tax_value']]#features
y_validate = validate.logerror
X_test = test_scaled[['age', 'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft', 'lot_size', 'tax_value']]#features
y_test = test.logerror


In [None]:
evaluate.rfe(X_train,y_train,1)

In [None]:
evaluate.rfe(X_train,y_train,3)

In [None]:
evaluate.select_kbest(X_train,y_train,1)

In [None]:
evaluate.select_kbest(X_train,y_train,3)

### Regression Modeling

In [None]:
#baseline function calculates baseline and adds columns to the dataframe
evaluate.get_baseline(train,train[['sqft']], train['logerror'])

In [None]:
evaluate.get_residuals(train, train['logerror'])

In [None]:
evaluate.plot_residual(train, train[['sqft']], train['logerror'])

In [None]:
evaluate.regression_errors(train, train['logerror'], train.yhat)

In [None]:
evaluate.baseline_mean_errors(train, train['logerror'], train.yhat_baseline)

In [None]:
evaluate.better_than_baseline(regression_errors = True, baseline_mean_errors = True)

# Baseline Model

In [None]:
model.model_baseline(y_train, y_validate, 'logerror')

## LinearRegression (OLS)

In [None]:
model.linear_regression(y_train, X_train, y_validate, X_validate)

## LassoLars

In [None]:
model.lassolars(y_train, X_train, y_validate, X_validate)

## Polynomial Regression

In [None]:
model.polynomialregression(y_train, X_train, y_validate, X_validate, X_test)

# Test

In [None]:
model.lassolars_test(X_test, y_test)

In [None]:
model.linear_regression_test(X_test, y_test)