In [None]:
#https://towardsdatascience.com/geopandas-101-plot-any-data-with-a-latitude-and-longitude-on-a-map-98e01944b972

In [None]:
# ignore warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from pandas / seaborn / matplotlib that flag
# calls which stop working after a library upgrade. Consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

#libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import date

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve

# Render floats in DataFrame output with thousands separators and 4 decimals,
# right-aligned in a 20-character field.
pd.set_option("display.float_format", '{:20,.4f}'.format)

#my libraries
from wrangle import get_connection, get_zillow_data, handle_missing_values, remove_columns, wrangle_zillow, remove_outliers, train_validate_test_split, get_hist, get_box
from explore import explore_univariate, exp_bivariate_categorical, exp_bivariate_continuous, exp_multivariate, exp_bivariate_categorical, plot_against_target
import env

#library imports
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# Statistical Tests
import scipy.stats as stats

In [None]:
# Load the Zillow dataset through the project's wrangle helper and preview two rows.
# NOTE(review): get_zillow_data lives in wrangle.py (not shown here); presumably it
# pulls from the database using the credentials in env — confirm.
df = get_zillow_data()
df.head(2)

In [None]:
# Run the project's wrangle_zillow helper on the loaded frame and preview the result.
# NOTE(review): this rebinds `df`, so the raw frame is no longer reachable and
# re-running this cell feeds already-wrangled data back into wrangle_zillow.
df = wrangle_zillow(df)
df.head(2)

In [None]:
# Split into the three modelling sets, then report the shape of each one.
train, validate, test = train_validate_test_split(df)
for split_name, split_frame in (("train", train), ("validate", validate), ("test", test)):
    print(f"{split_name} observations: ", split_frame.shape)

# Explore
- We are not going to explore the scaled data at this time, but it is important that the data is scaled before moving into clustering.

- Target Variable: 'logerror'

In [None]:
# What does the distribution of each variable look like in the train split?
# One small histogram per column, each drawn on its own figure.
for col in train.columns:
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.hist(train[col])
    ax.set_title(col)
    plt.show()

##### Takeaways
- right skewed tax_value, square_feet, and tax_rate
- bit of a left skew on age
- logerror is approximately normally distributed

In [None]:
# Feature columns to plot against the target, one per line for easy editing.
variables = [
    'bathrooms',
    'bedrooms',
    'sqft',
    'latitude',
    'longitude',
    'lot_size',
    'tax_value',
    'age',
    'tax_rate',
    'price_per_sqft',
]

In [None]:
# Plot each listed feature against logerror using the project's explore helper.
plot_against_target(
    df=train,
    target='logerror',
    var_list=variables,
)

In [None]:
# Lower-triangle pair grid of age and location columns together with logerror.
cols = ['age', 'latitude', 'longitude', 'logerror']
sns.pairplot(train[cols], corner=True)
plt.suptitle('Amount of error is to see with Logerror', fontsize=15)
plt.show()

In [None]:
# Bar height is seaborn's default estimate of logerror for each county.
sns.barplot(data=train, x="county", y="logerror")

##### Test whether or not Orange County has a higher log error than the rest of the population

- stats.ttest_ind(train[train.county == 'los_angeles'].logerror, train[train.county == 'orange'].logerror, equal_var=False)

- significance in location

##### Is there a relationship between bedroomcount and logerror?
- H0: Variance in logerror for homes with 5 or fewer bedrooms is equivalent to that of homes with more than 5 bedrooms
- alpha of 0.05
- stats.levene(train[train.bedroomcnt > 5].logerror, train[train.bedroomcnt <= 5].logerror)

##### Accounting for Bedroom counts, is there a relationship between error and geolocation via long/lat?
- train['abs_logerr'] = train.logerror.apply(lambda x: x if x >= 0 else -x)
- plt.figure(figsize=(14,8))
- sns.scatterplot(x='longitude', y='latitude', hue='county', size='abs_logerr', sizes=(0,300), data=train)
- plt.legend()
- plt.title('Geolocation and Absolute Logerror')
- plt.show()

In [None]:
# Compare the spread of logerror between counties.
# x / y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the earlier positional form raises a TypeError there.
sns.boxplot(x='county', y='logerror', data=train)
plt.title('Potential difference in logerror across counties')
# Zoom in on the boxes; points outside this range are not shown.
plt.ylim(-.16, .16)
plt.show()

In [None]:
# Lower-triangle pair grid of the size / value features together with logerror.
cols = [
    'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft',
    'lot_size', 'tax_value', 'logerror',
]
sns.pairplot(train[cols], corner=True)
plt.suptitle('Amount of error is to see with Logerror', fontsize=15)
plt.show()

1. Higher log errors for homes with these features:
    - 2-4 bathrooms: Highest 2.5
    - 2-5 bedrooms: Highest 3 bedroom
    - < 2500sqft: Highest 1,000
    - ?? Lot size
    - ?? Home value
    - roughly 60 yrs old
    - ?? Tax rate
    - < 1,000 dollars per sqft: Highest 250 dollars
    

# Clustering

- age, long, lat
- ppsqft, bath, bed
- tax_value, sqft, ppsqft

## Scaled Data

In [None]:
# Min-max scale every column of the train split except the `county` label.
# Work on a copy so the unscaled `train` frame stays intact.
train_scaled = train.copy()
scaler = MinMaxScaler()
cols = train.drop(columns=["county"]).columns.tolist()
# The scaler is fit on the train split and applied to the same rows.
# NOTE(review): `cols` also contains the target (logerror), so it is rescaled
# too — confirm that is intended.
train_scaled[cols] = scaler.fit_transform(train[cols])
# `county` is not in `cols`, so the copy above already carries it unchanged;
# it does not need to be re-attached.

## cluster 1

In [None]:
# Cluster the scaled training rows on location (latitude / longitude).
X = train_scaled[['latitude', 'longitude']]
# Seed the estimator: KMeans initialisation is random, so without a fixed
# random_state the labels and centroids change from run to run.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)
train_scaled['cluster'] = kmeans.predict(X)
# Centroid coordinates in the same (scaled) feature space, labelled by column.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# Per-cluster mean of the clustering features.
# Columns are selected with a list: selecting with a bare tuple
# (['a', 'b'] without the inner brackets) raises on pandas >= 2.0.
train_scaled.groupby('cluster')[['latitude', 'longitude']].mean()

In [None]:
# Scatter each cluster's points in its own colour, then overlay the centroids.
plt.figure(figsize=(14, 9))
ax = plt.gca()

for cluster, subset in train_scaled.groupby('cluster'):
    ax.scatter(subset.longitude, subset.latitude, label=f'cluster {cluster}', alpha=.6)

centroids.plot.scatter(x='longitude', y='latitude', c='black', marker='x', s=1000, ax=ax, label='centroid')

ax.legend()
ax.set_xlabel('longitude')
ax.set_ylabel('latitude')
ax.set_title('Visualizing Cluster Centers')

In [None]:
# Elbow plot: inertia of a KMeans fit on X for k = 2..11.
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed, so use whichever this install provides.
grid_style = ('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Fixed random_state so the curve is identical on every run.
    inertia_by_k = pd.Series(
        {k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in range(2, 12)}
    )
    inertia_by_k.plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Compare cluster assignments on the location features for k = 2..5, one panel per k.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(2, 6)):
    # Fixed random_state keeps the panels reproducible between runs.
    clusters = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    # Longitude on x and latitude on y, matching the centroid plot; the axis
    # labels now name the plotted columns instead of the iris placeholders.
    ax.scatter(X.longitude, X.latitude, c=clusters)
    ax.set(title='k = {}'.format(k), xlabel='longitude', ylabel='latitude')


## cluster 2

In [None]:
# Cluster the scaled training rows on home size and tax value.
X = train_scaled[['sqft', 'tax_value']]
# Seed the estimator: KMeans initialisation is random, so without a fixed
# random_state the labels and centroids change from run to run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
# NOTE: this overwrites any existing 'cluster' column on train_scaled.
train_scaled['cluster'] = kmeans.predict(X)
# Centroid coordinates in the same (scaled) feature space, labelled by column.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# Per-cluster mean of the clustering features.
# Columns are selected with a list: selecting with a bare tuple
# (['a', 'b'] without the inner brackets) raises on pandas >= 2.0.
train_scaled.groupby('cluster')[['sqft', 'tax_value']].mean()

In [None]:
# Scatter each cluster's points in its own colour, then overlay the centroids.
plt.figure(figsize=(14, 9))
ax = plt.gca()

for cluster, subset in train_scaled.groupby('cluster'):
    ax.scatter(subset.tax_value, subset.sqft, label=f'cluster {cluster}', alpha=.6)

centroids.plot.scatter(x='tax_value', y='sqft', c='black', marker='x', s=1000, ax=ax, label='centroid')

ax.legend()
ax.set_xlabel('tax_value')
ax.set_ylabel('sqft')
ax.set_title('Visualizing Cluster Centers')

In [None]:
# Elbow plot: inertia of a KMeans fit on X for k = 2..11.
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed, so use whichever this install provides.
grid_style = ('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Fixed random_state so the curve is identical on every run.
    inertia_by_k = pd.Series(
        {k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in range(2, 12)}
    )
    inertia_by_k.plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Compare cluster assignments on sqft / tax_value for k = 2..5, one panel per k.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(2, 6)):
    # Fixed random_state keeps the panels reproducible between runs.
    clusters = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    # tax_value on x and sqft on y so the data agrees with the axis labels
    # (previously the two columns were plotted the other way round).
    ax.scatter(X.tax_value, X.sqft, c=clusters)
    ax.set(title='k = {}'.format(k), xlabel='tax_value', ylabel='sqft')


## Cluster 3

In [None]:
# Cluster the scaled training rows on home size and logerror.
# NOTE(review): logerror is used as a clustering feature here — if it is the
# modelling target, clusters built from it cannot be used as model inputs; confirm intent.
X = train_scaled[['sqft', 'logerror']]
# Seed the estimator: KMeans initialisation is random, so without a fixed
# random_state the labels and centroids change from run to run.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
# NOTE: this overwrites any existing 'cluster' column on train_scaled.
train_scaled['cluster'] = kmeans.predict(X)
# Centroid coordinates in the same (scaled) feature space, labelled by column.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

In [None]:
# Per-cluster mean of the clustering features.
# Columns are selected with a list: selecting with a bare tuple
# (['a', 'b'] without the inner brackets) raises on pandas >= 2.0.
train_scaled.groupby('cluster')[['sqft', 'logerror']].mean()

In [None]:
# Scatter each cluster's points in its own colour, then overlay the centroids.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.sqft, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='logerror', x='sqft', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
# sqft is plotted on x and logerror on y, so label the axes accordingly
# (the two labels were previously swapped).
plt.xlabel('sqft')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# Elbow plot: inertia of a KMeans fit on X for k = 2..11.
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was later removed, so use whichever this install provides.
grid_style = ('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

with plt.style.context(grid_style):
    plt.figure(figsize=(9, 6))
    # Fixed random_state so the curve is identical on every run.
    inertia_by_k = pd.Series(
        {k: KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in range(2, 12)}
    )
    inertia_by_k.plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
# Compare cluster assignments on sqft / logerror for k = 2..5, one panel per k.
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(2, 6)):
    # Fixed random_state keeps the panels reproducible between runs.
    clusters = KMeans(n_clusters=k, random_state=42).fit(X).predict(X)
    ax.scatter(X.sqft, X.logerror, c=clusters)
    # sqft is on x and logerror on y; the labels were previously swapped.
    ax.set(title='k = {}'.format(k), xlabel='sqft', ylabel='logerror')