In [None]:
# Silence every warning for the whole notebook.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
import warnings
warnings.filterwarnings("ignore")

# Core data libraries
import pandas as pd
import numpy as np
from datetime import date

# Plotting
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
# Display floats in a 20-character-wide field with thousands separators and 4 decimals
pd.options.display.float_format = '{:20,.4f}'.format

# Project-local modules (wrangle.py, explore.py, evaluate.py, model.py, env.py)
from wrangle import get_zillow_data, wrangle_zillow, remove_outliers, train_validate_test_split, get_hist, get_box
from explore import inertia, variable_distributions, plot_against_target
import evaluate
import model
import env

# scikit-learn (the `date` import below repeats the one above; harmless)
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import learning_curve

# Statistical tests
import scipy.stats as stats

# Significance level that the chi-square cells compare their p-values against
alpha = .05

# Executive Summary

## Plan

## Acquired

In [None]:
# Load the raw data through the project's wrangle.get_zillow_data helper
# and preview the first two rows.
df = get_zillow_data()
df.head(2)

## Prepared

In [None]:
# Clean the acquired frame with the project's wrangle_zillow helper and preview it.
# NOTE(review): df is overwritten with its own wrangled version, so re-running
# this cell passes already-wrangled data back into wrangle_zillow.
df = wrangle_zillow(df)
df.head(2)

### Split

In [None]:
# Split the prepared frame into the three modelling subsets.
train, validate, test = train_validate_test_split(df)

# Report the (rows, columns) shape of each subset.
shape_report = (
    ("train observations by shape: ", train),
    ("validate observations by shape: ", validate),
    ("test observations by shape: ", test),
)
for message, split_frame in shape_report:
    print(message, split_frame.shape)

## Explore

##### Train variable distributions

In [None]:
# Project helper from explore.py, run on the train split only
# (presumably plots one distribution per column — confirm in explore.py).
variable_distributions(train)

##### Log error distributions

In [None]:
# Project helper from explore.py, run on the train split only
# (presumably plots each feature against logerror — confirm in explore.py).
plot_against_target(train)

In [None]:
# Bar plot of logerror per county on the train split; seaborn's barplot shows
# the mean of y for each x category by default.
sns.barplot(x="county", y="logerror", data=train)

### Clustering

#### Scale

In [None]:
# Work on copies so the unscaled train / validate / test splits stay available.
train_scaled = train.copy()
validate_scaled = validate.copy()
test_scaled = test.copy()

scaler = MinMaxScaler()
# Scale every column except "county" (the object column that cannot be scaled).
cols = train.drop(columns=["county"]).columns.tolist()

# Learn each column's min/max from train ONLY, then apply that same mapping to
# validate and test. Re-fitting the scaler on each split would put every split
# on its own scale and leak validate/test statistics into preprocessing.
train_scaled[cols] = scaler.fit_transform(train[cols])
validate_scaled[cols] = scaler.transform(validate[cols])
test_scaled[cols] = scaler.transform(test[cols])
# "county" is not in cols, so the copies made above still carry it unchanged;
# no re-assignment is needed.

In [None]:
# Heatmap of each column's correlation with abs_logerror, sorted ascending.
# This uses the unscaled train split, not train_scaled.
plt.figure(figsize=(8,12))
# Keep only numeric/bool columns before calling corr(): the frame also holds
# the non-numeric "county" column, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 instead of silently dropping them.
abs_logerror_corr = (
    train.select_dtypes(include=['number', 'bool'])
    .corr()[['abs_logerror']]
    .sort_values(by='abs_logerror', ascending=True)
)
value_heatmap = sns.heatmap(abs_logerror_corr, cmap='PuOr', vmin=-.5, vmax=.5, annot=True)
value_heatmap.set_title('Features Correlating with Absolute Logerror')
plt.show()

#### Cluster 1: Latitude and longitude clusters

In [None]:
# K-means (k=4) on the scaled latitude / longitude columns.
X = train_scaled[['latitude', 'longitude']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['latitude', 'longitude']].mean()

In [None]:
# Scatter of longitude (x) vs. latitude (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.longitude, subset.latitude, label='cluster ' + str(cluster), alpha=.6)

# Draw the centroids once, after the loop. Inside the loop they were re-plotted
# for every cluster and added a duplicate 'centroid' legend entry each time.
centroids.plot.scatter(y='latitude', x='longitude', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia ()

#### Cluster 2: Log error to age of the home clusters

In [None]:
# K-means (k=3) on the scaled logerror and age columns.
X = train_scaled[['logerror', 'age']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['logerror', 'age']].mean()

In [None]:
# Scatter of age (x) vs. logerror (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.age, subset.logerror, label='cluster ' + str(cluster), alpha=.6)
centroids.plot.scatter(y='logerror', x='age', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
# Axis labels follow the plotted data: age is on x and logerror on y
# (they were previously swapped).
plt.xlabel('age')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia ()

#### Cluster 6: Log error to home square footage clusters

In [None]:
# K-means (k=5) on the scaled logerror and sqft columns.
X = train_scaled[['logerror', 'sqft']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['logerror', 'sqft']].mean()

In [None]:
# Scatter of sqft (x) vs. logerror (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster_id, members in train_scaled.groupby('cluster'):
    plt.scatter(
        members.sqft,
        members.logerror,
        alpha=.6,
        label='cluster ' + str(cluster_id),
    )

# Overlay the k-means centroids on the same axes as large black crosses.
current_ax = plt.gca()
centroids.plot.scatter(x='sqft', y='logerror', ax=current_ax, c='black', marker='x', s=1000, label='centroid')

plt.legend()
plt.xlabel('sqft')
plt.ylabel('logerror')
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia ()

#### Cluster 7: Log error to price per square footage clusters

In [None]:
# K-means (k=4) on the scaled logerror and price_per_sqft columns.
X = train_scaled[['logerror', 'price_per_sqft']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['logerror', 'price_per_sqft']].mean()

In [None]:
# Scatter of price_per_sqft (x) vs. logerror (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.price_per_sqft, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

# Overlay the k-means centroids on the same axes.
centroids.plot.scatter(y='logerror', x='price_per_sqft', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('price_per_sqft')
plt.ylabel('logerror')  # was misspelled 'loerror'
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia()

#### Cluster 8: Log error to lot size clusters

In [None]:
# K-means (k=4) on the scaled logerror and lot_size columns.
X = train_scaled[['logerror', 'lot_size']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['logerror', 'lot_size']].mean()

In [None]:
# Scatter of lot_size (x) vs. logerror (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.lot_size, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

# Overlay the k-means centroids on the same axes.
centroids.plot.scatter(y='logerror', x='lot_size', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('lot_size')
plt.ylabel('logerror')  # was misspelled 'loerror'
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia()

#### Cluster 9: Log error to home value clusters

In [None]:
# K-means (k=4) on the scaled logerror and tax_value columns.
X = train_scaled[['logerror', 'tax_value']]
# Fixed seed so cluster labels and centroids are reproducible across runs.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
# The 'cluster' column is reused by every clustering cell in this notebook,
# so this assignment replaces any labels written by another clustering cell.
train_scaled['cluster'] = kmeans.predict(X)
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
# Select the columns with a list: tuple-style ['a', 'b'] indexing on a GroupBy
# is rejected by pandas >= 2.0.
train_scaled.groupby('cluster')[['logerror', 'tax_value']].mean()

In [None]:
# Scatter of tax_value (x) vs. logerror (y) from train_scaled, one series per cluster.
plt.figure(figsize=(14, 9))

for cluster, subset in train_scaled.groupby('cluster'):
    plt.scatter(subset.tax_value, subset.logerror, label='cluster ' + str(cluster), alpha=.6)

# Overlay the k-means centroids on the same axes.
centroids.plot.scatter(y='logerror', x='tax_value', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('tax_value')
plt.ylabel('logerror')  # was misspelled 'loerror'
plt.title('Visualizing Cluster Centers')

In [None]:
# NOTE(review): inertia() (from explore.py) is called with no arguments, so it
# is not passed this section's X — confirm which data the helper evaluates.
inertia()

### Takeaways 
- Is a higher log error dependent on homes over 50 years old? (Cluster - 2)
- Is a higher log error dependent on homes smaller than 1000 sqft? (Cluster - 6)
- Is a higher log error dependent on homes whose price per sqft is less than 200? (Cluster - 7)
- Is a higher log error dependent on homes with a smaller lot size? (Cluster - 8)
- Is a higher log error dependent on less expensive homes? (Cluster - 9)

### Statistical Testing

##### Is a higher log error dependent on homes over 50 years old? (Cluster - 2)

In [None]:
# Chi-square test of independence between two boolean flags built from the
# unscaled train split: logerror > 0 and age > 50.
# NOTE(review): logerror > 0 captures the sign of the error, not its size —
# confirm this matches the "higher log error" question in the heading.
Null = 'Is independent'
Alternate = 'Is dependent'

positive_logerror = train.logerror > 0
older_than_50 = train.age > 50
observed = pd.crosstab(positive_logerror, older_than_50)
chi2, p, degf, expected = stats.chi2_contingency(observed)

for line in (f'chi^2 = {chi2:.4f}', f'p     = {p:.4f}', '\n'):
    print(line)

# Compare the p-value against the notebook-wide significance level.
conclusion = (
    f'We reject the null and accept the alternate: {Alternate}'
    if p < alpha
    else f'We fail to reject the null and accept the null: {Null}'
)
print(conclusion)

##### Is a higher log error dependent on homes smaller than 1000 sqft? (Cluster - 6)

In [None]:
# Chi-square test of independence between two boolean flags built from the
# unscaled train split: logerror > 0 and sqft < 1000.
Null = 'Is independent'
Alternate = 'Is dependent'

# Flag homes UNDER 1000 sqft, matching the question being tested and the
# "less than" direction used by the other chi-square cells (this was `> 1000`).
observed = pd.crosstab(train.logerror > 0, train.sqft < 1000)
chi2, p, degf, expected = stats.chi2_contingency(observed)

print(f'chi^2 = {chi2:.4f}')
print(f'p     = {p:.4f}')

print('\n')
# Compare the p-value against the notebook-wide significance level.
if p < alpha:
    print(f'We reject the null and accept the alternate: {Alternate}')
else:
    print(f'We fail to reject the null and accept the null: {Null}')

##### Is a higher log error dependent on homes whose price per sqft is less than 500? (Cluster - 7)

In [None]:
# Chi-square test of independence between two boolean flags built from the
# unscaled train split: logerror > 0 and price_per_sqft < 500.
# NOTE(review): logerror > 0 captures the sign of the error, not its size —
# confirm this matches the "higher log error" question in the heading.
Null = 'Is independent'
Alternate = 'Is dependent'

positive_logerror = train.logerror > 0
ppsqft_under_500 = train.price_per_sqft < 500
observed = pd.crosstab(positive_logerror, ppsqft_under_500)
chi2, p, degf, expected = stats.chi2_contingency(observed)

for line in (f'chi^2 = {chi2:.4f}', f'p     = {p:.4f}', '\n'):
    print(line)

# Compare the p-value against the notebook-wide significance level.
conclusion = (
    f'We reject the null and accept the alternate: {Alternate}'
    if p < alpha
    else f'We fail to reject the null and accept the null: {Null}'
)
print(conclusion)

##### Is a higher log error dependent on homes with a smaller lot size? (Cluster - 8)

In [None]:
# Chi-square test of independence between two boolean flags built from the
# unscaled train split: logerror > 0 and lot_size < 236.
# NOTE(review): logerror > 0 captures the sign of the error, not its size —
# confirm this matches the "higher log error" question in the heading.
# NOTE(review): the 236 cut-off is not explained anywhere in this notebook.
Null = 'Is independent'
Alternate = 'Is dependent'

positive_logerror = train.logerror > 0
small_lot = train.lot_size < 236
observed = pd.crosstab(positive_logerror, small_lot)
chi2, p, degf, expected = stats.chi2_contingency(observed)

for line in (f'chi^2 = {chi2:.4f}', f'p     = {p:.4f}', '\n'):
    print(line)

# Compare the p-value against the notebook-wide significance level.
conclusion = (
    f'We reject the null and accept the alternate: {Alternate}'
    if p < alpha
    else f'We fail to reject the null and accept the null: {Null}'
)
print(conclusion)

##### Is a higher log error dependent on less expensive homes? (Cluster - 9)

In [None]:
# Chi-square test of independence between two boolean flags built from the
# unscaled train split: logerror > 0 and tax_value < 205000.
# NOTE(review): logerror > 0 captures the sign of the error, not its size —
# confirm this matches the "higher log error" question in the heading.
# NOTE(review): the 205000 cut-off is not explained anywhere in this notebook.
Null = 'Is independent'
Alternate = 'Is dependent'

positive_logerror = train.logerror > 0
lower_value_home = train.tax_value < 205000
observed = pd.crosstab(positive_logerror, lower_value_home)
chi2, p, degf, expected = stats.chi2_contingency(observed)

for line in (f'chi^2 = {chi2:.4f}', f'p     = {p:.4f}', '\n'):
    print(line)

# Compare the p-value against the notebook-wide significance level.
conclusion = (
    f'We reject the null and accept the alternate: {Alternate}'
    if p < alpha
    else f'We fail to reject the null and accept the null: {Null}'
)
print(conclusion)

## Model

### Feature Engineering

In [None]:
# Candidate predictors, taken from the scaled copy of each split.
feature_cols = ['age', 'bathrooms', 'bedrooms', 'sqft', 'price_per_sqft', 'lot_size', 'tax_value']

X_train, X_validate, X_test = (
    scaled_split[feature_cols]
    for scaled_split in (train_scaled, validate_scaled, test_scaled)
)

# Targets are read from the unscaled splits.
y_train, y_validate, y_test = train.logerror, validate.logerror, test.logerror

In [None]:
# Project helper evaluate.rfe on the scaled train features and the logerror target.
# NOTE(review): the third argument (1) is presumably the number of features to keep — confirm in evaluate.py.
evaluate.rfe(X_train,y_train,1)

In [None]:
# Project helper evaluate.rfe on the scaled train features and the logerror target.
# NOTE(review): the third argument (3) is presumably the number of features to keep — confirm in evaluate.py.
evaluate.rfe(X_train,y_train,3)

In [None]:
# Project helper evaluate.select_kbest on the scaled train features and the logerror target.
# NOTE(review): the third argument (1) is presumably k, the number of features to select — confirm in evaluate.py.
evaluate.select_kbest(X_train,y_train,1)

In [None]:
# Project helper evaluate.select_kbest on the scaled train features and the logerror target.
# NOTE(review): the third argument (3) is presumably k, the number of features to select — confirm in evaluate.py.
evaluate.select_kbest(X_train,y_train,3)

### Regression Modeling

In [None]:
#baseline function calculates baseline and adds columns to the dataframe
evaluate.get_baseline(train,train[['sqft']], train['logerror'])

In [None]:
# Project helper evaluate.get_residuals, given the unscaled train split and its logerror column.
# NOTE(review): presumably relies on prediction columns already present on train — confirm in evaluate.py.
evaluate.get_residuals(train, train['logerror'])

In [None]:
# Project helper evaluate.plot_residual, given the unscaled train split with
# sqft as the feature and logerror as the target.
evaluate.plot_residual(train, train[['sqft']], train['logerror'])

In [None]:
# Project helper evaluate.regression_errors, given actual logerror and the train.yhat column.
# NOTE(review): train.yhat must already exist on train (added by an earlier cell) for this to run.
evaluate.regression_errors(train, train['logerror'], train.yhat)

In [None]:
# Project helper evaluate.baseline_mean_errors, given actual logerror and the train.yhat_baseline column.
# NOTE(review): train.yhat_baseline must already exist on train (added by an earlier cell) for this to run.
evaluate.baseline_mean_errors(train, train['logerror'], train.yhat_baseline)

In [None]:
# NOTE(review): both arguments are the literal True, not error values computed
# in the cells above — confirm this is what evaluate.better_than_baseline expects.
evaluate.better_than_baseline(regression_errors = True, baseline_mean_errors = True)

### Baseline Model

In [None]:
# Project helper model.model_baseline, given the train and validate targets and the target's name.
model.model_baseline(y_train, y_validate, 'logerror')

#### Linear Regression

In [None]:
# Project helper model.linear_regression, given train and validate targets/features
# (presumably fits on train and scores on both — confirm in model.py).
model.linear_regression(y_train, X_train, y_validate, X_validate)

#### LassoLars

In [None]:
# Project helper model.lassolars, given train and validate targets/features
# (presumably fits on train and scores on both — confirm in model.py).
model.lassolars(y_train, X_train, y_validate, X_validate)

#### Polynomial Regression

In [None]:
# Project helper model.polynomialregression, given train and validate targets/features plus X_test.
# NOTE(review): X_test is passed in during model selection — presumably only to
# transform it into polynomial features; confirm it is not used for fitting or scoring.
model.polynomialregression(y_train, X_train, y_validate, X_validate, X_test)

### Test

In [None]:
# Final evaluation of the linear regression on the held-out test split.
# NOTE(review): no training data or fitted model is passed in — confirm how
# model.linear_regression_test obtains the model it scores.
model.linear_regression_test(X_test, y_test)

## Conclusion