## Clustering and Regressions

We have 2 basic data sources:

[Merged Sales] - This is sales data by zip code. It came from https://www.redfin.com/news/data-center/ and it's got a lot of stats in there. Most of the data is medians so it doesn't tell us much about the outliers which almost certainly are skewing some of the data pretty heavily. The Price per SQ Foot is one way to see a little bit of how the market is shaped. These are broken out by zip code and month.

[2018_demographic data] - This is 2018 demographic info by zip code.

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.parser import parse
from sklearn.cluster import KMeans
from sklearn import linear_model

### Import Data
#### Demographics

In [144]:
demographics = pd.read_csv("2018_demographic_data_edited.csv", delimiter=',')

print(demographics.shape) # removed some of the blank columns and columns with strings in excel

demo = demographics.fillna(demographics.mean())
demo.head()

(33120, 2151)


Unnamed: 0,zip,Geo_SUMLEV,Geo_GEOCOMP,Geo_LOGRECNO,Geo_ZCTA3,Geo_ZCTA5,SE_A00001_001,SE_A00002_001,SE_A00002_002,SE_A00002_003,...,SE_A10065_001,SE_A10065_002,SE_A10066_001,SE_A10066_002,SE_A10066_003,SE_A10066_004,SE_A10066_005,SE_A10066_006,SE_A10066_007,SE_A10066_008
0,601,860,0,7459,6,601,17242,17242,267.9506,64.34769,...,3237,1849,5517,1346,1768,1222,889,214,46,32
1,602,860,0,7460,6,602,38442,38442,1255.421,30.620812,...,5636,2721,12738,3107,4024,2467,2345,588,163,44
2,603,860,0,7461,6,603,48814,48814,1543.925,31.61682,...,8627,4772,19233,5121,6256,4058,2523,1085,118,72
3,606,860,0,7462,6,606,6437,6437,152.1423,42.309073,...,975,588,2014,474,693,343,343,120,19,22
4,610,860,0,7463,6,610,27073,27073,753.8562,35.912685,...,4317,2125,8858,2194,2972,1679,1440,486,55,32


#### Sales

In [59]:
sales = pd.read_csv("med_sale_price_yoy.csv", delimiter=',')
sales.rename(columns={"Zip Code": "zip"}, inplace = True)
sales.head()

Unnamed: 0,zip,Feb-16,Mar-16,Apr-16,May-16,Jun-16,Jul-16,Aug-16,Sep-16,Oct-16,...,Dec-19,Jan-20,Feb-20,Mar-20,Apr-20,May-20,Jun-20,Jul-20,Aug-20,Sep-20
0,501,,,,,,,,,,...,,,,,,,,,,
1,1005,15.40%,5.70%,-29.70%,-24.00%,33.70%,8.60%,5.70%,-9.10%,-4.10%,...,7.30%,-4.30%,6.30%,-7.90%,2.20%,-4.40%,12.40%,-1.30%,4.50%,1.30%
2,1010,,,,,,,,,,...,,,,,,,,,,
3,1031,705.00%,612.00%,-24.60%,-24.60%,-83.20%,-13.70%,-3.30%,-2.00%,27.60%,...,43.90%,-13.80%,70.90%,162.00%,126.90%,-1.00%,-37.80%,-48.30%,-51.10%,681.40%
4,1037,,,-23.90%,,102.70%,150.40%,150.40%,,,...,,,,,,,,,-24.40%,-14.50%


## Clustering Algorithm
### K Means
#### Run Model

In [4]:
km = KMeans(n_clusters=6, init='k-means++')
clstrs = km.fit(demo)
print (clstrs.cluster_centers_.shape)
print (clstrs.labels_)

(6, 2151)
[0 2 2 ... 0 0 0]


#### Add cluster labels and sales data

In [145]:
#add the column for clusters
demo['cluster'] = clstrs.labels_
print(demo.shape)
clusters = pd.DataFrame(demo[['zip','cluster']])
print(clusters.shape)
clusters.head()

(33120, 2152)
(33120, 2)


Unnamed: 0,zip,cluster
0,601,0
1,602,2
2,603,2
3,606,0
4,610,2


In [67]:
#join the sales data (has less zip codes, so right join)
data = clusters.set_index('zip').join(sales.set_index('zip'), how ='right')

print(data.shape)
data.head()

(16201, 57)


Unnamed: 0_level_0,cluster,Feb-16,Mar-16,Apr-16,May-16,Jun-16,Jul-16,Aug-16,Sep-16,Oct-16,...,Dec-19,Jan-20,Feb-20,Mar-20,Apr-20,May-20,Jun-20,Jul-20,Aug-20,Sep-20
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
501,,,,,,,,,,,...,,,,,,,,,,
1005,0.0,15.40%,5.70%,-29.70%,-24.00%,33.70%,8.60%,5.70%,-9.10%,-4.10%,...,7.30%,-4.30%,6.30%,-7.90%,2.20%,-4.40%,12.40%,-1.30%,4.50%,1.30%
1010,0.0,,,,,,,,,,...,,,,,,,,,,
1031,0.0,705.00%,612.00%,-24.60%,-24.60%,-83.20%,-13.70%,-3.30%,-2.00%,27.60%,...,43.90%,-13.80%,70.90%,162.00%,126.90%,-1.00%,-37.80%,-48.30%,-51.10%,681.40%
1037,0.0,,,-23.90%,,102.70%,150.40%,150.40%,,,...,,,,,,,,,-24.40%,-14.50%


#### Separate into groups

In [188]:
print('Group 0')
clust0 = data[data['cluster']==0]
print(clust0.shape)
clust0 = clust0.drop(columns=['cluster'])
clust0_melt = pd.melt(clust0.reset_index(), id_vars=['zip'], value_vars=list(clust0.columns), var_name='date', value_name='yoy')
print(clust0_melt.shape)
clust0 = clust0_melt.dropna()
clust0.reset_index(drop=True, inplace=True)
clust0['parsed_date'] = pd.to_datetime(clust0.loc[:,'date'], format='%b-%y')
clust0['days_from_start'] = clust0.loc[:,'parsed_date'] - clust0.loc[0, 'parsed_date']
clust0['yoy'] = clust0['yoy'].str.replace(r'%', r'')
clust0['yoy'] = pd.to_numeric(clust0["yoy"], downcast="float")
print(clust0.shape)
print(clust0.head(-10))

print('''
Group 1''')
clust1 = data[data['cluster']==1]
clust1 = clust1.drop(columns=['cluster'])
clust1_melt = pd.melt(clust1.reset_index(), id_vars=['zip'], value_vars=list(clust1.columns), var_name='date', value_name='yoy')
clust1 = clust1_melt.dropna()
clust1.reset_index(drop=True, inplace=True)
clust1['parsed_date'] = pd.to_datetime(clust1.loc[:,'date'], format='%b-%y')
clust1['days_from_start'] = clust1.loc[:,'parsed_date'] - clust1.loc[0, 'parsed_date']
print(clust1.shape)
print(clust1.head(-10))

print('''
Group 2''')
clust2 = data[data['cluster']==2]
clust2 = clust2.drop(columns=['cluster'])
clust2_melt = pd.melt(clust2.reset_index(), id_vars=['zip'], value_vars=list(clust2.columns), var_name='date', value_name='yoy')
clust2 = clust2_melt.dropna()
clust2.reset_index(drop=True, inplace=True)
clust2['parsed_date'] = pd.to_datetime(clust2.loc[:,'date'], format='%b-%y')
clust2['days_from_start'] = clust2.loc[:,'parsed_date'] - clust2.loc[0, 'parsed_date']
print(clust2.shape)
print(clust2.head(-10))

print('''
Group 3''')
clust3 = data[data['cluster']==3]
clust3 = clust3.drop(columns=['cluster'])
clust3_melt = pd.melt(clust3.reset_index(), id_vars=['zip'], value_vars=list(clust3.columns), var_name='date', value_name='yoy')
clust3 = clust3_melt.dropna()
clust3.reset_index(drop=True, inplace=True)
clust3['parsed_date'] = pd.to_datetime(clust3['date'], format='%b-%y')
clust3['days_from_start'] = clust3.loc[:,'parsed_date'] - clust3.loc[0, 'parsed_date']
print(clust3.shape)
print(clust3.head(-10))

print('''
Group 4''')
clust4 = data[data['cluster']==4]
clust4 = clust4.drop(columns=['cluster'])
clust4_melt = pd.melt(clust4.reset_index(), id_vars=['zip'], value_vars=list(clust4.columns), var_name='date', value_name='yoy')
clust4 = clust4_melt.dropna()
clust4.reset_index(drop=True, inplace=True)
clust4['parsed_date'] = pd.to_datetime(clust4['date'], format='%b-%y')
clust4['days_from_start'] = clust4.loc[:,'parsed_date'] - clust4.loc[0, 'parsed_date']
print(clust4.shape)
print(clust4.head(-10))

print('''
Group 5''')
clust5 = data[data['cluster']==5]
clust5 = clust5.drop(columns=['cluster'])
clust5_melt = pd.melt(clust5.reset_index(), id_vars=['zip'], value_vars=list(clust5.columns), var_name='date', value_name='yoy')
clust5 = clust5_melt.dropna()
clust5.reset_index(drop=True, inplace=True)
clust5['parsed_date'] = pd.to_datetime(clust5['date'], format='%b-%y')
clust5['days_from_start'] = clust5.loc[:,'parsed_date'] - clust5.loc[0, 'parsed_date']
print(clust5.shape)
print(clust5.head(-10))

Group 0
(6225, 57)
(348600, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust0['parsed_date'] = pd.to_datetime(clust0.loc[:,'date'], format='%b-%y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust0['days_from_start'] = clust0.loc[:,'parsed_date'] - clust0.loc[0, 'parsed_date']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust0['yoy'] = clust0['yoy'].str.replace(r'

(243137, 5)
          zip    date         yoy parsed_date days_from_start
0        1005  Feb-16   15.400000  2016-02-01          0 days
1        1031  Feb-16  705.000000  2016-02-01          0 days
2        1068  Feb-16  -13.200000  2016-02-01          0 days
3        1083  Feb-16  -63.599998  2016-02-01          0 days
4        1366  Feb-16   20.100000  2016-02-01          0 days
...       ...     ...         ...         ...             ...
243122  99137  Sep-20  -11.400000  2020-09-01       1674 days
243123  99141  Sep-20   -7.900000  2020-09-01       1674 days
243124  99148  Sep-20   72.199997  2020-09-01       1674 days
243125  99173  Sep-20   -4.400000  2020-09-01       1674 days
243126  99181  Sep-20  -11.300000  2020-09-01       1674 days

[243127 rows x 5 columns]

Group 1
(100900, 5)
          zip    date      yoy parsed_date days_from_start
0        1453  Feb-16   19.50%  2016-02-01          0 days
1        1545  Feb-16  -15.90%  2016-02-01          0 days
2        1702  Feb-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust1['days_from_start'] = clust1.loc[:,'parsed_date'] - clust1.loc[0, 'parsed_date']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust2['parsed_date'] = pd.to_datetime(clust2.loc[:,'date'], format='%b-%y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clust2['days_from_start'] = clust2.loc[:,'pa

#### Run Linear Regressions for each Group
##### Cluster 0

In [197]:
x = clust0['days_from_start'].values
y = clust0['yoy'].values


x = x.reshape(-1, 1)

model = linear_model.LinearRegression().fit(x, y)
linear_model.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


# x = list(x.flatten())
# y = list(y)

# print(x)
# print(y)

# plt.scatter(x, y,  color='black')

# plt.xticks(())
# plt.yticks(())

# plt.show()

LinearRegression(n_jobs=1)

##### Cluster 1