## Finding B2R Communities
### best algo uses dbscan

In [1]:
import os
import pandas as pd
from datetime import datetime
import warnings

import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from ipywidgets import interactive
from collections import defaultdict
import hdbscan
import folium
import re

sns.set(style="white")

In [2]:
warnings.filterwarnings('ignore') 

In [3]:
cur_dir = os.getcwd()
cur_dir

'C:\\Users\\jnunez\\Documents\\Projects\\SFR B2R Community Clustering'

In [4]:
data_dir = cur_dir + '\\Data'

In [5]:
# import data from data file
os.chdir(data_dir)

# Upload

In [6]:
# view files
files = sorted(os.listdir(data_dir))
files

['DBSCAN',
 'HDBSCAN',
 'MasterList_B2R_02-01-2023.xlsx',
 'MasterList_B2R_03-08-2022.xlsx',
 'MasterList_B2R_07-13-2022.xlsx',
 'New folder',
 'Old Data & Output',
 'SFR_GeoTable.xlsx',
 'SuperList_B2R_02-01-2023.xlsx',
 'SuperList_B2R_03-08-2022.xlsx',
 'SuperList_B2R_07-13-2022.xlsx',
 'sfr_geofile.csv',
 'sfr_table.csv',
 '~$MasterList_B2R_02-01-2023.xlsx',
 '~$MasterList_B2R_03-08-2022.xlsx',
 '~$MasterList_B2R_07-13-2022.xlsx',
 '~$SuperList_B2R_02-01-2023.xlsx']

Note that SFR_GeoTable holds the latest info - use it and not the other sfr files

In [7]:
'''
# merge corresponding master files

master_files = [file for file in files if 'MasterList' in file]
# to check sfr clusters against with B2R column (Louis's supervised learning observed answer)
master_list = pd.DataFrame()
i = 0
for master in master_files:
    new = pd.read_excel(master)
    d = master.split('_')[-1][:-5] # get date from file and append as column date stamp
    new['date_stamp'] = d
    if i == 0:
        master_list = new
    else:
        master_list = pd.concat([master_list, new])
    i += 1

# merge corresponding super list files

super_files = [file for file in files if 'SuperList' in file]

super_list = pd.DataFrame()
i = 0
for s in super_files:
    new = pd.read_excel(s)
    d = s.split('_')[-1][:-5] # get date from file and append as column date stamp
    new['date_stamp'] = d
    if i == 0:
        super_list = new
    else:
        super_list = pd.concat([super_list, new])
    i += 1
'''

"\n# merge corresponding master files\n\nmaster_files = [file for file in files if 'MasterList' in file]\n# to check sfr clusters against with B2R column (Louis's supervised learning observed answer)\nmaster_list = pd.DataFrame()\ni = 0\nfor master in master_files:\n    new = pd.read_excel(master)\n    d = master.split('_')[-1][:-5] # get date from file and append as column date stamp\n    new['date_stamp'] = d\n    if i == 0:\n        master_list = new\n    else:\n        master_list = pd.concat([master_list, new])\n    i += 1\n\n# merge corresponding super list files\n\nsuper_files = [file for file in files if 'SuperList' in file]\n\nsuper_list = pd.DataFrame()\ni = 0\nfor s in super_files:\n    new = pd.read_excel(s)\n    d = s.split('_')[-1][:-5] # get date from file and append as column date stamp\n    new['date_stamp'] = d\n    if i == 0:\n        super_list = new\n    else:\n        super_list = pd.concat([super_list, new])\n    i += 1\n"

In [8]:
sfr_file = 'SFR_GeoTable.xlsx'
sfr = pd.read_excel(sfr_file) # to cluster this raw data set

In [9]:
# save output here
data_dir = cur_dir + '\\DATA\\DBSCAN'

# import data from data file
os.chdir(data_dir)

# Clean

In [10]:
999999 in sfr.OwnerID.unique().tolist()

True

In [11]:
sfr.UID.unique().tolist()

[100081864,
 100081941,
 100056833,
 100081726,
 100066786,
 100082055,
 100081936,
 100081808,
 100081855,
 100081991,
 100081949,
 100057287,
 100081868,
 100066078,
 100057913,
 100062164,
 100066704,
 100059666,
 100066585,
 100064280,
 100059943,
 100062693,
 100057160,
 100063615,
 100058754,
 100059428,
 100066688,
 100060147,
 100056823,
 100055683,
 100055658,
 100057804,
 100057173,
 100063235,
 100064217,
 100059180,
 100059421,
 100062544,
 100060151,
 100056867,
 100059326,
 100058858,
 100063293,
 100066506,
 100059445,
 100059232,
 100057172,
 100051868,
 100057610,
 100066589,
 100059241,
 100066620,
 100066106,
 100063458,
 100066057,
 100055587,
 100055601,
 100059267,
 100064177,
 100063682,
 100063192,
 100063172,
 100059319,
 100056950,
 100058957,
 100058790,
 100063280,
 100057279,
 100066796,
 100057765,
 100066231,
 100063186,
 100066741,
 100059565,
 100066767,
 100063201,
 100054292,
 100064628,
 100050549,
 100055207,
 100055243,
 100050782,
 100065087,
 100

In [12]:
sfr.shape

(410803, 13)

In [13]:
# remove non-institutional homes with UID = 999999 
sfr = sfr.loc[sfr['OwnerID'] != 999999] 

In [14]:
sfr.shape

(317218, 13)

In [15]:
# clean superlist (indiv homes) from homes located outside of US territory
sfr_clean = sfr.loc[(sfr['Latitude'] <= 49.4) | (sfr['Latitude'] >= 24.5)] 
sfr_clean = sfr.loc[(sfr['Longitude'] <= -66.9) | (sfr['Longitude'] >= -125)] 

In [16]:
sfr_clean.shape

(317217, 13)

In [17]:
states = sfr_clean.State.unique()
states

array(['NV', 'CA', 'AZ', 'TX', 'CO', 'NC', 'WA', 'GA', 'MN', 'IL', 'TN',
       'FL', 'SC', 'IN', nan, 'OK', 'KY', 'OH', 'MD', 'MI', 'MS', 'MO',
       'UT', 'AL', 'ID', 'VA', 'PA', 'NJ', 'NY', 'KS', 'Fl', 'MA', 'NM',
       'AR', 'CT', 'WY', 'DE', 'WI', 'OR', 'LA', 'IA', 'DC', 'RI', 'NE',
       'ga', 'Ga', 'Tx', 'TZ', 'Va', 'Co', 'Mo', 'Ky', 'oh'], dtype=object)

In [18]:
sfr_clean['State'] = sfr_clean['State'].str.upper()

In [19]:
# fix state names
sfr_clean = sfr_clean.replace({'State': {'ARKANSAS  3809 GLITTERMAN DRIVE': 'AK'}})

In [20]:
bad_states = ['ENGLAND', 'W', 'M 638', 'IDF', 'TZ']

In [21]:
sfr_clean = sfr_clean[sfr_clean['State'].isin(bad_states) == False] 

In [22]:
states = sfr_clean.State.unique()
states

array(['NV', 'CA', 'AZ', 'TX', 'CO', 'NC', 'WA', 'GA', 'MN', 'IL', 'TN',
       'FL', 'SC', 'IN', nan, 'OK', 'KY', 'OH', 'MD', 'MI', 'MS', 'MO',
       'UT', 'AL', 'ID', 'VA', 'PA', 'NJ', 'NY', 'KS', 'MA', 'NM', 'AR',
       'CT', 'WY', 'DE', 'WI', 'OR', 'LA', 'IA', 'DC', 'RI', 'NE'],
      dtype=object)

In [23]:
sfr_clean.shape

(317216, 13)

In [24]:
sfr_clean['Latitude'].isna().values.any()


False

In [25]:
sfr_clean['Longitude'].isna().values.any()
# no missing lat/lon values --> don't drop rows

False

In [26]:
sfr_clean.duplicated(subset=['Latitude', 'Longitude']).values.any()
# it's okay to have duplicated values here

True

In [27]:
# geographic viz -- works
''' 
m = folium.Map(location=[sfr_clean.Latitude.mean(), sfr_clean.Longitude.mean()], zoom_start=9, 
               tiles='OpenStreet Map')
for _, row in sfr_clean.iterrows():
    folium.CircleMarker(
        location=[row.Latitude, row.Longitude],
        radius=5,
        popup=re.sub(r'[^a-zA-Z ]+', '', row.Name),
        color='#1787FE',
        fill=True,
        fill_colour='#1787FE'
    ).add_to(m)

m
'''

" \nm = folium.Map(location=[sfr_clean.Latitude.mean(), sfr_clean.Longitude.mean()], zoom_start=9, \n               tiles='OpenStreet Map')\nfor _, row in sfr_clean.iterrows():\n    folium.CircleMarker(\n        location=[row.Latitude, row.Longitude],\n        radius=5,\n        popup=re.sub(r'[^a-zA-Z ]+', '', row.Name),\n        color='#1787FE',\n        fill=True,\n        fill_colour='#1787FE'\n    ).add_to(m)\n\nm\n"

In [28]:
''' -- doesn't work
img_data = m._to_png(5)
img = Image.open(io.BytesIO(img_data))
img.save('mag.png')
'''

" -- doesn't work\nimg_data = m._to_png(5)\nimg = Image.open(io.BytesIO(img_data))\nimg.save('mag.png')\n"

# Cluster All

In [29]:
# cluster's color selections
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', 
        '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', 
        '#000075', '#808080', 
        '#7b194b', '#7b9df2', '#d62bb3', '#d6444f'
       ]*1000000

# fill in geographic map with made clusters
def create_map(this_df, cluster_column, cluster_owner):
    m = folium.Map(location=[this_df.Latitude.mean(), this_df.Longitude.mean()], zoom_start=9, tiles='OpenStreet Map')
    #print(this_df.shape[0])
    for index, row in this_df.iterrows():
        #print(index)
        #print(row[cluster_column])
        #print('--')
        if row[cluster_column] == -1:
            cluster_colour = '#000000'
        else:
            cluster_colour = cols[row[cluster_column]]
            folium.CircleMarker(
                        location= [row['Latitude'], row['Longitude']],
                        radius=5,
                        popup= row[cluster_owner],
                        color=cluster_colour,
                        fill=True,
                        fill_color=cluster_colour
            ).add_to(m)
        
    return m

In [30]:
states

array(['NV', 'CA', 'AZ', 'TX', 'CO', 'NC', 'WA', 'GA', 'MN', 'IL', 'TN',
       'FL', 'SC', 'IN', nan, 'OK', 'KY', 'OH', 'MD', 'MI', 'MS', 'MO',
       'UT', 'AL', 'ID', 'VA', 'PA', 'NJ', 'NY', 'KS', 'MA', 'NM', 'AR',
       'CT', 'WY', 'DE', 'WI', 'OR', 'LA', 'IA', 'DC', 'RI', 'NE'],
      dtype=object)

In [48]:
# save cluster id's in here
my_super_list = pd.DataFrame(columns = sfr_clean.columns)
my_master_list = pd.DataFrame(columns = ['State', 'OwnerID', 'CLUSTERS_DBSCAN', 'Num_Homes', 'Latitude', 'Longitude', 'City'])
#all_core_pts = pd.DataFrame(columns = sfr_clean.columns)
for state in states:
    print(state)
    df_st = sfr_clean.loc[sfr_clean['State'] == state]
    owner_ids = df_st['OwnerID'].unique()
    print(owner_ids)
    
    for owner_id in owner_ids:
        df_own = df_st.loc[df_st['OwnerID'] == owner_id]
        print(df_own.shape)
        lat_lon_df = df_own[['Latitude', 'Longitude']]

        # cluster
        # ----------------------------------

        # Parameters
        # The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
        # This is not a maximum bound on the distances of points within a cluster. 
        EPS = 0.001

        # The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. 
        # This includes the point itself.
        MIN_SAMPLES = 12

        #The power of the Minkowski metric to be used to calculate distance between points. 
        #If None, then p=2 (equivalent to the Euclidean distance).
        P = 4
        # ----------------------------------

        # cluster 
        model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, p=P).fit(lat_lon_df)
        class_predictions = model.labels_
        df_own['CLUSTERS_DBSCAN'] = class_predictions
        #df_own.head(2)
        '''
        # indices of core points
        #core_pts_indices = model.core_sample_indices_ 
        #this_core_pts = df_own.iloc[core_pts_indices]
        # drop duplicates (two core points may be chosen if they were overlapping)
        #this_core_pts = this_core_pts.drop_duplicates(['Latitude','Longitude'], keep='last')
        #print('num core pts: ' + str(this_core_pts.shape[0]))
        #all_core_pts = pd.concat([all_core_pts, this_core_pts])
        '''
        # ----------------------------------
        # analyze visually and mathematically
        #m = create_map(df_own, 'CLUSTERS_DBSCAN')
        num_clusts = len(np.unique(class_predictions))
        print(f'Number of clusters found: {num_clusts}')
        if num_clusts > 1:
            print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')
            #print(f'Silhouette ignoring outliers: {silhouette_score(lat_lon_df[class_predictions!=-1], class_predictions[class_predictions!=-1])}')
            #no_outliers = 0
            #no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
            #print(f'Silhouette outliers as singletons: {silhouette_score(lat_lon_df, no_outliers)}')

            # ----------------------------------
            # reformat for downloading
            '''
            this_clust = df_own.groupby(['State', 'OwnerID', 'CLUSTERS_DBSCAN']).agg({'Address':['count'],
                                                                         'Latitude':['mean'],
                                                                         'Longitude':['mean'],
                                                                         'City':['min'],
                                                                         #'Column':['count','sum','mean','std']
                                                                        })
            '''
            this_clust = df_own.groupby(['State', 'OwnerID', 'CLUSTERS_DBSCAN']).agg({'Address':['count'],
                                                                         'Latitude':['mean'],
                                                                         'Longitude':['mean']})
            cities = df_own.groupby(['State', 'OwnerID', 'CLUSTERS_DBSCAN'])['City'].first()
                                                                        
            this_clust.columns = this_clust.columns.droplevel(0)
            this_clust['City'] = cities
            this_clust.columns = ['Num_Homes', 'Latitude', 'Longitude', 'City']
            this_clust = this_clust.reset_index()
            this_clust = this_clust.drop(labels=0, axis=0)
            df_own = df_own.loc[df_own.CLUSTERS_DBSCAN != -1]

            # ----------------------------------
            # save results in super and master lists
            my_master_list = pd.concat([my_master_list, this_clust])
            my_super_list = pd.concat([my_super_list, df_own])
            
        print('-*-')

NV
[ 1  2  9  3  6 10 20  8]
(2204, 13)
num core pts: 2
Number of clusters found: 2
Number of outliers found: 2189
-*-
(928, 13)
num core pts: 0
Number of clusters found: 1
-*-
(3602, 13)
num core pts: 6
Number of clusters found: 4
Number of outliers found: 3566
-*-
(1749, 13)
num core pts: 634
Number of clusters found: 15
Number of outliers found: 1066
-*-
(1688, 13)
num core pts: 0
Number of clusters found: 1
-*-
(186, 13)
num core pts: 17
Number of clusters found: 3
Number of outliers found: 150
-*-
(341, 13)
num core pts: 0
Number of clusters found: 1
-*-
(264, 13)
num core pts: 0
Number of clusters found: 1
-*-
CA
[ 1  2 10  3 15  6]
(6119, 13)
num core pts: 0
Number of clusters found: 1
-*-
(509, 13)
num core pts: 0
Number of clusters found: 1
-*-
(1134, 13)
num core pts: 1
Number of clusters found: 2
Number of outliers found: 1122
-*-
(92, 13)
num core pts: 0
Number of clusters found: 1
-*-
(1, 13)
num core pts: 0
Number of clusters found: 1
-*-
(1, 13)
num core pts: 0
Number of

In [32]:
my_master_list

Unnamed: 0,State,OwnerID,CLUSTERS_DBSCAN,Num_Homes,Latitude,Longitude,City
1,NV,1,0,15,36.229434,-115.329017,Las Vegas
1,NV,9,0,12,36.277574,-115.140917,North Las Vegas
2,NV,9,1,12,36.273532,-115.127017,North Las Vegas
3,NV,9,2,12,36.312687,-115.303807,Las Vegas
1,NV,3,0,166,36.270673,-115.120495,Las Vegas
...,...,...,...,...,...,...,...
2,AR,17,1,13,36.046932,-94.180920,Fayetteville
1,OR,3,0,17,45.513564,-122.851695,Beaverton
1,LA,17,0,38,30.408540,-91.010535,Baton Rouge
2,LA,17,1,16,30.451468,-91.187147,Baton Rouge


In [33]:
my_super_list

Unnamed: 0,UID,Name,Address,City,State,Zip,Owner Name,OwnerID,Latitude,Longitude,BlockFIPS,FIPSCode,StateFips,CLUSTERS_DBSCAN
51,100066620,INVH - 10627 Gibbous Moon Dr,10627 Gibbous Moon Dr,Las Vegas,NV,89129.0,Invitation Homes,1,36.229510,-115.328410,3.200300e+14,32003.0,32.0,0.0
3156,100066625,INVH - 10631 Gibbous Moon Dr,10631 Gibbous Moon Dr,Las Vegas,NV,89129.0,Invitation Homes,1,36.229000,-115.329000,3.200300e+14,32003.0,32.0,0.0
5770,100066621,INVH - 10619 Gibbous Moon Dr,10619 Gibbous Moon Dr,Las Vegas,NV,89129.0,Invitation Homes,1,36.229510,-115.328410,3.200300e+14,32003.0,32.0,0.0
23747,100066630,INVH - 10639 W Gilmore Ave,10639 W Gilmore Ave,Las Vegas,NV,89129.0,Invitation Homes,1,36.229000,-115.329000,3.200300e+14,32003.0,32.0,0.0
27609,100066619,INVH - 10639 Gibbous Moon Dr,10639 Gibbous Moon Dr,Las Vegas,NV,89129.0,Invitation Homes,1,36.229000,-115.329000,3.200300e+14,32003.0,32.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264085,1704028378,Kairos - 6909 Holly Ct,6909 Holly Ct,Johnston,IA,50131.0,Kairos Living,17,41.686423,-93.762987,1.915301e+14,19153.0,19.0,0.0
315514,1704082751,Kairos - 6936 Holly Ct,6936 Holly Ct,Johnston,IA,50131.0,Kairos Living,17,41.686423,-93.762987,1.915301e+14,19153.0,19.0,0.0
317680,1704082748,Kairos - 6944 Holly Ct,6944 Holly Ct,Johnston,IA,50131.0,Kairos Living,17,41.686423,-93.762987,1.915301e+14,19153.0,19.0,0.0
342974,1703982907,Kairos - 6916 Holly Ct,6916 Holly Ct,Johnston,IA,50131.0,Kairos Living,17,41.686423,-93.762987,1.915301e+14,19153.0,19.0,0.0


In [34]:
my_super_list['Unique_Cluster'] = my_super_list.apply(lambda row: row['State'] + '_' + str(row['OwnerID']) + '_' + str(int(row['CLUSTERS_DBSCAN'])), axis=1)

In [35]:
# hash for coloring
import hashlib
#hash = hashlib.sha1(str.encode(s)).hexdigest() # you need to encode the strings into bytes here
num_digits = 6
my_super_list['hash'] = my_super_list.apply(lambda row: int(hashlib.sha256(row['Unique_Cluster'].encode('utf-8')).hexdigest(), 16) % 10**num_digits, axis = 1)

In [36]:
my_master_list

Unnamed: 0,State,OwnerID,CLUSTERS_DBSCAN,Num_Homes,Latitude,Longitude,City
1,NV,1,0,15,36.229434,-115.329017,Las Vegas
1,NV,9,0,12,36.277574,-115.140917,North Las Vegas
2,NV,9,1,12,36.273532,-115.127017,North Las Vegas
3,NV,9,2,12,36.312687,-115.303807,Las Vegas
1,NV,3,0,166,36.270673,-115.120495,Las Vegas
...,...,...,...,...,...,...,...
2,AR,17,1,13,36.046932,-94.180920,Fayetteville
1,OR,3,0,17,45.513564,-122.851695,Beaverton
1,LA,17,0,38,30.408540,-91.010535,Baton Rouge
2,LA,17,1,16,30.451468,-91.187147,Baton Rouge


In [37]:
my_super_list.head(1)

Unnamed: 0,UID,Name,Address,City,State,Zip,Owner Name,OwnerID,Latitude,Longitude,BlockFIPS,FIPSCode,StateFips,CLUSTERS_DBSCAN,Unique_Cluster,hash
51,100066620,INVH - 10627 Gibbous Moon Dr,10627 Gibbous Moon Dr,Las Vegas,NV,89129.0,Invitation Homes,1,36.22951,-115.32841,320030000000000.0,32003.0,32.0,0.0,NV_1_0,54793


In [38]:
my_master_list['Unique_Cluster'] = my_master_list.apply(lambda row: row['State'] + '_' + str(row['OwnerID']) + '_' + str(int(row['CLUSTERS_DBSCAN'])), axis=1)

In [39]:
my_super_list.to_csv('Super_List_Clusters.csv', header = True, index = False)

In [40]:
my_master_list.to_csv('Master_List_Clusters.csv', header = True, index = False)

In [None]:
create_map(my_super_list, 'hash', 'Owner Name')

# Ignore

In [None]:
stop

In [None]:
# scatter plot viz -- works but not useful
#X = np.array(sfr_clean[['Latitude', 'Longitude']], dtype='float64')
#plt.scatter(X[:,0], X[:,1], alpha=0.2, s=50)

In [None]:
#date_stamps = super_list.date_stamp.unique()
#this_date = super_list.loc[super_list['date_stamp'] == date_stamps[0]]

df_st = sfr_clean.loc[sfr_clean['State'] == 'TX']

# switch between 1 and 4 (INVH and Camillo, or no B2R and yes B2R)
CODE = 1 

df_own = df_st.loc[df_st['OwnerID'] == CODE]
df_own = df_own.reset_index()
lat_lon_df = df_own[['Latitude', 'Longitude']]

In [None]:
# cluster
#dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])
#new = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(dummy)])
# ----------------------------------

# Parameters
# The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
# This is not a maximum bound on the distances of points within a cluster. 
EPS = 0.001

# The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. 
# This includes the point itself.
MIN_SAMPLES = 6

#The power of the Minkowski metric to be used to calculate distance between points. 
#If None, then p=2 (equivalent to the Euclidean distance).
P = 4
# ----------------------------------

model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, p=P).fit(lat_lon_df)
class_predictions = model.labels_
df_own['CLUSTERS_DBSCAN'] = class_predictions
df_own.head(2)

In [None]:
m = create_map(df_own, 'CLUSTERS_DBSCAN')
print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')
print(f'Silhouette ignoring outliers: {silhouette_score(lat_lon_df[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

no_outliers = 0
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(lat_lon_df, no_outliers)}')

In [None]:
m

In [None]:
#date_stamps = super_list.date_stamp.unique()
#this_date = super_list.loc[super_list['date_stamp'] == date_stamps[0]]

df_st = sfr_clean.loc[sfr_clean['State'] == 'TX']

# switch between 1 and 4 (INVH and Camillo, or no B2R and yes B2R)
CODE = 1 

df_own = df_st.loc[df_st['OwnerID'] == CODE]
#df_own = df_own.reset_index()
lat_lon_df = df_own[['Latitude', 'Longitude']]

# cluster
#dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])
#new = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(dummy)])
# ----------------------------------

# Parameters
# The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
# This is not a maximum bound on the distances of points within a cluster. 
EPS = 0.001

# The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. 
# This includes the point itself.
MIN_SAMPLES = 7

#The power of the Minkowski metric to be used to calculate distance between points. 
#If None, then p=2 (equivalent to the Euclidean distance).
P = 4
# ----------------------------------

model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, p=P).fit(lat_lon_df)
class_predictions = model.labels_
df_own['CLUSTERS_DBSCAN'] = class_predictions
df_own.head(2)

'''
m = create_map(df_own, 'CLUSTERS_DBSCAN')
print(f'Number of clusters found: {len(np.unique(class_predictions))}')
print(f'Number of outliers found: {len(class_predictions[class_predictions==-1])}')
print(f'Silhouette ignoring outliers: {silhouette_score(lat_lon_df[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

no_outliers = 0
no_outliers = np.array([(counter+2)*x if x==-1 else x for counter, x in enumerate(class_predictions)])
print(f'Silhouette outliers as singletons: {silhouette_score(lat_lon_df, no_outliers)}')
'''

this_clust = df_own.groupby(['State', 'OwnerID', 'CLUSTERS_DBSCAN']).agg({'Address':['count'],
                                                             'Latitude':['mean'],
                                                             'Longitude':['mean'],
                                                             #'City':['mode'],
                                                             #'City':['count','sum','mean','std']
                                                            })
this_clust.columns = this_clust.columns.droplevel(0)
this_clust.columns = ['Num_Homes', 'Latitude', 'Longitude']
this_clust = this_clust.reset_index()
my_master_list = my_master_list.drop(labels=0, axis=0)
my_super_list = my_super_list.loc[my_super_list.CLUSTERS_DBSCAN != -1]

# save results in super and master lists
my_master_list = pd.concat([my_master_list, this_clust])
my_super_list = pd.concat([my_super_list, df_own])