In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
%matplotlib inline

In [2]:
# define the number of kilometers in one radian
kms_per_radian = 6371.0088

In [3]:
# load the data set
df = pd.read_csv('./local2.csv', encoding='utf-8')
df.head()

Unnamed: 0,business,lat,lon,state,pcode
0,6EvETd9FVPJfhT_6AW9iEw,35.352128,-80.851235,NC,28216
1,lHYMeXf8JH1Q8Dazn9s3Gg,35.295623,-80.754429,NC,28262
2,sY_hAfwWzgr0sxyMgMB5Jg,35.443015,-80.863925,NC,28078
3,PCNeANrp7puV3nE-HI2JrQ,35.059117,-80.811356,NC,28277
4,7cIRj82eLbyvNG43lemt1A,35.218163,-80.861911,NC,28203


In [4]:
# represent points consistently as (lat, lon)
coords = df.as_matrix(columns=['lat', 'lon'])

# define epsilon as 1.5 kilometers, converted to radians for use by haversine
epsilon = 1.5 / kms_per_radian

In [5]:
start_time = time.time()
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_

# get the number of clusters
num_clusters = len(set(cluster_labels))

# all done, print the outcome
message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression in {:,.2f} seconds'
print(message.format(len(df), num_clusters, 100*(1 - float(num_clusters) / len(df)), time.time()-start_time))
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(coords, cluster_labels)))

Clustered 10,649 points down to 127 clusters, for 98.8% compression in 1.72 seconds
Silhouette coefficient: -0.211


In [6]:
# turn the clusters in to a pandas series, where each element is a cluster of points
clusters = pd.Series([coords[cluster_labels==n] for n in range(num_clusters)])

In [7]:
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

centermost_points = clusters.map(get_centermost_point)

In [8]:
# unzip the list of centermost points (lat, lon) tuples into separate lat and lon lists
lats, lons = zip(*centermost_points)

# from these lats/lons create a new df of one representative point for each cluster
rep_points = pd.DataFrame({'lon':lons, 'lat':lats})
rep_points.tail()

Unnamed: 0,lat,lon
122,35.039863,-80.740318
123,34.967913,-80.820987
124,35.334807,-80.624391
125,35.465725,-80.804398
126,35.441389,-80.732734


In [9]:
# pull row from original data set where lat/lon match the lat/lon of each row of representative points
# that way we get the full details like city, country, and date from the original dataframe
rs = rep_points.apply(lambda row: df[(df['lat']==row['lat']) & (df['lon']==row['lon'])].iloc[0], axis=1)
rs.to_csv('./local2-temp.csv', encoding='utf-8')
rs.tail()

Unnamed: 0,business,lat,lon,state,pcode
122,Xrni9GAjKOrg4JNhE4ojIg,35.039863,-80.740318,NC,28104
123,dO-u3fNZOQZS13aE9FypoQ,34.967913,-80.820987,SC,29707
124,Lrvrjsb9ZzEtkpvqaPoUHw,35.334807,-80.624391,NC,28075
125,KjJQK6Pk2EsciZdt2-r1sQ,35.465725,-80.804398,NC,28036
126,4CeCU1e8uLw8t88BtMPeWg,35.441389,-80.732734,NC,28027
