In [None]:
# get all data
# convert time to dates
# get unique users
# get unique dates
# iterate over each user and day
# dbscan to id events
# get user's name from worker fl
# get center, start time, stop time, duration of each event
# missing times
# return to same site on same day
# most traveled roads
# analyze results
# reduce smaller clusters to single points
## snap to roads
## create route
## how much time, distance?
## most efficient route

In [None]:
from arcgis.gis import GIS
import pandas as pd
import numpy as np
import json
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
%matplotlib notebook
import pytz
import getpass

In [None]:
# gis object: the connection every later cell uses to reach hosted items
# The password is read interactively with getpass, so it is not stored in the notebook.
# NOTE(review): the username is hard-coded and no portal URL is passed — presumably the
# library's default portal is intended; confirm.
password = getpass.getpass()
gis = GIS(username = 'jason_cnx',
          password = password)

In [None]:
# Item ids of the two hosted items this notebook works with:
# the raw track points and the layer that receives clustering results.
tracks_id = "619be0d131594c579132caf247802c15"
cluster_id = "0af1d86ee5bf4bc3ac38849c3119a7d8"

# look up both items by id
tracks_item, cluster_item = (
    gis.content.get(item_id) for item_id in (tracks_id, cluster_id)
)

# work with the first layer of each item
tracks_layer, cluster_layer = (
    item.layers[0] for item in (tracks_item, cluster_item)
)

In [None]:
# query the tracks layer for every feature whose Creator matches, returning all fields
# NOTE(review): the Creator value is hard-coded — presumably one specific worker; confirm.
track_features = tracks_layer.query(where="Creator = 'lss0wos_consol'", out_fields="*")
# optional date filter that can be appended to the where clause above:
# And CreationDate > '2018-10-12 00:00:00' And CreationDate < '2018-10-13 00:00:00'

In [None]:
# convert query featureset to spatially enabled dataframe
# requires arcgis v1.5 or later
# df holds the query result as returned; later cells add columns to it
df = track_features.sdf

In [None]:
# set globalid as index
# this will allow update of cluster field in main dataframe
# drop=False keeps GlobalID available as a regular column as well as the index
df.set_index('GlobalID', inplace=True, drop=False)

In [None]:
# add cluster column, with default value
# NOTE(review): -2 is presumably a "not yet clustered" marker, chosen so it cannot be
# confused with DBSCAN's own labels — confirm.
df["cluster"] = -2

In [None]:
# df2 is the working frame that the rest of the notebook transforms. No visible cell
# ever created it, so a fresh kernel stopped here with a NameError. Start it as a copy
# of df so the frame returned by the query keeps its original values.
df2 = df.copy()

# convert CreationDate to datetime (the values are interpreted as milliseconds)
df2['CreationDate'] = pd.to_datetime(df2['CreationDate'], unit='ms')

In [None]:
# Timezone handling: capture times are treated as naive UTC and shifted to US/Eastern.
# TODO: confirm this matches what is on AGO
eastern = pytz.timezone('US/Eastern')
utc = pytz.utc


def convert_timezone(date):
    """Attach UTC to a naive datetime and return it expressed in US/Eastern."""
    as_utc = utc.localize(date)
    return as_utc.astimezone(eastern)


def convert_date(date):
    """Return only the calendar-date part of a datetime."""
    return date.date()


# timezone-aware capture time first, then the calendar day derived from it
capture_times = df2['CreationDate']
df2['dt_tz'] = capture_times.apply(convert_timezone)
df2['date_tz'] = df2['dt_tz'].apply(convert_date)

# quick checks while developing:
# df2.head()
# df2.groupby("date_tz")["date_tz"].count()

In [None]:
def get_x(shape):
    """Return the 'x' entry of a geometry."""
    return shape['x']


def get_y(shape):
    """Return the 'y' entry of a geometry."""
    return shape['y']


# copy the coordinates out of SHAPE into plain x and y columns
for axis_name, extractor in (('x', get_x), ('y', get_y)):
    df2[axis_name] = df2['SHAPE'].apply(extractor)
# df2.head()

In [None]:
# preview the first rows of df
df.head()

In [None]:
# get unique dates and workers
# np.unique returns the distinct values in sorted order
dates = np.unique(df2['date_tz'])
workers = np.unique(df2['Creator'])

In [None]:
# DBSCAN parameter grid: every eps is combined with every min_sample value.
# NOTE(review): the trailing note says "latitude/longitude", but the eps values equal
# 100-400 ft (in 50 ft steps) expressed in metres — presumably the coordinates are in a
# metre-based projected system; confirm.
eps = [30.48, 45.72, 60.96, 76.2, 91.44, 106.68, 121.92]   # unit: latitude/longitude 100-400
min_sample = [5, 8, 12, 15, 18, 21]
# grid dimensions, used as loop bounds
n1, n2 = len(eps), len(min_sample)

In [None]:
# get layers needed
# NOTE: this rebinds cluster_id / cluster_item / cluster_layer, replacing the values
# assigned in the earlier layer-lookup cell with a different item id.
cluster_id = "56923a29a9b443af9bbfb45c8702c075"

# get cluster layer, for results
cluster_item = gis.content.get(cluster_id)
cluster_layer = cluster_item.layers[0]

In [None]:
# Grid search: run DBSCAN once per (eps, min_samples) pair and add the labelled points
# to the cluster layer, tagged with the parameters that produced them.

# coordinate matrix, columns ordered (y, x)
X = df2[['y', 'x']].values

for i in range(n1):
    for j in range(n2):
        db = DBSCAN(eps=eps[i], min_samples=min_sample[j], metric="euclidean").fit(X)

        # count real clusters only: DBSCAN marks noise with the label -1, and that label
        # is absent when every point is clustered, so "number of labels - 1" undercounts
        clusters = np.unique(db.labels_)
        cluster_count = int(np.count_nonzero(clusters != -1))
        print(cluster_count)

        # add cluster info to df2
        df2['cluster'] = db.labels_.tolist()

        # one feature per point, carrying the parameters used for this run
        features = [
            {"attributes":
                 {"username": row["Creator"],
                  "capture_time": row["CreationDate"],
                  "original_globalid": row["GlobalID"],
                  "cluster": row["cluster"],
                  "eps": eps[i],
                  "min_samples": min_sample[j]},
             "geometry": row['SHAPE']}
            for index, row in df2.iterrows()
        ]

        add_result = cluster_layer.edit_features(adds = features)
        print(add_result)

In [None]:
# add cluster info to df2 (labels of the most recent fit held in db)
df2['cluster'] = db.labels_.tolist()

In [None]:
# Pick the columns destined for the cluster layer and give them the layer's field names.
# NOTE(review): 'eps' and 'min_samples' are only added to df2 in a later cell of the
# visible notebook — confirm the intended run order.
column_names = {
    'Creator': 'username',
    'CreationDate': 'capture_time',
    'GlobalID': 'original_globalid',
    'SHAPE': 'SHAPE',
    'cluster': 'cluster',
    'dt_tz': 'capture_time_tz',
    'date_tz': 'capture_date_tz',
    'eps': 'eps',
    'min_samples': 'min_samples',
}
df3 = df2[list(column_names.keys())]
df3.columns = list(column_names.values())
df3.head()

In [None]:
# Build the payload for the cluster layer: one dict per row of df3 holding the
# attributes to store and the row's geometry.
# df3 columns: username, capture_time, original_globalid, SHAPE, cluster,
#              capture_time_tz, capture_date_tz, eps, min_samples
attribute_fields = ("username", "capture_time", "original_globalid",
                    "cluster", "eps", "min_samples")

features = [
    {"attributes": {field: row[field] for field in attribute_fields},
     "geometry": row['SHAPE']}
    for _, row in df3.iterrows()
]

features

In [None]:
# submit the prepared features to the cluster layer as additions and show the response
add_result = cluster_layer.edit_features(adds = features)
add_result

In [None]:
# inspect the column types of df2
df2.dtypes

In [None]:
# replace df2's index with a default integer index; drop=True discards the old index values
df2.reset_index(drop=True, inplace=True)
df2.head()

In [None]:
# add fields for eps and min_samples
df2['eps'] = 0.0
df2['min_samples'] = 0
df2.head()  # not the cell's last expression, so this preview is not displayed
# NOTE(review): absolute, user-specific path; presumably `location` should name the output
# feature class rather than a folder — confirm against the arcgis documentation.
df2.spatial.to_featureclass(location=r"/Users/jasonbartley/Development/python/cnx")

In [None]:
# List the argument names accepted by to_featureclass.
# inspect.getargspec is deprecated and was removed in Python 3.11; getfullargspec is the
# replacement, and element 0 of its result is the same list of argument names.
import inspect
inspect.getfullargspec(df2.spatial.to_featureclass)[0]

In [None]:
# draw df2 on a map widget, styled by the cluster column
# NOTE(review): m1 is not defined in any visible cell — presumably a map widget created
# elsewhere (e.g. gis.map()); confirm, otherwise this raises NameError.
# NOTE(review): renderer_type='u' presumably selects a unique-value renderer — confirm.
df2.spatial.plot(map_widget= m1, renderer_type='u', col='cluster')

In [None]:
# Publish df as a feature layer titled "clustering_results".
# A spatially enabled DataFrame exposes its GIS methods through the .spatial accessor
# (as the df2.spatial.to_featureclass call in this notebook does); calling
# to_featurelayer directly on the DataFrame raises AttributeError.
df.spatial.to_featurelayer("clustering_results", gis)

In [None]:
# plot x, y
# points are coloured by their value in the cluster column
plt.scatter(df2['x'], df2['y'], c=df2['cluster'])
plt.show()

In [None]:
# inspect the column types of df
df.dtypes

In [None]:
# inspect the column types of df2, for comparison with df
df2.dtypes

In [None]:
# keep only the cluster column
# note: df3 is rebound here to a single column (a Series), no longer the renamed frame
df3 = df2['cluster']
df3.head()

In [None]:
# copy the cluster values held in df3 into df; update() matches rows by index label
# NOTE(review): df is indexed by GlobalID, while df2 (the source of df3) had its index
# reset to integers in an earlier cell, so the labels may not line up — confirm.
df.update(df3)
df.head()

In [None]:
# number of rows in df for each cluster value
df.groupby('cluster')['cluster'].count()

In [None]:
# Cluster each worker's points one day at a time.
# The labels are only printed for inspection; nothing is written back to df2 yet.
count = 0
for worker in workers:
    for date in dates:
        print(date)
        # .copy() so the cluster column added below goes into an independent frame
        # instead of a slice of df2 (which triggers SettingWithCopyWarning)
        df3 = df2.loc[(df2['Creator'] == worker) & (df2['date_tz'] == date)].copy()

        if df3.empty:
            # this worker has no points on this date; DBSCAN cannot fit an empty array
            del df3
            continue

        # get arrays of y, x
        X = df3[['y', 'x']].values
        db = DBSCAN(eps=91.44, min_samples=15, metric="euclidean").fit(X)

        # count real clusters only: the noise label -1 is not always present,
        # so "number of labels - 1" would undercount
        clusters = np.unique(db.labels_)
        cluster_count = int(np.count_nonzero(clusters != -1))
        count += len(db.labels_)
        print(cluster_count)

        # add cluster info to df3
        df3['cluster'] = db.labels_.tolist()
        print(df3.head())

        del df3
        # TODO: get back into df2

# total number of points that were clustered across all worker/date subsets
print(count)

In [None]:
# plot x, y
# all points of df2, without cluster colouring
plt.scatter(df2['x'], df2['y'])
plt.show()

In [None]:
# get arrays of y, x
# NOTE(review): the visible cells add the x and y columns to df2 only, so df may not
# have them — confirm which frame was intended here.
X = df[['y', 'x']].values
db = DBSCAN(eps=91.44, min_samples=15, metric="euclidean").fit(X)
# distinct labels assigned by this fit
np.unique(db.labels_)

In [None]:
# add labels as new column in df
# (overwrites the cluster column that was given a default value earlier)
df['cluster'] = db.labels_.tolist()
df.head()

In [None]:
# plot x, y
# points of df coloured by their value in the cluster column
plt.scatter(df['x'], df['y'], c=df['cluster'])
plt.show()


In [None]:
# Earliest and latest CreationDate found in each cluster.
creation_by_cluster = df.groupby('cluster')['CreationDate']
cluster_min_dt = creation_by_cluster.min()
cluster_max_dt = creation_by_cluster.max()
cluster_max_dt

In [None]:

# Figure with one titled panel per (eps, min_samples) combination.
# NOTE(review): the eps values equal 100-400 ft expressed in metres — presumably the
# coordinates are in a metre-based projected system; confirm.
eps = [30.48, 45.72, 60.96, 76.2, 91.44, 106.68, 121.92]
min_sample = [5, 8, 12, 15, 18, 21]
n1, n2 = len(eps), len(min_sample)

# Keep the axes that subplots() creates instead of re-requesting each one with
# plt.subplot(); squeeze=False guarantees a 2-D array even for a 1xN or Nx1 grid.
fig, axes = plt.subplots(nrows=n1, ncols=n2, figsize=(20, 15), squeeze=False)

# A stray ax.scatter(lons, lats, ...) call used to sit here. None of ax, lons, lats or
# color is defined at this point, so the cell stopped with a NameError; it was removed.
for i in range(n1):
    for j in range(n2):
        est = DBSCAN(eps=eps[i], min_samples=min_sample[j], metric="euclidean").fit(X)
        # labels of the current run; overwritten on every iteration
        df2['oid'] = est.labels_.tolist()

        ax = axes[i][j]
        ax.set_title("DBSCAN ('euclidean', eps={}, min_sample={})".format(eps[i], min_sample[j]))

        # plot_stations_map(ax, works_data)

In [None]:
# Show the estimator left over from the last fit.
# The original `est[]` was a syntax error: an empty subscript is not valid Python.
est

In [None]:
# the bare X on the next line is not the cell's last expression, so only len(X) is displayed
X
len(X)

In [None]:
# single DBSCAN run on X with fixed parameters, kept in est1 for closer inspection
est1 = DBSCAN(eps=91.44, min_samples=15, metric="euclidean").fit(X)
# distinct labels assigned by this fit
np.unique(est1.labels_)

In [None]:
# inspect the core_sample_indices_ attribute of the fitted estimator
est1.core_sample_indices_

In [None]:
# number of entries in est1.components_
len(est1.components_)

In [None]:
# Split est1.components_ into separate coordinate lists.
# Every X in this notebook is built with columns (y, x): position 0 is y, position 1 is x.
core_points = est1.components_
y_list = [point[0] for point in core_points]
x_list = [point[1] for point in core_points]
len(x_list)

In [None]:
# scatter of the coordinates collected in x_list / y_list
plt.scatter(x_list, y_list)
plt.show()

In [None]:
# Export df2 to the feature class df_test3 inside a file geodatabase.
# Goes through the .spatial accessor, like the other to_featureclass call in this
# notebook; the accessor method takes the complete output path as `location`, so the
# geodatabase path and the feature class name are combined into one string.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df2.spatial.to_featureclass(location=r"C:\Users\jaso9356\Desktop\dev\py\cnx\test.gdb\df_test3")

In [None]:
# Stand-alone example: read a layer from a known item and export it to a shapefile.

# Log into ArcGIS anonymously
g = GIS()
# Retrieve an item from ArcGIS Online from a known ID value
known_item = g.content.get("85d0ca4ea1ca4b9abf0c51b9bd34de2e")
known_item

# Obtain the first feature layer from the item
fl = known_item.layers[0]

# Build a spatially enabled DataFrame from the layer through the .spatial accessor.
# The original called SpatialDataFrame.from_layer, but SpatialDataFrame is never
# imported in this notebook, so the cell stopped with a NameError.
sdf = pd.DataFrame.spatial.from_layer(fl)

# Return the first 5 records.
sdf.head()

# Accessor form of the export: the output folder and file name are given as one path.
# The path is a placeholder and must be replaced with a real location.
sdf.spatial.to_featureclass(location=r"path\to\your\data\output_example\output_cities.shp")

In [None]:
def plot_stations_map(ax, stns):
    """Draw a basemap around the points in ``stns`` and scatter them, one colour per cluster.

    Parameters
    ----------
    ax : matplotlib axes
        Axes that receives the basemap and the scatter points.
    stns : DataFrame
        Must provide the columns 'x' (used as longitude), 'y' (used as latitude)
        and 'cluster'. Rows whose cluster is -1 are not drawn.

    Returns
    -------
    None

    NOTE(review): Basemap is not imported in any visible cell of this notebook; it has to
    be in scope (presumably ``from mpl_toolkits.basemap import Basemap``) before calling.
    """
    if len(stns) == 0:
        # nothing to draw; min()/max() below would raise on an empty sequence
        return

    # determine range to print based on min, max lat and lon of the data
    lat = list(stns['y'])
    lon = list(stns['x'])
    margin = 0.01  # buffer to add to the range
    lat_min = min(lat) - margin
    lat_max = max(lat) + margin
    lon_min = min(lon) - margin
    lon_max = max(lon) + margin

    # create map using BASEMAP
    # lat_0 / lon_0 are the centre of the map, i.e. the midpoint of the extent
    # (the original computed half the extent's size, which is not a location inside it);
    # ax=ax draws the map on the same axes the points are scattered on
    m = Basemap(llcrnrlon=lon_min,
                llcrnrlat=lat_min,
                urcrnrlon=lon_max,
                urcrnrlat=lat_max,
                lat_0=(lat_max + lat_min) / 2,
                lon_0=(lon_max + lon_min) / 2,
                projection='lcc',
                resolution='f',
                ax=ax)

    m.drawcoastlines()
    m.fillcontinents(lake_color='aqua')
    m.drawmapboundary(fill_color='aqua')
    m.drawrivers()

    # plot points, skipping the -1 label
    clist = [label for label in stns['cluster'].unique() if label != -1]
    k = len(clist)
    # plt.cm is used because `cm` itself is never imported in this notebook
    colors = plt.cm.Set1(np.linspace(0, 1, max(10, k)))
    for label, color in zip(clist, colors):
        cluster_points = stns.loc[stns['cluster'] == label]

        # convert lat and lon to map projection coordinates; reads the same 'x'/'y'
        # columns as the extent above (the original read 'station longitude' /
        # 'station latitude', which the extent code does not use)
        lons, lats = m(list(cluster_points['x']), list(cluster_points['y']))
        ax.scatter(lons, lats, marker='o', color=color, edgecolor='gray', zorder=5, alpha=1.0, s=15)