In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import pdb

In [8]:
# Load the merged trip table and hold out 5% of the rows as the test set.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# this file must come from a trusted source. The absolute path is specific to
# one machine and has to be adjusted when the notebook is run elsewhere.
df_pickle = '/home/lashi/assets/clean-data/merged_141516.pickle'
df = pd.read_pickle(df_pickle).reset_index(drop=True)

# Fix the seed so the split (and every number derived from it further down)
# is the same on each Restart & Run All.
split_seed = 42
train_set, test_set = train_test_split(df, test_size=0.05, random_state=split_seed)

# train_test_split shuffles the rows; renumber both splits 0..n-1 because the
# clustering cell uses train_set index labels as positions into a numpy array.
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [9]:
# Report the number of rows in each split: training first, then test.
for split in (train_set, test_set):
    print(len(split))

2744654
144456


In [16]:
# Build the per-zone results table: one row per taxi zone from the lookup CSV,
# plus four initially-empty columns that the clustering cell fills in.
pickup_table = '/home/lashi/assets/taxi_zones/taxi_zone_lookup.csv'
taxi_zone_df = pd.read_csv(pickup_table)
kmeans_df = taxi_zone_df.drop(['service_zone'], axis=1)

# ClusterSize      - number of clusters created for the zone
# ClusterGlobalIDs - global cluster ids belonging to the zone (stored as a list)
# KmeansMSE        - K-means error reported for the zone
# Centroids        - centroid coordinates of the zone's clusters (stored as a list)
result_columns = ['ClusterSize', 'ClusterGlobalIDs', 'KmeansMSE', 'Centroids']
for column in result_columns:
    kmeans_df[column] = None

# DataFrame.astype returns a new frame, so the result has to be assigned back
# (the bare call was a no-op). Only the result columns are cast: they must be
# able to hold Python lists, while LocationID keeps its numeric dtype.
kmeans_df = kmeans_df.astype({column: 'object' for column in result_columns})

print(kmeans_df.head())

   LocationID        Borough                     Zone ClusterSize  \
0           1            EWR           Newark Airport        None   
1           2         Queens              Jamaica Bay        None   
2           3          Bronx  Allerton/Pelham Gardens        None   
3           4      Manhattan            Alphabet City        None   
4           5  Staten Island            Arden Heights        None   

  ClusterGlobalIDs KmeansMSE Centroids  
0             None      None      None  
1             None      None      None  
2             None      None      None  
3             None      None      None  
4             None      None      None  


In [18]:
# Run K-means on training set to get clusters
#
# For every pickup zone id, cluster the pickup coordinates of the training
# trips in that zone. Each cluster gets an id that is unique across all zones
# ("global cluster id"). Results are recorded in three places:
#   clusters_dict           global cluster id -> centroid
#   train_set['cluster_id'] global cluster id of every training row
#   kmeans_df               per-zone cluster count, error, global ids, centroids
clusters_dict = dict()
global_cluster_id = 0   # next unused global cluster id
num_clusters = 10       # clusters requested per zone
# Filled by position; relies on train_set having a 0..n-1 index.
cluster_id = np.zeros((len(train_set),))

for large_id in range(280):
    if large_id % 10 == 0:
        print("Finished clustering up to id:", large_id)
    df_of_id = train_set.loc[train_set['pickup_id'] == large_id]
    if df_of_id.empty:
        continue

    PU_merc_of_id = np.stack(df_of_id['pickup_merc_x_y'].values, axis=0)
    if len(PU_merc_of_id) >= num_clusters:
        # n_jobs is no longer passed: scikit-learn deprecated and then removed
        # that KMeans parameter, and the default works on every version.
        kmeans = KMeans(n_clusters=num_clusters).fit(PU_merc_of_id)
        cluster_centroids = kmeans.cluster_centers_
        labels = kmeans.labels_
        # inertia_ is a *sum* of squared distances, despite the column name.
        zone_error = kmeans.inertia_
    else:
        # Fewer points than requested clusters: every point is its own cluster.
        cluster_centroids = PU_merc_of_id
        labels = np.arange(len(cluster_centroids))
        zone_error = 0.0

    clusters_in_large_id = []
    cluster_centroids_in_large_id = []
    row_labels = df_of_id.index.values
    for k, cluster in enumerate(cluster_centroids):
        cluster_centroids_in_large_id.append(cluster)
        clusters_dict[global_cluster_id] = cluster
        # Tag every training row that K-means put into local cluster k.
        cluster_id[row_labels[labels == k]] = global_cluster_id
        clusters_in_large_id.append(global_cluster_id)
        global_cluster_id += 1

    # Locate the zone's row by LocationID for *all* four columns. The list
    # columns used to be written at index label `large_id - 1`, which only
    # matches when the table is ordered 1..N without gaps and would append a
    # new row for an id that is missing from the lookup table.
    zone_mask = kmeans_df.LocationID == large_id
    kmeans_df.loc[zone_mask, 'ClusterSize'] = len(cluster_centroids)
    kmeans_df.loc[zone_mask, 'KmeansMSE'] = zone_error
    for zone_index in kmeans_df.index[zone_mask]:
        # The lists are wrapped in an outer list on purpose: the later cells
        # read them back with `[0]`.
        kmeans_df.at[zone_index, 'ClusterGlobalIDs'] = [clusters_in_large_id]
        kmeans_df.at[zone_index, 'Centroids'] = [cluster_centroids_in_large_id]

train_set['cluster_id'] = cluster_id.astype('int')

Finished clustering up to id: 0
Finished clustering up to id: 10
Finished clustering up to id: 20
Finished clustering up to id: 30
Finished clustering up to id: 40
Finished clustering up to id: 50
Finished clustering up to id: 60
Finished clustering up to id: 70
Finished clustering up to id: 80
Finished clustering up to id: 90
Finished clustering up to id: 100
Finished clustering up to id: 110
Finished clustering up to id: 120
Finished clustering up to id: 130
Finished clustering up to id: 140
Finished clustering up to id: 150
Finished clustering up to id: 160
Finished clustering up to id: 170
Finished clustering up to id: 180
Finished clustering up to id: 190
Finished clustering up to id: 200
Finished clustering up to id: 210
Finished clustering up to id: 220
Finished clustering up to id: 230
Finished clustering up to id: 240
Finished clustering up to id: 250
Finished clustering up to id: 260
Finished clustering up to id: 270


In [36]:
# Empirical cluster distribution per pickup zone, measured on the training set.
# cluster_dist_dict maps pickup_id -> array of cluster probabilities, ordered
# by ascending cluster_id (groupby sorts its keys).
df_cluster_ids = train_set.groupby(['pickup_id','cluster_id']).size().reset_index()
df_pickup_ids = train_set.groupby(['pickup_id']).size().reset_index()
cluster_dist_dict = dict()

for pickup_id in df_pickup_ids['pickup_id'].values:
    # Rows of the (pickup_id, cluster_id) count table belonging to this zone;
    # column 0 holds the group sizes produced by size().reset_index().
    in_zone = df_cluster_ids['pickup_id'] == pickup_id
    id_set = df_cluster_ids.loc[in_zone].reset_index()
    num_clusters = len(id_set)
    cluster_dist = id_set[0].to_numpy(dtype=float)
    # Normalise the counts so they sum to one.
    cluster_dist = cluster_dist / cluster_dist.sum()
    cluster_dist_dict[pickup_id] = cluster_dist
    
# print(cluster_dist_dict)

In [45]:
# Distribute test_set data into different clusters based on distributions above
#
# Each test row gets a global cluster id drawn at random from the clusters of
# its pickup zone, weighted by the training-set distribution of that zone.
# Rows whose zone has no clusters (no training trips) get the sentinel -1.
# Previously None was written into a float array, which stores NaN, and the
# final cast to int turned that NaN into an undefined integer value.
unassigned_cluster_id = -1
loc_ids = test_set['pickup_id'].values
cluster_ids = np.full(loc_ids.size, unassigned_cluster_id, dtype=int)

for i, loc_id in enumerate(loc_ids):
    if i%10000 == 0:
        print(i)
    # kmeans_df is addressed by position: zone N is expected in row N-1.
    zone_global_ids = kmeans_df.iloc[int(loc_id-1)]['ClusterGlobalIDs']
    if zone_global_ids is None:
        continue
    # The stored value is a list wrapped in an outer list, hence the [0].
    cluster_ids[i] = np.random.choice(zone_global_ids[0], p=cluster_dist_dict[loc_id])

test_set['cluster_id'] = cluster_ids

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000


In [112]:
# Get distribution of classified test_set data (predicted distribution)
#
# cluster_dist_test_dict maps pickup_id -> array of probabilities, one entry
# per global cluster id of the zone, in the order stored in kmeans_df.
# Clusters that received no test rows keep probability 0. Zones without
# clusters map to np.zeros(1).
df_cluster_ids = test_set.groupby(['pickup_id','cluster_id']).size().reset_index()
df_pickup_ids = test_set.groupby(['pickup_id']).size().reset_index()
cluster_dist_test_dict = dict()

for pickup_id in df_pickup_ids['pickup_id'].values:
    # kmeans_df is addressed by position: zone N is expected in row N-1.
    zone_global_ids = kmeans_df.iloc[int(pickup_id-1)]['ClusterGlobalIDs']
    if zone_global_ids is None:
        cluster_dist_test_dict[pickup_id] = np.zeros(1)
        continue

    global_ids = zone_global_ids[0]   # stored as a list wrapped in a list
    cluster_dist = np.zeros(len(global_ids))
    id_set = df_cluster_ids.loc[df_cluster_ids['pickup_id']==pickup_id]
    # Column 0 holds the group sizes produced by size().reset_index().
    for curr_id, count in zip(id_set['cluster_id'], id_set[0]):
        cluster_dist[global_ids.index(curr_id)] = count
    cluster_dist_test_dict[pickup_id] = cluster_dist/np.sum(cluster_dist)

In [149]:
# Get truth cluster_ids from test_set data (based on K-means centroids)
#
# The "true" cluster of a test row is the cluster of its pickup zone whose
# centroid is closest (Euclidean distance) to the row's pickup coordinate.
# Rows whose zone has no centroids keep the sentinel below. Previously the
# sentinel was computed but never stored, so those rows were silently
# labelled with 0, which is a real global cluster id.
no_cluster_id = 1000000
coords = test_set['pickup_merc_x_y'].values
pu_ids = test_set['pickup_id'].values
# Kept as float, as before, so the resulting column dtype does not change.
true_cluster_id = np.full(len(test_set), no_cluster_id, dtype=float)

for i, (coord, pu_id) in enumerate(zip(coords, pu_ids)):
    if i%5000 == 0:
        print(i,pu_id)
    # kmeans_df is addressed by position: zone N is expected in row N-1.
    zone_row = kmeans_df.iloc[int(pu_id-1)]
    if zone_row.Centroids is None:
        print('None:',i,pu_id)
        continue

    # Both columns hold a list wrapped in an outer list, hence the [0].
    centroids = np.asarray(zone_row.Centroids[0])
    distances = np.linalg.norm(centroids - np.asarray(coord), axis=1)
    # argmin returns the first minimum, the same tie-break as a strict `<`
    # scan, and never leaves a stale id from a previous row behind.
    true_cluster_id[i] = zone_row.ClusterGlobalIDs[0][int(np.argmin(distances))]

test_set['true_cluster_id'] = true_cluster_id
    

# print(kmeans_df.iloc[50-1].ClusterGlobalIDs)
# print(kmeans_df.iloc[50-1].Centroids[0][0])
# print(abs(kmeans_df.iloc[50-1].Centroids[0][0] - test_set.iloc[0].pickup_merc_x_y))
# print(np.linalg.norm(abs(kmeans_df.iloc[50-1].Centroids[0][0] - test_set.iloc[0].pickup_merc_x_y)))

0 50.0
5000 79.0
10000 48.0
15000 142.0
20000 129.0
25000 79.0
30000 138.0
35000 239.0
40000 263.0
45000 161.0
50000 246.0
55000 79.0
60000 48.0
65000 239.0
70000 107.0
75000 90.0
None: 78364 44.0
80000 229.0
85000 234.0
90000 234.0
95000 114.0
100000 262.0
105000 79.0
110000 143.0
115000 97.0
120000 143.0
125000 229.0
130000 68.0
135000 75.0
140000 234.0


In [152]:
# Get distribution of true test_set data (true distribution)
#
# cluster_dist_true_dict maps pickup_id -> array of probabilities, one entry
# per global cluster id of the zone, in the order stored in kmeans_df.
# Clusters that received no test rows keep probability 0. Zones without
# clusters map to np.zeros(1).
df_cluster_ids = test_set.groupby(['pickup_id','true_cluster_id']).size().reset_index()
df_pickup_ids = test_set.groupby(['pickup_id']).size().reset_index()
cluster_dist_true_dict = dict()

for pickup_id in df_pickup_ids['pickup_id'].values:
    # kmeans_df is addressed by position: zone N is expected in row N-1.
    zone_global_ids = kmeans_df.iloc[int(pickup_id-1)]['ClusterGlobalIDs']
    if zone_global_ids is None:
        # This entry belongs in the *true* dict; it used to be written to
        # cluster_dist_test_dict, leaving the zone missing from this one.
        cluster_dist_true_dict[pickup_id] = np.zeros(1)
        continue

    global_ids = zone_global_ids[0]   # stored as a list wrapped in a list
    cluster_dist = np.zeros(len(global_ids))
    id_set = df_cluster_ids.loc[df_cluster_ids['pickup_id']==pickup_id]
    # Column 0 holds the group sizes produced by size().reset_index().
    for curr_id, count in zip(id_set['true_cluster_id'], id_set[0]):
        cluster_dist[global_ids.index(curr_id)] = count
    cluster_dist_true_dict[pickup_id] = cluster_dist/np.sum(cluster_dist)

In [170]:
# Fraction of test rows whose sampled cluster_id equals the nearest-centroid
# true_cluster_id. Prints the hit count, the row count and their ratio.
is_match = test_set.cluster_id.values == test_set.true_cluster_id.values
num_correct = int(np.count_nonzero(is_match))

print(num_correct)
print(len(test_set))
print(num_correct/len(test_set))

16852
144456
0.1166583596389212
