In [114]:
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift
from sklearn.model_selection import train_test_split

In [115]:
with open('checkins.dat') as input_file:        
    newLines = []
    for line in input_file:
        newLine = [x.strip() for x in line.split('|')]
        if len(newLine) == 6 and newLine[3] and newLine[4]:
            newLines.append(newLine)

In [116]:
labels = newLines[0]

In [117]:
df = pd.DataFrame(newLines[1:], columns=newLines[0])

In [118]:
df.shape

(396634, 6)

In [119]:
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984222,15824,5222,38.8951118,-77.0363658,2012-04-21 17:43:47
1,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
2,984291,105054,5222,45.5234515,-122.6762071,2012-04-21 17:39:22
3,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
4,984232,93870,380645,33.4483771,-112.0740373,2012-04-21 17:38:18


In [120]:
df100 = df[['latitude', 'longitude']].iloc[:100000, :]

In [121]:
ms = MeanShift(bandwidth=0.1, min_bin_freq=15, n_jobs=-1)

In [122]:
%%time
ms.fit(df100)

CPU times: user 12.1 s, sys: 381 ms, total: 12.5 s
Wall time: 49.4 s


MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=15,
     n_jobs=-1, seeds=None)

In [123]:
ms.cluster_centers_.shape

(3230, 2)

In [124]:
offices = np.array(([33.751277, -118.188740],
                    [25.867736, -80.324116],
                    [51.503016, -0.075479],
                    [52.378894, 4.885084],
                    [39.366487, 117.036146],
                    [-33.868457, 151.205134]
                   ))

In [125]:
cities = ['Los Angeles', 'Miami', 'London', 'Amsterdam', 'Beijing', 'Sydney']

In [126]:
offices = pd.DataFrame(offices, columns = ['latitude', 'longitude'])

In [127]:
offices['city'] = cities

In [128]:
offices

Unnamed: 0,latitude,longitude,city
0,33.751277,-118.18874,Los Angeles
1,25.867736,-80.324116,Miami
2,51.503016,-0.075479,London
3,52.378894,4.885084,Amsterdam
4,39.366487,117.036146,Beijing
5,-33.868457,151.205134,Sydney


In [129]:
clusters = ms.cluster_centers_

In [140]:
dist_to_office = np.sum((clusters - offices[offices.city == 'London'][['latitude', 'longitude']].values)**2, axis=1)
dist_to_office = np.sqrt(dist_to_office)
idx = dist_to_office.argsort()[:20]
list(zip(clusters[idx]))

[(array([ 51.50299126,  -0.12553729]),),
 (array([ 51.42676329,  -0.30373207]),),
 (array([ 51.5741517,   0.1838708]),),
 (array([ 51.50647877,  -0.36517727]),),
 (array([ 51.75153 ,  -0.333892]),),
 (array([ 51.2329101,  -0.3297445]),),
 (array([ 51.1133388,  -0.1829137]),),
 (array([ 51.4934557,   0.3529197]),),
 (array([ 51.4307509,  -0.5481532]),),
 (array([ 51.386522 ,   0.5443375]),),
 (array([ 51.522751,  -0.720209]),),
 (array([ 51.6664235,  -0.7286419]),),
 (array([ 50.819522,  -0.13642 ]),),
 (array([ 51.2940006,  -0.754624 ]),),
 (array([ 52.2025441,   0.1312368]),),
 (array([ 51.8157917,  -0.8166621]),),
 (array([ 52.038601,  -0.757072]),),
 (array([ 51.455041 ,  -0.9690884]),),
 (array([ 50.785996,  -0.675879]),),
 (array([ 51.7522792,  -1.2558838]),)]

In [153]:
dist_to_office = np.zeros(clusters.shape)
distances = pd.DataFrame(columns = cities)
locations = pd.DataFrame(columns = cities)

for city in cities:
    dist_to_office = np.sum((clusters - offices[offices.city == city][['latitude', 'longitude']].values)**2, axis=1)
    dist_to_office = np.sqrt(dist_to_office)
    distances[city] = np.sort(dist_to_office)[:20]
    idx = dist_to_office.argsort()[:20]
    locations[city] = list(zip(clusters[idx]))

In [154]:
distances

Unnamed: 0,Los Angeles,Miami,London,Amsterdam,Beijing,Sydney
0,0.070848,0.022674,0.050058,0.009353,0.303308,0.007835
1,0.195779,0.134109,0.240653,0.251502,0.827318,0.154103
2,0.211811,0.167406,0.268929,0.27532,1.028142,0.192227
3,0.222233,0.188876,0.289719,0.298373,9.188567,0.906813
4,0.294979,0.271301,0.35852,0.371112,9.267575,1.090489
5,0.304731,0.30227,0.370956,0.413724,9.327714,2.530161
6,0.314884,0.340846,0.404216,0.418038,9.855948,3.923192
7,0.33881,0.4275,0.428505,0.475003,9.932652,5.019206
8,0.378688,0.540471,0.478166,0.550234,10.103449,6.182523
9,0.386706,0.671049,0.630669,0.607774,10.211337,6.417077


In [155]:
locations

Unnamed: 0,Los Angeles,Miami,London,Amsterdam,Beijing,Sydney
0,"([33.8098779553, -118.148923807],)","([25.8456722643, -80.3188905964],)","([51.5029912609, -0.12553728871],)","([52.3729639903, 4.89231722258],)","([39.121079, 117.214389],)","([-33.8606304286, 151.204775929],)"
1,"([33.8883253428, -118.048928172],)","([25.78581242, -80.2179380368],)","([51.4267632889, -0.303732066667],)","([52.388501, 4.63376546667],)","([39.904214, 116.407413],)","([-34.00190615, 151.12806905],)"
2,"([33.8729860116, -118.362091147],)","([25.7053497211, -80.2834287382],)","([51.5741517, 0.1838708],)","([52.2644, 4.6347],)","([40.376322, 116.842992],)","([-33.9522629, 151.0321372],)"
3,"([33.9725748214, -118.168370667],)","([26.0100982493, -80.1999905857],)","([51.5064787667, -0.365177266667],)","([52.2738177, 5.1643425],)","([30.593087, 114.305357],)","([-33.7149546, 150.3114074],)"
4,"([33.983935874, -118.007404973],)","([26.1388437868, -80.3343468368],)","([51.75153, -0.333892],)","([52.080058, 5.10513275],)","([31.230393, 121.473704],)","([-32.8970039, 151.7005582],)"
5,"([33.8173064339, -117.891249171],)","([26.1208626586, -80.1589066802],)","([51.2329101, -0.3297445],)","([51.974932, 4.9744262],)","([31.112842, 121.381672],)","([-35.3080556, 149.1244444],)"
6,"([34.0603975546, -118.248709027],)","([26.200584641, -80.2507161256],)","([51.1133388, -0.1829137],)","([52.787747, 4.797934],)","([37.4562557, 126.7052062],)","([-36.506376, 148.301203],)"
7,"([33.6743026598, -117.858789268],)","([25.4687224, -80.4775569],)","([51.4934557, 0.3529197],)","([52.132633, 5.291266],)","([37.5588652, 126.8029305],)","([-31.2532183, 146.921099],)"
8,"([34.0354869531, -118.438997719],)","([26.3525711663, -80.0852787174],)","([51.4307509, -0.5481532],)","([52.156651, 5.388438],)","([37.566535, 126.9779692],)","([-28.0673655, 153.343106667],)"
9,"([34.1314601492, -118.118011806],)","([26.4888424714, -80.0700818714],)","([51.386522, 0.5443375],)","([51.924216, 4.481776],)","([37.2635727, 127.0286009],)","([-27.7866521, 153.252212],)"


In [163]:
ans = str(locations.loc[0, 'Sydney'][0][0])+" "+str(locations.loc[0, 'Sydney'][0][1])
print(ans)

-33.8606304286 151.204775929


In [165]:
with open('ans.txt', 'w') as file:
    file.write(ans)