# Predicting Check-ins of Foursquare Users in Tokyo

## 2 - Data Wrangling, Google API

In [1]:
import json
import os
import time
import urllib.parse
import urllib.request

import pandas as pd

In [2]:
# Load the wrangled check-in table. The first CSV column becomes the index
# and is parsed as datetimes (it shows up as `timestamp` in df.head()).
# `date_parser` / `infer_datetime_format` are no longer passed: both are
# deprecated since pandas 2.0, and the default parser already handles
# ISO-style timestamps such as "2012-04-04 03:17:18".
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0])

In [3]:
# Google Places `type` values that are searched around every grid point.
nearby_types = ['train_station', 'subway_station']

# Take the key from the environment so a real credential never has to be
# saved in the notebook; '???' stays as the placeholder fallback.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY', '???')

# Centre of the circular search area, in decimal degrees.
center_lat = 35.688667555
center_long = 139.6917354

# 111.32 is used as the number of kilometres per degree on BOTH axes.
city_radius = 20 / 111.32                      #20km in units of long/lat

# Search radius sent to the API (metres) and the same length in degrees.
nearby_radius_m = 1000
nearby_radius = nearby_radius_m / 1000 / 111.32      #1km in units of long/lat

url_base = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?'

# Accumulators filled by the scrape loop, one entry per API result.
nearby_type, nearby_name, nearby_lat, nearby_long = [], [], [], []

# Start offsets of the two alternating scan rows: the second row is shifted
# by 0.8660254 (~sqrt(3)/2) radii in longitude and 1.5 radii in latitude.
delta_longs = [0, 0.8660254 * nearby_radius]
delta_lats = [0, 1.5 * nearby_radius]

# Sign pairs (x for longitude, y for latitude) selecting the four quadrants
# around the centre that are scanned one after another.
xs = [1, -1, -1,  1]
ys = [1,  1, -1, -1]

# A Nearby Search query yields at most MAX_PAGES pages of PAGE_SIZE results.
MAX_PAGES = 3
PAGE_SIZE = 20

# A next_page_token only becomes valid a short while after it was issued, so
# wait before using it and retry while the API still answers INVALID_REQUEST.
PAGE_TOKEN_WAIT_S = 2
PAGE_TOKEN_ATTEMPTS = 5


def _fetch_places_page(params, is_followup=False):
    """Request one Nearby Search page and return the decoded JSON document.

    Parameters
    ----------
    params : list of (str, value) tuples
        Query-string parameters; they are URL-encoded and appended to
        ``url_base``.
    is_followup : bool
        True for a ``pagetoken`` request. Only those are retried on
        ``INVALID_REQUEST`` (the token may simply not be active yet).

    Raises
    ------
    RuntimeError
        If the API reports a status other than ``OK`` / ``ZERO_RESULTS``
        (bad key, quota exceeded, ...). Results gathered so far remain in
        the module-level ``nearby_*`` lists.
    """
    # keep the comma of "lat,long" readable; everything else is escaped
    request_url = url_base + urllib.parse.urlencode(params, safe=',')
    attempts = PAGE_TOKEN_ATTEMPTS if is_followup else 1

    for attempt in range(1, attempts + 1):
        # the context manager closes the connection, no explicit close()
        with urllib.request.urlopen(request_url) as response:
            page = json.loads(response.read().decode())

        status = page.get('status')
        if status in ('OK', 'ZERO_RESULTS'):
            return page

        if status != 'INVALID_REQUEST' or attempt == attempts:
            raise RuntimeError('Places API request failed: ' + str(status)
                               + ' ' + str(page.get('error_message', '')))

        time.sleep(PAGE_TOKEN_WAIT_S)


def _collect_nearby(lat_deg, long_deg, place_type):
    """Append all places of ``place_type`` around one grid point.

    Follows ``next_page_token`` for up to ``MAX_PAGES`` pages and appends
    every result to ``nearby_name`` / ``nearby_lat`` / ``nearby_long`` /
    ``nearby_type``. Prints a warning when the last allowed page is full,
    because further results may then have been cut off.
    """
    params = [('location', str(lat_deg) + ',' + str(long_deg)),
              ('radius', nearby_radius_m),
              ('rankby', 'prominence'),
              ('type', place_type),
              ('key', api_key)]
    is_followup = False

    for page_number in range(1, MAX_PAGES + 1):
        page = _fetch_places_page(params, is_followup)
        results = page.get('results', [])

        for result in results:
            nearby_name.append(result['name'])
            nearby_lat.append(result['geometry']['location']['lat'])
            nearby_long.append(result['geometry']['location']['lng'])
            nearby_type.append(place_type)

        if page_number == MAX_PAGES:
            if len(results) == PAGE_SIZE:
                print('nearby_radius is too large!')
            return

        token = page.get('next_page_token')
        if not token:
            return

        # Follow-up requests must carry the API key as well as the token.
        params = [('pagetoken', token), ('key', api_key)]
        is_followup = True
        time.sleep(PAGE_TOKEN_WAIT_S)


def _inside_city(lat_deg, long_deg):
    """True while the point lies within ``city_radius`` (degrees) of the centre."""
    return ((center_lat - lat_deg)**2 + (center_long - long_deg)**2) < city_radius**2


# Scan the city circle quadrant by quadrant. Inside a quadrant the points lie
# on rows 1.5 radii apart, spaced 1.732051 (~sqrt(3)) radii along a row, with
# every second row shifted by half a step, so the search discs overlap.
# Rows and columns through the centre are visited by more than one quadrant;
# the repeated results end up as duplicate rows in df_stations_raw.
# NOTE(review): the same km-per-degree factor is applied to longitude and
# latitude, so the real east-west spacing is presumably somewhat tighter than
# the north-south spacing at this latitude - confirm whether that is intended.
for x, y in zip(xs, ys):

    iter_long = center_long
    iter_lat = center_lat

    while _inside_city(iter_lat, iter_long):

        # Two passes across: the plain row, then the half-step shifted row.
        for delta_long, delta_lat in zip(delta_longs, delta_lats):

            iter_long += delta_long * x
            iter_lat += delta_lat * y

            # One pass across: walk outwards until the circle is left.
            while_loops = 0

            while _inside_city(iter_lat, iter_long):

                while_loops += 1

                for nearby in nearby_types:
                    _collect_nearby(iter_lat, iter_long, nearby)

                iter_long += 1.732051 * nearby_radius * x

            # Need to reset original longitudinal position
            iter_long = center_long
            print('Loops: ' + str(while_loops))
            print('iter_long: ' + str(iter_long))
            print('iter_lat: ' + str(iter_lat))

        # Need to reset original longitudinal position
        iter_long = center_long
        iter_lat += 1.5 * nearby_radius * y

# One row per API result, still including repeats from overlapping searches.
df_stations_raw = pd.DataFrame({
    'type': nearby_type,
    'name': nearby_name,
    'lat': nearby_lat,
    'long': nearby_long})


Loops: 12
iter_long: 139.6917354
iter_lat: 35.688667555
Loops: 12
iter_long: 139.6917354
iter_lat: 35.702142222624865
Loops: 12
iter_long: 139.6917354
iter_lat: 35.71561689024973
Loops: 11
iter_long: 139.6917354
iter_lat: 35.72909155787459
Loops: 12
iter_long: 139.6917354
iter_lat: 35.74256622549945
Loops: 11
iter_long: 139.6917354
iter_lat: 35.756040893124315
Loops: 11
iter_long: 139.6917354
iter_lat: 35.76951556074918
Loops: 10
iter_long: 139.6917354
iter_lat: 35.78299022837404
Loops: 10
iter_long: 139.6917354
iter_lat: 35.7964648959989
Loops: 9
iter_long: 139.6917354
iter_lat: 35.809939563623765
Loops: 8
iter_long: 139.6917354
iter_lat: 35.82341423124863
Loops: 7
iter_long: 139.6917354
iter_lat: 35.83688889887349
Loops: 6
iter_long: 139.6917354
iter_lat: 35.85036356649835
Loops: 3
iter_long: 139.6917354
iter_lat: 35.863838234123214
Loops: 12
iter_long: 139.6917354
iter_lat: 35.688667555
Loops: 12
iter_long: 139.6917354
iter_lat: 35.702142222624865
Loops: 12
iter_long: 139.6917354
it

In [4]:
# Summarise the raw API results: row count, column dtypes and non-null counts.
df_stations_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082 entries, 0 to 1081
Data columns (total 4 columns):
lat     1082 non-null float64
long    1082 non-null float64
name    1082 non-null object
type    1082 non-null object
dtypes: float64(2), object(2)
memory usage: 33.9+ KB


In [5]:
# Preview the raw results; 'type' still holds the Google place type that was queried.
df_stations_raw.head()

Unnamed: 0,lat,long,name,type
0,35.689738,139.700391,Shinjuku Station,train_station
1,35.683689,139.698916,Minami-Shinjuku Station,train_station
2,35.680986,139.686182,Hatsudai Station,train_station
3,35.686888,139.698322,Toei Chikatetsuniijuku Station,subway_station
4,35.689738,139.700391,Shinjuku Station,subway_station


In [6]:
# Preview the check-in table loaded from df_afterwrangling.csv (indexed by timestamp).
df.head()

Unnamed: 0_level_0,userid,venuecat,lat,long,day,hour,maincat,venuecat_encoded,maincat_encoded
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-04 03:17:18,1541,Cosmetics Shop,35.705101,139.61959,2,3,Shop & Service,129,1
2012-04-04 03:22:04,868,Ramen / Noodle House,35.715581,139.800317,2,3,Food,2,2
2012-04-04 04:12:07,114,Convenience Store,35.714542,139.480065,2,4,Shop & Service,3,1
2012-04-04 04:12:13,868,Food & Drink Shop,35.725592,139.776633,2,4,Shop & Service,6,1
2012-04-04 04:18:23,1458,Housing Development,35.656083,139.734046,2,4,Residence,212,8


In [7]:
# Keep only the check-ins made at rail stops, one frame per venue category.
is_train_checkin = df["venuecat"] == "Train Station"
is_subway_checkin = df["venuecat"] == "Subway"

df_train = df[is_train_checkin]
df_subway = df[is_subway_checkin]

In [8]:
# Pull the coordinate columns out as plain arrays for the distance step.

# check-in coordinates, per venue category
train_lat, train_long = df_train["lat"].values, df_train["long"].values
subway_lat, subway_long = df_subway["lat"].values, df_subway["long"].values

# coordinates of the stations returned by the API
lat, long = df_stations_raw["lat"].values, df_stations_raw["long"].values

In [9]:
def _min_sq_dist(point_lat, point_long, ref_lats, ref_longs):
    """Smallest squared lat/long difference between one point and the reference coordinates."""
    return min((point_lat - ref_lats)**2 + (point_long - ref_longs)**2)


# For every API station: squared distance (in degrees) to the closest
# "Train Station" check-in and to the closest "Subway" check-in.
# NOTE(review): distances are taken on raw degrees, i.e. longitude and
# latitude differences are weighted equally - confirm this is accurate enough.
train_min = [_min_sq_dist(station_lat, station_long, train_lat, train_long)
             for station_lat, station_long in zip(lat, long)]
subway_min = [_min_sq_dist(station_lat, station_long, subway_lat, subway_long)
              for station_lat, station_long in zip(lat, long)]

In [10]:
# Overwrite the Google place type with the category of the nearest check-in:
# "Train Station" only when a train check-in is strictly closer, else "Subway".
station_labels = []
for train_dist, subway_dist in zip(train_min, subway_min):
    if train_dist < subway_dist:
        station_labels.append("Train Station")
    else:
        station_labels.append("Subway")

df_stations_raw["type"] = station_labels

In [11]:
# Check the relabelling: 'type' now holds "Train Station" / "Subway" instead of the Google place type.
df_stations_raw.head()

Unnamed: 0,lat,long,name,type
0,35.689738,139.700391,Shinjuku Station,Train Station
1,35.683689,139.698916,Minami-Shinjuku Station,Train Station
2,35.680986,139.686182,Hatsudai Station,Train Station
3,35.686888,139.698322,Toei Chikatetsuniijuku Station,Subway
4,35.689738,139.700391,Shinjuku Station,Train Station


In [12]:
# Collapse rows that are identical in every column (the same station is
# returned by several overlapping searches) and renumber the index from 0.
df_stations = (
    df_stations_raw
    .drop_duplicates()
    .reset_index(drop = True)
)

In [13]:
# Preview the de-duplicated station table.
df_stations.head()

Unnamed: 0,lat,long,name,type
0,35.689738,139.700391,Shinjuku Station,Train Station
1,35.683689,139.698916,Minami-Shinjuku Station,Train Station
2,35.680986,139.686182,Hatsudai Station,Train Station
3,35.686888,139.698322,Toei Chikatetsuniijuku Station,Subway
4,35.69057,139.692661,Tochōmae Sta.,Subway


In [14]:
# Row count and dtypes after de-duplication, for comparison with the raw table.
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 580 entries, 0 to 579
Data columns (total 4 columns):
lat     580 non-null float64
long    580 non-null float64
name    580 non-null object
type    580 non-null object
dtypes: float64(2), object(2)
memory usage: 18.2+ KB


In [15]:
# Persist the station table for later use. The index is written as the
# first, unnamed column (to_csv default), so read it back with index_col=0.
df_stations.to_csv('stations.csv')