# Predicting Check-ins of Foursquare Users in Tokyo

## 1 - Data Wrangling, Foursquare

In [1]:
# Import libraries.
import os
import urllib.request, json
from datetime import datetime

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

In [2]:
# Load the Tokyo check-in data set downloaded from Kaggle
# (https://www.kaggle.com/chetanism/foursquare-nyc-and-tokyo-checkin-dataset/data).
# The path is relative, so the CSV must sit next to this notebook.
checkins_csv_path = 'dataset_TSMC2014_TKY.csv'
df = pd.read_csv(checkins_csv_path)

In [3]:
# Quick look at the raw frame: column dtypes and non-null counts first,
# then the first five rows as the cell output.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573703 entries, 0 to 573702
Data columns (total 8 columns):
userId             573703 non-null int64
venueId            573703 non-null object
venueCategoryId    573703 non-null object
venueCategory      573703 non-null object
latitude           573703 non-null float64
longitude          573703 non-null float64
timezoneOffset     573703 non-null int64
utcTimestamp       573703 non-null object
dtypes: float64(2), int64(2), object(4)
memory usage: 35.0+ MB


Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.61959,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065,540,Tue Apr 03 19:12:07 +0000 2012
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.725592,139.776633,540,Tue Apr 03 19:12:13 +0000 2012
4,1458,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083,139.734046,540,Tue Apr 03 19:18:23 +0000 2012


In [4]:
# Parse the raw timestamp strings (e.g. 'Tue Apr 03 18:17:18 +0000 2012')
# into a new datetime column called `timestamp`. The '+0000' offset is matched
# as literal text, so the parsed values are timezone-naive.
utc_timestamp_format = '%a %b %d %H:%M:%S +0000 %Y'
df['timestamp'] = pd.to_datetime(df['utcTimestamp'], format=utc_timestamp_format)

In [5]:
# Index the frame by check-in time, expressed as Tokyo wall-clock time.
df = df.set_index('timestamp')

# The parsed timestamps are naive UTC: tag them as UTC, convert to Asia/Tokyo,
# then strip the timezone again so the index holds naive local times.
tokyo_local_index = (
    df.index
    .tz_localize('UTC')
    .tz_convert('Asia/Tokyo')
    .tz_localize(None)
)
df.index = tokyo_local_index

# Keep the rows in chronological order.
df = df.sort_index()

In [6]:
# Remove columns that are not used in the rest of the analysis:
#   * timezoneOffset - holds the same value for every observation.
#   * venueId        - individual venues are out of scope for this exercise.
#   * utcTimestamp   - superseded by the datetime index built above.
unused_columns = ['timezoneOffset', 'venueId', 'utcTimestamp']
df = df.drop(unused_columns, axis = 'columns')

In [7]:
# Shorten the remaining column names and make them all lowercase.
short_column_names = {
    'userId': 'userid',
    'venueCategoryId': 'venuecatid',
    'venueCategory': 'venuecat',
    'latitude': 'lat',
    'longitude': 'long',
}
df = df.rename(columns = short_column_names)

In [8]:
# Derive calendar features from the (Tokyo local time) index:
# `day` is the weekday number (Monday = 0) and `hour` is the hour of day.
df = df.assign(day = df.index.weekday, hour = df.index.hour)

In [9]:
# Which venue categories occur, and how many check-ins does each one have?
# (value_counts lists them from most to least frequent.)
df['venuecat'].value_counts()

Train Station                      200428
Subway                              41666
Ramen /  Noodle House               17303
Convenience Store                   16833
Japanese Restaurant                 15680
Bar                                 14940
Food & Drink Shop                   14023
Electronics Store                   10897
Mall                                10839
Coffee Shop                          8959
Office                               8789
Café                                 8603
Bridge                               8162
Bus Station                          7965
Road                                 7528
Bookstore                            7418
Park                                 7247
Fast Food Restaurant                 6806
Arcade                               6153
College Academic Building            5981
Government Building                  5116
Building                             5036
Chinese Restaurant                   4718
Hobby Shop                        

There are 247 unique venue categories. Having this many classes will make multiclass classification challenging. Let's add a coarser label that groups each venue category into one of 9 primary (root) categories.

In [10]:
# Download Foursquare's venue-category listing. It is used below to map each
# specific venue category onto its root ("main") category.
#
# The OAuth token is a credential, so it is read from the environment instead
# of being embedded in the notebook. Export FOURSQUARE_OAUTH_TOKEN before
# starting the kernel.
foursquare_token = os.environ.get('FOURSQUARE_OAUTH_TOKEN')
if not foursquare_token:
    raise RuntimeError(
        'Set the FOURSQUARE_OAUTH_TOKEN environment variable to a valid '
        'Foursquare OAuth token before running this cell.'
    )

# `v` pins the API version date so the response format stays stable.
categories_url = (
    'https://api.foursquare.com/v2/venues/categories'
    '?oauth_token={token}&v=20170211'
).format(token = foursquare_token)

with urllib.request.urlopen(categories_url) as response:
    data = json.loads(response.read().decode())

In [11]:
# The df_categories dataframe links a specific venue category to its root 'maincat'.
# One row is produced per direct child of each root category in the API response:
#   venuecatid - the child category's id
#   name       - the child category's name
#   maincat    - the name of the root category it belongs to
#
# The rows are built straight from the decoded JSON dictionaries in one pass,
# rather than re-normalising the payload several times per root category.
#
# NOTE(review): only the first level below each root is flattened. If the API
# nests further sub-categories beneath these children, their ids are not
# included here - confirm whether that is why some ids fail to match later on.
root_categories = data['response']['categories']

category_records = [
    {'venuecatid': child['id'], 'name': child['name'], 'maincat': root['name']}
    for root in root_categories
    for child in root['categories']
]

# Passing `columns` fixes the column order and keeps the frame well-formed
# even if no records were produced.
df_categories = pd.DataFrame(category_records, columns = ['venuecatid', 'name', 'maincat'])

df_categories.head()

Unnamed: 0,venuecatid,name,maincat
0,56aa371be4b08b9a8d5734db,Amphitheater,Arts & Entertainment
1,4fceea171983d5d06c3e9823,Aquarium,Arts & Entertainment
2,4bf58dd8d48988d1e1931735,Arcade,Arts & Entertainment
3,4bf58dd8d48988d1e2931735,Art Gallery,Arts & Entertainment
4,4bf58dd8d48988d1e4931735,Bowling Alley,Arts & Entertainment


In [12]:
# Attach the root category ('maincat') to every check-in by looking up its venuecatid.
#
# A keyed lookup is used instead of merge + set_index: merge discards the
# timestamp index, and re-attaching it by position is only correct when the
# lookup table has no repeated ids. The lookup also makes this cell safe to
# re-run, because it overwrites `maincat` rather than adding suffixed copies.
category_lookup = (
    df_categories[['venuecatid', 'maincat']]
    .drop_duplicates()
    .set_index('venuecatid')['maincat']
)
if not category_lookup.index.is_unique:
    raise ValueError('df_categories maps at least one venuecatid to more than one maincat.')

# Ids missing from the lookup become NaN, exactly like unmatched rows of a left join.
# `.values` assigns positionally, so duplicated timestamps in the index cannot
# interfere with the assignment.
df['maincat'] = df['venuecatid'].map(category_lookup).values

In [13]:
# Several venue categories were not linked to their root categories by the
# venuecatid lookup above, so they are linked manually here by category name.
#
# The table below lists, for each root category, the venue category names that
# are assigned to it. Every listed name is set unconditionally, overwriting any
# value the lookup may have produced; names not listed keep their current value.
# Note that 'Ramen /  Noodle House' contains two spaces after the slash, matching
# the spelling in the data.
manual_maincat_groups = {
    'Food': [
        'Ramen /  Noodle House',
        'Japanese Restaurant',
        'Chinese Restaurant',
        'Korean Restaurant',
        'Thai Restaurant',
        'Spanish Restaurant',
        'Vietnamese Restaurant',
        'Sushi Restaurant',
        'Dim Sum Restaurant',
        'Brazilian Restaurant',
        'American Restaurant',
        'Tapas Restaurant',
        'Ethiopian Restaurant',
        'Taco Place',
        'Peruvian Restaurant',
        'Moroccan Restaurant',
        'Cuban Restaurant',
        'Argentinian Restaurant',
        'Burrito Place',
        'Arepa Restaurant',
        'Malaysian Restaurant',
        'South American Restaurant',
        'Food',
        'Ice Cream Shop',
        'Cupcake Shop',
    ],
    'Shop & Service': [
        'Food & Drink Shop',
        'Shop & Service',
        'Clothing Store',
    ],
    'Travel & Transport': [
        'Airport',
        'Train Station',
        'Bus Station',
        'Hotel',
        'Travel & Transport',
        'Ferry',
    ],
    'Professional & Other Places': [
        'Shrine',
        'Mosque',
        'Synagogue',
        'Government Building',
        'Medical Center',
        'Office',
        'Temple',
        'Convention Center',
        'High School',
        'Church',
        'Nursery School',
        'Elementary School',
        'Embassy / Consulate',
        'Professional & Other Places',
        'Middle School',
        'Music School',
    ],
    'Arts & Entertainment': [
        'Art Museum',
        'Music Venue',
        'History Museum',
        'Movie Theater',
        'Stadium',
        'Science Museum',
        'Theater',
        'Performing Arts Venue',
        'Arts & Entertainment',
        'Planetarium',
    ],
    'Outdoors & Recreation': [
        'Gym / Fitness Center',
        'Neighborhood',
        'Outdoors & Recreation',
        'Athletic & Sport',
        'City',
    ],
    'Nightlife Spot': [
        'Bar',
        'Beer Garden',
        'Nightlife Spot',
    ],
    'College & University': [
        'College Academic Building',
        'College Stadium',
        'College & University',
    ],
}

# Invert the grouping into a flat {venuecat: maincat} lookup.
manual_maincat_by_venuecat = {
    venuecat: maincat
    for maincat, venuecats in manual_maincat_groups.items()
    for venuecat in venuecats
}

# Apply every override in a single pass over the column. np.where works
# positionally, so duplicated timestamps in the index cannot affect the result.
manual_maincat = df['venuecat'].map(manual_maincat_by_venuecat)
df['maincat'] = np.where(manual_maincat.notna(), manual_maincat, df['maincat'])

In [14]:
# venuecatid was only needed as the join key for the category lookup,
# so it can be removed now.
df = df.drop('venuecatid', axis = 'columns')

In [15]:
# Encode venuecat and maincat as integers. Codes follow the frequency ranking
# returned by value_counts: the most common label gets 0, the next one 1, etc.
counts_venuecat = df['venuecat'].value_counts()
counts_maincat = df['maincat'].value_counts()

# One consecutive integer code per distinct label.
venuecats = np.arange(counts_venuecat.size)
maincats = np.arange(counts_maincat.size)

# label -> code lookups.
dict_venuecat_code = {label: code for label, code in zip(counts_venuecat.index, venuecats)}
dict_maincat_code = {label: code for label, code in zip(counts_maincat.index, maincats)}

# Translate every row's label into its code.
df['venuecat_encoded'] = df['venuecat'].map(dict_venuecat_code)
df['maincat_encoded'] = df['maincat'].map(dict_maincat_code)

In [16]:
# Preview the first five rows of the wrangled frame.
df.head(n=5)

Unnamed: 0_level_0,userid,venuecat,lat,long,day,hour,maincat,venuecat_encoded,maincat_encoded
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-04 03:17:18,1541,Cosmetics Shop,35.705101,139.61959,2,3,Shop & Service,129,1
2012-04-04 03:22:04,868,Ramen / Noodle House,35.715581,139.800317,2,3,Food,2,2
2012-04-04 04:12:07,114,Convenience Store,35.714542,139.480065,2,4,Shop & Service,3,1
2012-04-04 04:12:13,868,Food & Drink Shop,35.725592,139.776633,2,4,Shop & Service,6,1
2012-04-04 04:18:23,1458,Housing Development,35.656083,139.734046,2,4,Residence,212,8


In [17]:
# Persist the wrangled frame as a .csv file for later use.
# The timestamp index is written out as the first column.
wrangled_csv_path = 'df_afterwrangling.csv'
df.to_csv(wrangled_csv_path)