# Predicting Check-ins of Foursquare Users in Tokyo

## 3 - Feature Engineering

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib import rcParams

from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

import itertools

%matplotlib inline

In [2]:
# Setup default matplotlib defaults for later use
def defaultsettings():
    rcParams.update(matplotlib.rcParamsDefault)
    sns.set_style()
    sns.set_context()
    %matplotlib inline

In [3]:
# Seaborn settings used for bar plots
def postersettings():
    """Reset matplotlib's rcParams to the library defaults, then apply
    seaborn's "whitegrid" style and "poster" context.
    """
    # ``rcParams`` imported from matplotlib is the same mapping as ``matplotlib.rcParams``.
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    sns.set_style(style="whitegrid")
    sns.set_context(context="poster")

In [4]:
# Load wrangled data.
# The first CSV column holds the check-in timestamps: use it as the index and
# parse it as datetimes. `parse_dates` alone handles this; the explicit
# `date_parser` / `infer_datetime_format` hooks are deprecated in pandas 2.x,
# so they are not passed.
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0])

# Station reference table (lat, long, name, type), indexed by its first column.
df_stations = pd.read_csv('stations.csv', index_col = 0)

In [5]:
# Preview the first rows of the wrangled check-in data.
df.head()

Unnamed: 0_level_0,userid,venuecat,lat,long,day,hour,maincat,venuecat_encoded,maincat_encoded
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-04 03:17:18,1541,Cosmetics Shop,35.705101,139.61959,2,3,Shop & Service,129,1
2012-04-04 03:22:04,868,Ramen / Noodle House,35.715581,139.800317,2,3,Food,2,2
2012-04-04 04:12:07,114,Convenience Store,35.714542,139.480065,2,4,Shop & Service,3,1
2012-04-04 04:12:13,868,Food & Drink Shop,35.725592,139.776633,2,4,Shop & Service,6,1
2012-04-04 04:18:23,1458,Housing Development,35.656083,139.734046,2,4,Residence,212,8


In [6]:
# Column dtypes, non-null counts and memory footprint of the check-in data.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 573703 entries, 2012-04-04 03:17:18 to 2013-02-16 11:35:29
Data columns (total 9 columns):
userid              573703 non-null int64
venuecat            573703 non-null object
lat                 573703 non-null float64
long                573703 non-null float64
day                 573703 non-null int64
hour                573703 non-null int64
maincat             573703 non-null object
venuecat_encoded    573703 non-null int64
maincat_encoded     573703 non-null int64
dtypes: float64(2), int64(5), object(2)
memory usage: 43.8+ MB


In [7]:
# Preview the first rows of the station reference table.
df_stations.head()

Unnamed: 0,lat,long,name,type
0,35.689738,139.700391,Shinjuku Station,Train Station
1,35.683689,139.698916,Minami-Shinjuku Station,Train Station
2,35.680986,139.686182,Hatsudai Station,Train Station
3,35.686888,139.698322,Toei Chikatetsuniijuku Station,Subway
4,35.69057,139.692661,Tochōmae Sta.,Subway


In [8]:
# Column dtypes and non-null counts of the station reference table.
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580 entries, 0 to 579
Data columns (total 4 columns):
lat     580 non-null float64
long    580 non-null float64
name    580 non-null object
type    580 non-null object
dtypes: float64(2), object(2)
memory usage: 22.7+ KB


### Stacked Model, Level 0: Train Station, Subway, or Other

In [9]:
# Level-0 target: keep venue codes 0 and 1 as their own classes and fold every
# other venue code into class 2.
# (Presumably 0 = Train Station and 1 = Subway, going by the section heading — confirm.)
venue_code = df["venuecat_encoded"]
df["y0"] = venue_code.where(venue_code.isin([0, 1]), 2)

### Who

In [10]:
# Per-user running share of each y0 class: for every check-in, the fraction of
# that user's check-ins so far (this one included) that fell in each class.
#
# Computed as cumulative sum / cumulative count within each user, which is what
# an expanding mean of 0/1 indicators equals. Unlike grouping and then
# re-sorting by timestamp, this leaves df's rows where they are, so check-ins
# that share a timestamp cannot change places and the columns attached
# positionally further down stay matched to the right row. It also does not
# rely on the window result carrying the grouping column along.
# Assumes df is already in chronological order — TODO confirm.
# NOTE(review): each row's average includes that row's own y0, so this feature
# contains the label it is later stored next to; consider excluding the
# current check-in (e.g. a shift within user) — confirm intent.
y0_onehot = pd.get_dummies(df.y0, prefix = "rollavg").astype(float)
by_user = y0_onehot.groupby(df.userid.values)
checkins_so_far = by_user.cumcount().values + 1

df_who = by_user.cumsum().div(checkins_so_far, axis = 0)

# Keep userid as the first column (as float, like the averaged columns);
# later cells group on it.
df_who.insert(0, "userid", df.userid.values.astype(float))

In [11]:
# Inspect the last rows of the per-user running averages.
df_who.tail()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-16 11:34:35,326.0,0.637795,0.015748,0.346457
2013-02-16 11:34:53,853.0,0.421569,0.029412,0.54902
2013-02-16 11:34:55,1502.0,0.300971,0.029126,0.669903
2013-02-16 11:35:17,408.0,0.397403,0.181818,0.420779
2013-02-16 11:35:29,1050.0,0.475,0.030556,0.494444


### When

In [12]:
# Stand-alone frame with just the user id and the day / hour of each check-in;
# the time-bucket flags are added to this frame, not to df.
df_when = df[["userid", "day", "hour"]].copy()

In [13]:
# Flag each check-in as a weekday or weekend morning, afternoon or evening.
# Going by the bucket descriptions below, day 0 is Monday and day 6 is Sunday,
# and the hours before 3am count towards the previous day's evening.
day_of_week = df_when.day
hour_of_day = df_when.hour

on_weekday = day_of_week <= 4        # Monday-Friday
on_weekend = day_of_week >= 5        # Saturday-Sunday
from_3am = hour_of_day >= 3
before_3am = hour_of_day < 3

# Monday-Friday, 3am-10am
df_when['weekday_morning'] = on_weekday & from_3am & (hour_of_day < 10)

# Monday-Friday, 10am-3pm
df_when['weekday_afternoon'] = on_weekday & (hour_of_day >= 10) & (hour_of_day < 15)

# Monday-Friday after 3pm, or Tuesday-Saturday before 3am
df_when['weekday_evening'] = (
    (on_weekday & (hour_of_day >= 15))
    | (day_of_week.between(1, 5) & before_3am)
)

# Saturday-Sunday, 3am-2pm
df_when['weekend_morning'] = on_weekend & from_3am & (hour_of_day < 14)

# Saturday-Sunday, 2pm-7pm
df_when['weekend_afternoon'] = on_weekend & (hour_of_day >= 14) & (hour_of_day < 19)

# Saturday-Sunday after 7pm, or Sunday-Monday before 3am
df_when['weekend_evening'] = (
    (on_weekend & (hour_of_day >= 19))
    | (day_of_week.isin([6, 0]) & before_3am)
)

In [14]:
# Preview the time-bucket flags for the first check-ins.
df_when.head()

Unnamed: 0_level_0,userid,day,hour,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-04 03:17:18,1541,2,3,True,False,False,False,False,False
2012-04-04 03:22:04,868,2,3,True,False,False,False,False,False
2012-04-04 04:12:07,114,2,4,True,False,False,False,False,False
2012-04-04 04:12:13,868,2,4,True,False,False,False,False,False
2012-04-04 04:18:23,1458,2,4,True,False,False,False,False,False


### Where

In [15]:
# Check-in coordinates as plain NumPy arrays.
lat = df["lat"].values
long = df["long"].values

# Station coordinates, split by the station table's `type` column.
is_train_station = df_stations["type"] == "Train Station"
is_subway_station = df_stations["type"] == "Subway"

trains_lat = df_stations.loc[is_train_station, "lat"].values
trains_long = df_stations.loc[is_train_station, "long"].values

subways_lat = df_stations.loc[is_subway_station, "lat"].values
subways_long = df_stations.loc[is_subway_station, "long"].values

In [16]:
# For every check-in, the smallest squared distance to any train station.
# Distances are taken directly on the latitude/longitude differences, so the
# values are in squared degrees (no projection is applied).
# ndarray.min() does the reduction inside NumPy instead of walking the array
# with the builtin min(), and the loop variables no longer reuse the names of
# the notebook-level `lat` / `long` arrays.
nearest_train = [
    ((checkin_lat - trains_lat)**2 + (checkin_long - trains_long)**2).min()
    for checkin_lat, checkin_long in zip(lat, long)
]

In [17]:
# For every check-in, the smallest squared distance to any subway station.
# Distances are taken directly on the latitude/longitude differences, so the
# values are in squared degrees (no projection is applied).
# ndarray.min() does the reduction inside NumPy instead of walking the array
# with the builtin min(), and the loop variables no longer reuse the names of
# the notebook-level `lat` / `long` arrays.
nearest_subway = [
    ((checkin_lat - subways_lat)**2 + (checkin_long - subways_long)**2).min()
    for checkin_lat, checkin_long in zip(lat, long)
]

In [18]:
# TODO: is there anything better than the 1.5x and 1x radii?
# TODO: multiply features; if that is not possible, engineer them.

# Squared-distance cut-offs, in squared degrees to match nearest_train /
# nearest_subway. 0.000898 degrees of latitude is roughly 100 m — presumably
# the intended base radius; confirm.
train_radius_sq = (1.5*0.000898)**2
subway_radius_sq = (0.000898)**2

train_sq_dist = np.asarray(nearest_train)
subway_sq_dist = np.asarray(nearest_subway)

# A check-in is "near a train station" when the train station is strictly
# closer than the nearest subway AND within the train radius; "near a subway"
# is the mirror image with the subway radius. An exact tie sets neither flag.
col_train = list((train_sq_dist < subway_sq_dist) & (train_sq_dist < train_radius_sq))
col_subway = list((train_sq_dist > subway_sq_dist) & (subway_sq_dist < subway_radius_sq))

In [19]:
# Share of check-ins flagged as near a train station.
np.mean(col_train)

0.40801599433853403

In [20]:
# Share of check-ins flagged as near a subway station.
np.mean(col_subway)

0.072823394683311754

In [21]:
# First feature matrix: per-user running averages, time-bucket flags,
# station-proximity flags and the level-0 target stored as "y".
# Note: df_X is built again later in the notebook (with y0 / y1 targets),
# which replaces this version.
who_features = df_who.loc[:, "rollavg_0":]
when_features = df_when.loc[:, "weekday_morning":]

df_X = pd.concat([who_features, when_features], axis = 1)
# The two proximity flags are plain lists, so they are attached by row position.
df_X["train_nearby"] = col_train
df_X["subway_nearby"] = col_subway
df_X["y"] = df.y0

### Who and When

In [22]:
# Work on a copy of the per-user running averages so df_who itself is left untouched.
df_who_when_togroup = df_who.copy()

In [23]:
# Collapse the six boolean time-bucket columns into one label per row: the
# name of the column that is True. np.nonzero walks the flags row by row, so
# its column positions come back in row order; this relies on every row having
# exactly one True flag (otherwise the lengths would not match).
time_flags = df_when.loc[:, "weekday_morning" : ]
_, true_col_pos = np.nonzero((time_flags == True).values)
df_who_when_togroup["timecat"] = time_flags.columns[true_col_pos]

In [24]:
# Running average of the value columns within each (user, time bucket) pair.
#
# Done as cumulative sum / cumulative count per group (equal to an expanding
# mean up to floating-point rounding). This keeps the rows exactly where they
# are in df_who_when_togroup, whereas grouping and re-sorting by timestamp
# can shuffle check-ins that share a timestamp. It also keeps the userid and
# timecat columns explicitly instead of relying on the window operation to
# pass a key column and a string column through.
# NOTE(review): the input columns are already per-user running averages, so
# the result is an average of averages — confirm that is intended.
who_when_keys = ["userid", "timecat"]
who_when_value_cols = [col for col in df_who_when_togroup.columns
                       if col not in who_when_keys]

by_user_time = df_who_when_togroup[who_when_value_cols].groupby(
    [df_who_when_togroup[key].values for key in who_when_keys])
who_when_rows_so_far = by_user_time.cumcount().values + 1

df_who_when = df_who_when_togroup.copy()
df_who_when[who_when_value_cols] = \
    by_user_time.cumsum().div(who_when_rows_so_far, axis = 0).values

In [25]:
# Preview the per-user, per-time-bucket running averages.
df_who_when.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,timecat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,weekday_morning
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,weekday_morning
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,weekday_morning
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,weekday_morning
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,weekday_morning


### Who and Where

In [26]:
# Work on a copy of the per-user running averages so df_who itself is left untouched.
df_who_where_togroup = df_who.copy()

In [27]:
# Attach the two station-proximity flags (plain lists, so matched by row
# position) and a third flag for check-ins that are near neither kind of station.
df_who_where_togroup["train_nearby"] = col_train
df_who_where_togroup["subway_nearby"] = col_subway

near_any_station = df_who_where_togroup.train_nearby | df_who_where_togroup.subway_nearby
df_who_where_togroup["none"] = ~near_any_station

In [28]:
# Preview the running averages together with the three proximity flags.
df_who_where_togroup.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,train_nearby,subway_nearby,none
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,True,False,False
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,False,False,True
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,False,False,True
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,False,False,True
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,False,False,True


In [29]:
# Collapse the proximity flag columns into one label per row: the name of the
# column that is True. np.nonzero walks the flags row by row, so its column
# positions come back in row order; this relies on every row having exactly
# one True flag (otherwise the lengths would not match).
near_flags = df_who_where_togroup.loc[:, "train_nearby" : ]
_, true_col_pos = np.nonzero((near_flags == True).values)
df_who_where_togroup["nearcat"] = near_flags.columns[true_col_pos]

In [30]:
# Preview the frame again, now including the single proximity label column.
df_who_where_togroup.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,train_nearby,subway_nearby,none,nearcat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,True,False,False,train_nearby
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,False,False,True,none
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,False,False,True,none
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,False,False,True,none
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,False,False,True,none


In [31]:
# Running average of the value columns within each (user, proximity label)
# pair, after dropping the three boolean flags that the label replaces.
#
# Done as cumulative sum / cumulative count per group (equal to an expanding
# mean up to floating-point rounding). This keeps the rows exactly where they
# are in df_who_where_togroup, whereas grouping and re-sorting by timestamp
# can shuffle check-ins that share a timestamp. It also keeps the userid and
# nearcat columns explicitly instead of relying on the window operation to
# pass a key column and a string column through.
# NOTE(review): the input columns are already per-user running averages, so
# the result is an average of averages — confirm that is intended.
who_where_keys = ["userid", "nearcat"]
df_who_where = df_who_where_togroup \
    .drop(["train_nearby", "subway_nearby", "none"], axis = 1) \
    .copy()
who_where_value_cols = [col for col in df_who_where.columns
                        if col not in who_where_keys]

by_user_place = df_who_where[who_where_value_cols].groupby(
    [df_who_where[key].values for key in who_where_keys])
who_where_rows_so_far = by_user_place.cumcount().values + 1

df_who_where[who_where_value_cols] = \
    by_user_place.cumsum().div(who_where_rows_so_far, axis = 0).values

In [32]:
# Preview the per-user, per-proximity-label running averages.
df_who_where.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,nearcat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,train_nearby
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,none
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,none
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,none
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,none


### Who, When, and Where

In [33]:
# Start from a copy of the per-user, per-time-bucket frame.
# NOTE(review): the value columns copied here are already running averages, so
# averaging them again later yields an average of averages — confirm intended.
df_who_when_where_togroup = df_who_when.copy()

In [34]:
# Add the proximity label taken from df_who_where.
# NOTE(review): this is an index-aligned Series assignment and the timestamp
# index may contain repeated values — confirm both frames are in the same row order.
df_who_when_where_togroup["nearcat"] = df_who_where.nearcat

In [35]:
# Preview the frame with both the time-bucket and the proximity labels.
df_who_when_where_togroup.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,timecat,nearcat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,weekday_morning,train_nearby
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,weekday_morning,none


In [36]:
# Running average of the value columns within each
# (user, time bucket, proximity label) triple.
#
# Done as cumulative sum / cumulative count per group (equal to an expanding
# mean up to floating-point rounding). This keeps the rows exactly where they
# are in df_who_when_where_togroup, whereas grouping and re-sorting by
# timestamp can shuffle check-ins that share a timestamp. It also keeps the
# userid, timecat and nearcat columns explicitly instead of relying on the
# window operation to pass key and string columns through.
# NOTE(review): the input columns are already running averages, so the result
# is an average of averages — confirm that is intended.
who_when_where_keys = ["userid", "timecat", "nearcat"]
who_when_where_value_cols = [col for col in df_who_when_where_togroup.columns
                             if col not in who_when_where_keys]

by_user_time_place = df_who_when_where_togroup[who_when_where_value_cols].groupby(
    [df_who_when_where_togroup[key].values for key in who_when_where_keys])
who_when_where_rows_so_far = by_user_time_place.cumcount().values + 1

df_who_when_where = df_who_when_where_togroup.copy()
df_who_when_where[who_when_where_value_cols] = \
    by_user_time_place.cumsum().div(who_when_where_rows_so_far, axis = 0).values

In [37]:
# Preview the first rows of the per-user, per-time-bucket, per-proximity averages.
df_who_when_where.head()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,timecat,nearcat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-04 03:17:18,1541.0,0.0,0.0,1.0,weekday_morning,train_nearby
2012-04-04 03:22:04,868.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:12:07,114.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:12:13,868.0,0.0,0.0,1.0,weekday_morning,none
2012-04-04 04:18:23,1458.0,0.0,0.0,1.0,weekday_morning,none


In [38]:
# Inspect the last rows, where the running averages have accumulated the most history.
df_who_when_where.tail()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,timecat,nearcat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-16 11:34:35,326.0,0.679612,0.002939,0.317449,weekend_morning,none
2013-02-16 11:34:53,853.0,0.45529,0.050181,0.494529,weekend_morning,train_nearby
2013-02-16 11:34:55,1502.0,0.42144,0.010025,0.568535,weekend_morning,none
2013-02-16 11:35:17,408.0,0.5654,0.10096,0.333641,weekend_morning,none
2013-02-16 11:35:29,1050.0,0.571252,0.020238,0.40851,weekend_morning,train_nearby


In [39]:
# Base feature matrix: per-user running averages, time-bucket flags,
# station-proximity flags, plus both targets
# (y0 = level-0 class, y1 = encoded venue category).
who_features = df_who.loc[:, "rollavg_0":]
when_features = df_when.loc[:, "weekday_morning":]

df_X = pd.concat([who_features, when_features], axis = 1)
# The two proximity flags are plain lists, so they are attached by row position.
df_X["train_nearby"] = col_train
df_X["subway_nearby"] = col_subway
df_X["y0"] = df.y0
df_X["y1"] = df.venuecat_encoded

In [40]:
# Level-0 design matrix: the base features followed by the three running
# average columns from each of the interaction frames
# (who+when, who+where, who+when+where), in that order.
# NOTE(review): every appended block reuses the labels rollavg_0..rollavg_2,
# so df_X0 ends up with repeated column names; they can only be told apart
# by position — confirm downstream code expects that.
rollavg_cols = slice("rollavg_0", "rollavg_2")
interaction_frames = (df_who_when, df_who_where, df_who_when_where)

df_X0 = pd.concat(
    [df_X] + [frame.loc[:, rollavg_cols] for frame in interaction_frames],
    axis = 1)

In [41]:
# Preview the first rows of the level-0 design matrix.
df_X0.head()

Unnamed: 0_level_0,rollavg_0,rollavg_1,rollavg_2,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening,train_nearby,...,y1,rollavg_0,rollavg_1,rollavg_2,rollavg_0,rollavg_1,rollavg_2,rollavg_0,rollavg_1,rollavg_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-04 03:17:18,0.0,0.0,1.0,True,False,False,False,False,False,True,...,129,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2012-04-04 03:22:04,0.0,0.0,1.0,True,False,False,False,False,False,False,...,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2012-04-04 04:12:07,0.0,0.0,1.0,True,False,False,False,False,False,False,...,3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2012-04-04 04:12:13,0.0,0.0,1.0,True,False,False,False,False,False,False,...,6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2012-04-04 04:18:23,0.0,0.0,1.0,True,False,False,False,False,False,False,...,212,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [42]:
# Inspect the last rows of the level-0 design matrix.
df_X0.tail()

Unnamed: 0_level_0,rollavg_0,rollavg_1,rollavg_2,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening,train_nearby,...,y1,rollavg_0,rollavg_1,rollavg_2,rollavg_0,rollavg_1,rollavg_2,rollavg_0,rollavg_1,rollavg_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-02-16 11:34:35,0.637795,0.015748,0.346457,False,False,False,True,False,False,False,...,42,0.672337,0.006685,0.320978,0.660851,0.009129,0.33002,0.679612,0.002939,0.317449
2013-02-16 11:34:53,0.421569,0.029412,0.54902,False,False,False,True,False,False,True,...,0,0.463668,0.045831,0.490501,0.473825,0.054248,0.471927,0.45529,0.050181,0.494529
2013-02-16 11:34:55,0.300971,0.029126,0.669903,False,False,False,True,False,False,False,...,121,0.358849,0.021469,0.619682,0.321361,0.023385,0.655254,0.42144,0.010025,0.568535
2013-02-16 11:35:17,0.397403,0.181818,0.420779,False,False,False,True,False,False,False,...,17,0.452211,0.146535,0.401254,0.417051,0.153969,0.42898,0.5654,0.10096,0.333641
2013-02-16 11:35:29,0.475,0.030556,0.494444,False,False,False,True,False,False,True,...,50,0.541239,0.029657,0.429104,0.541044,0.028694,0.430262,0.571252,0.020238,0.40851


In [43]:
# Write the level-0 design matrix (timestamp index included) to disk.
# NOTE(review): df_X0 contains the labels rollavg_0..rollavg_2 more than once;
# a CSV reader may rename repeated headers on load — confirm the consumer expects that.
df_X0.to_csv("df_X0.csv")