# Predicting Check-ins of Foursquare Users in Tokyo

## 4 - Feature Engineering

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load wrangled data.
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0],
                 date_parser = pd.to_datetime, 
                 infer_datetime_format = True)
df_stations = pd.read_csv('stations.csv', index_col = 0)

In [3]:
df.head()

Unnamed: 0_level_0,userid,venuecat,lat,long,day,hour,maincat,venuecat_encoded,maincat_encoded
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2012-04-04 03:17:18,1541,Cosmetics Shop,35.705101,139.61959,2,3,Shop & Service,129,1
2012-04-04 03:22:04,868,Ramen / Noodle House,35.715581,139.800317,2,3,Food,2,2
2012-04-04 04:12:07,114,Convenience Store,35.714542,139.480065,2,4,Shop & Service,3,1
2012-04-04 04:12:13,868,Food & Drink Shop,35.725592,139.776633,2,4,Shop & Service,6,1
2012-04-04 04:18:23,1458,Housing Development,35.656083,139.734046,2,4,Residence,212,8


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 573703 entries, 2012-04-04 03:17:18 to 2013-02-16 11:35:29
Data columns (total 9 columns):
userid              573703 non-null int64
venuecat            573703 non-null object
lat                 573703 non-null float64
long                573703 non-null float64
day                 573703 non-null int64
hour                573703 non-null int64
maincat             573703 non-null object
venuecat_encoded    573703 non-null int64
maincat_encoded     573703 non-null int64
dtypes: float64(2), int64(5), object(2)
memory usage: 43.8+ MB


In [5]:
df_stations.head()

Unnamed: 0,lat,long,name,type
0,35.689738,139.700391,Shinjuku Station,Train Station
1,35.683689,139.698916,Minami-Shinjuku Station,Train Station
2,35.680986,139.686182,Hatsudai Station,Train Station
3,35.686888,139.698322,Toei Chikatetsuniijuku Station,Subway
4,35.69057,139.692661,Tochōmae Sta.,Subway


In [6]:
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 580 entries, 0 to 579
Data columns (total 4 columns):
lat     580 non-null float64
long    580 non-null float64
name    580 non-null object
type    580 non-null object
dtypes: float64(2), object(2)
memory usage: 22.7+ KB


### Stacked Model, Level 0: Train Station, Subway, or Other

In [7]:
df["y0"] = 2
df.loc[df.venuecat_encoded == 0, "y0"] = 0
df.loc[df.venuecat_encoded == 1, "y0"] = 1

### Who

In [8]:
df_who_rollavg = pd.concat([df.userid, pd.get_dummies(df.y0, prefix = "poweruser")], axis = 1) \
    .groupby(["userid"]) \
    .expanding() \
    .mean() \
    .reset_index(level = "userid", drop = True) \
    .sort_index()

In [9]:
df_who_rollavg.tail()

Unnamed: 0_level_0,userid,poweruser_0,poweruser_1,poweruser_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-02-16 11:34:35,326.0,0.637795,0.015748,0.346457
2013-02-16 11:34:53,853.0,0.421569,0.029412,0.54902
2013-02-16 11:34:55,1502.0,0.300971,0.029126,0.669903
2013-02-16 11:35:17,408.0,0.397403,0.181818,0.420779
2013-02-16 11:35:29,1050.0,0.475,0.030556,0.494444


In [10]:
df_who = pd.DataFrame()

df_who["poweruser_0"] = df_who_rollavg.poweruser_0 > 0.85
df_who["poweruser_1"] = df_who_rollavg.poweruser_1 > 0.5
df_who["poweruser_2"] = df_who_rollavg.poweruser_2 > 0.85

In [11]:
df_who.head()

Unnamed: 0_level_0,poweruser_0,poweruser_1,poweruser_2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-04-04 03:17:18,False,False,True
2012-04-04 03:22:04,False,False,True
2012-04-04 04:12:07,False,False,True
2012-04-04 04:12:13,False,False,True
2012-04-04 04:18:23,False,False,True


### When

In [12]:
df_when = df.loc[:, ["day", "hour"]]

In [13]:
# Create new columns denoting weekday and weekend mornings, afternoons, and evenings:

# Monday-Friday, 3am-10am
df_when['weekday_morning'] = (df_when.day <= 4) & (df_when.hour >= 3) & (df_when.hour < 10)

# Monday-Friday, 10am-3pm
df_when['weekday_afternoon'] = (df_when.day <= 4) & (df_when.hour >= 10) & (df_when.hour < 15)

# Monday-Friday after 3pm, or Tuesday-Saturday before 3am
df_when['weekday_evening'] = ((df_when.day <= 4) & (df_when.hour >= 15)) | (((df_when.day >= 1) & (df_when.day <= 5)) & (df_when.hour < 3))

# Saturday-Sunday, 3am-2pm
df_when['weekend_morning'] = (df_when.day >= 5) & (df_when.hour >= 3) & (df_when.hour < 14)

# Saturday-Sunday, 2pm-7pm
df_when['weekend_afternoon'] = (df_when.day >= 5) & (df_when.hour >= 14) & (df_when.hour < 19)

# Saturday-Sunday after 7pm, or Sunday-Monday before 3am
df_when['weekend_evening'] = ((df_when.day >= 5) & (df_when.hour >= 19)) | ((df_when.day == 6) & (df_when.hour < 3)) | ((df_when.day == 0) & (df_when.hour < 3))

In [14]:
df_when.drop(["day", "hour"], axis = 1, inplace = True)

In [15]:
df_when.head()

Unnamed: 0_level_0,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-04-04 03:17:18,True,False,False,False,False,False
2012-04-04 03:22:04,True,False,False,False,False,False
2012-04-04 04:12:07,True,False,False,False,False,False
2012-04-04 04:12:13,True,False,False,False,False,False
2012-04-04 04:18:23,True,False,False,False,False,False


### Where

In [16]:
lat = df.lat.values
long = df.long.values

trains_lat = df_stations.loc[df_stations.type == "Train Station", "lat"].values
trains_long = df_stations.loc[df_stations.type == "Train Station", "long"].values

subways_lat = df_stations.loc[df_stations.type == "Subway", "lat"].values
subways_long = df_stations.loc[df_stations.type == "Subway", "long"].values

In [17]:
nearest_train = [min((lat - trains_lat)**2 + (long - trains_long)**2) for lat, long in zip(lat, long)]

In [18]:
nearest_subway = [min((lat - subways_lat)**2 + (long - subways_long)**2) for lat, long in zip(lat, long)]

In [19]:
# Anything better than 1.2x and 1x? 0.000898 ~ 100 meters
col_train = [(train < subway) & (train < (1.2*0.000898)**2) for train, subway in zip(nearest_train, nearest_subway)]
col_subway = [(train > subway) & (subway < (1.0*0.000898)**2) for train, subway in zip(nearest_train, nearest_subway)]

### Concatenate into One DataFrame

In [20]:
df_X0 = pd.concat([df_who, df_when], axis = 1)
df_X0["train_nearby"] = col_train
df_X0["subway_nearby"] = col_subway
df_X0["y0"] = df.y0

df_X0["y1"] = df.venuecat_encoded

In [21]:
df_X0.head(20)

Unnamed: 0_level_0,poweruser_0,poweruser_1,poweruser_2,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening,train_nearby,subway_nearby,y0,y1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2012-04-04 03:17:18,False,False,True,True,False,False,False,False,False,True,False,2,129
2012-04-04 03:22:04,False,False,True,True,False,False,False,False,False,False,False,2,2
2012-04-04 04:12:07,False,False,True,True,False,False,False,False,False,False,False,2,3
2012-04-04 04:12:13,False,False,True,True,False,False,False,False,False,False,False,2,6
2012-04-04 04:18:23,False,False,True,True,False,False,False,False,False,False,False,2,212
2012-04-04 04:20:09,False,False,True,True,False,False,False,False,False,True,False,2,60
2012-04-04 04:21:00,False,False,True,True,False,False,False,False,False,False,False,2,3
2012-04-04 04:35:36,False,False,False,True,False,False,False,False,False,False,False,0,0
2012-04-04 04:51:50,False,False,True,True,False,False,False,False,False,False,False,2,28
2012-04-04 04:51:59,False,False,True,True,False,False,False,False,False,False,True,2,2


In [22]:
df_X0.to_csv("df_X0.csv")