# Predicting Check-ins of Foursquare Users in Tokyo

## 6 - Feature Engineering II

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load wrangled data.
# The first CSV column becomes the index and is parsed as datetimes.
# `date_parser` / `infer_datetime_format` are intentionally not passed:
# both are deprecated since pandas 2.0, and `parse_dates` alone already
# applies pandas' default datetime parsing to the index column.
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0])
# Station table, indexed by its first CSV column.
df_stations = pd.read_csv('stations.csv', index_col = 0)

In [3]:
# Load X for Layer 0.
# The first CSV column becomes the index and is parsed as datetimes.
# `date_parser` / `infer_datetime_format` are intentionally not passed:
# both are deprecated since pandas 2.0, and `parse_dates` alone already
# applies pandas' default datetime parsing to the index column.
df_X0 = pd.read_csv('df_X0.csv',
                    index_col = 0,
                    parse_dates = [0])

In [4]:
# Preview the first rows of the Layer 0 feature matrix.
df_X0.head()

Unnamed: 0_level_0,poweruser_0,poweruser_1,poweruser_2,weekday_morning,weekday_afternoon,weekday_evening,weekend_morning,weekend_afternoon,weekend_evening,train_nearby,subway_nearby,y0,y1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2012-04-04 03:17:18,False,False,True,True,False,False,False,False,False,True,False,2,129
2012-04-04 03:22:04,False,False,True,True,False,False,False,False,False,False,False,2,2
2012-04-04 04:12:07,False,False,True,True,False,False,False,False,False,False,False,2,3
2012-04-04 04:12:13,False,False,True,True,False,False,False,False,False,False,False,2,6
2012-04-04 04:18:23,False,False,True,True,False,False,False,False,False,False,False,2,212


In [5]:
# Load predictions from Layer 0.
# No index column is specified, so the frame gets a default integer index.
df_layer0_results = pd.read_csv("layer0_results.csv")

In [6]:
# Layer 0 predictions as a plain array (one entry per predicted row).
ypred0 = df_layer0_results["ypred0"].values

### Stacked Model, Layer 1: All Other Venue Categories

In [7]:
# Per-user expanding mean of the one-hot venue category: for each check-in,
# the share of that user's check-ins so far that fell into each category.
# The (userid, timestamp) index produced by the grouped expanding window is
# reduced back to the original index and sorted.
# NOTE(review): expanding() includes the current row, so each rollavg_* value
# already contains the category of the very check-in it describes (the label
# stored as y1 later on). Consider excluding the current row if these columns
# are meant to be predictive features — confirm intent.
df_who1 = (
    pd.concat(
        [
            df.userid,
            pd.get_dummies(df.venuecat_encoded, prefix = "rollavg"),
        ],
        axis = 1,
    )
    .groupby(["userid"])
    .expanding()
    .mean()
    .reset_index(level = "userid", drop = True)
    .sort_index()
)

In [8]:
# Inspect the last rows of the per-user expanding averages.
df_who1.tail()

Unnamed: 0_level_0,userid,rollavg_0,rollavg_1,rollavg_2,rollavg_3,rollavg_4,rollavg_5,rollavg_6,rollavg_7,rollavg_8,...,rollavg_237,rollavg_238,rollavg_239,rollavg_240,rollavg_241,rollavg_242,rollavg_243,rollavg_244,rollavg_245,rollavg_246
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-02-16 11:34:35,326.0,0.637795,0.015748,0.011811,0.0,0.019685,0.03937,0.023622,0.035433,0.003937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-02-16 11:34:53,853.0,0.421569,0.029412,0.019608,0.0,0.029412,0.029412,0.0,0.0,0.058824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-02-16 11:34:55,1502.0,0.300971,0.029126,0.019417,0.0,0.019417,0.0,0.0,0.009709,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-02-16 11:35:17,408.0,0.397403,0.181818,0.028571,0.031169,0.023377,0.015584,0.0,0.01039,0.01039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-02-16 11:35:29,1050.0,0.475,0.030556,0.025,0.008333,0.008333,0.0,0.022222,0.005556,0.005556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Venue category of every check-in, stored as y1 (aligned on the index).
df_who1["y1"] = df.venuecat_encoded

# Actual categories for the leading rows, Layer 0 predictions for the trailing
# len(ypred0) rows. The split point is computed explicitly because
# `values[: -len(ypred0)]` would yield an empty array when ypred0 is empty
# (-0 == 0), silently dropping every actual label.
# NOTE(review): this is a positional assignment, so it assumes df_who1 rows
# are in the same order as df — confirm.
df_who1["y1_and_ypred0"] = list(df.venuecat_encoded.values[ : len(df) - len(ypred0)]) + list(ypred0)

# Carry over the train_nearby / subway_nearby columns from the Layer 0 matrix.
df_who1["train_nearby"] = df_X0.train_nearby
df_who1["subway_nearby"] = df_X0.subway_nearby

In [10]:
# Build the Layer 1 matrix: keep only rows whose combined label
# (actual category / Layer 0 prediction) is greater than 1, and only the
# columns from rollavg_2 onwards. The helper column used for the row filter
# is dropped again; y1, train_nearby and subway_nearby remain.
df_X1 = (
    df_who1.loc[df_who1["y1_and_ypred0"] > 1, "rollavg_2" : ]
    .drop(columns = ["y1_and_ypred0"])
)

In [11]:
# Preview the first rows of the Layer 1 matrix.
df_X1.head()

Unnamed: 0_level_0,rollavg_2,rollavg_3,rollavg_4,rollavg_5,rollavg_6,rollavg_7,rollavg_8,rollavg_9,rollavg_10,rollavg_11,...,rollavg_240,rollavg_241,rollavg_242,rollavg_243,rollavg_244,rollavg_245,rollavg_246,y1,train_nearby,subway_nearby
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-04 03:17:18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,129,True,False
2012-04-04 03:22:04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,False,False
2012-04-04 04:12:07,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,False,False
2012-04-04 04:12:13,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,False,False
2012-04-04 04:18:23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212,False,False


In [12]:
# Inspect the last rows of the Layer 1 matrix.
df_X1.tail()

Unnamed: 0_level_0,rollavg_2,rollavg_3,rollavg_4,rollavg_5,rollavg_6,rollavg_7,rollavg_8,rollavg_9,rollavg_10,rollavg_11,...,rollavg_240,rollavg_241,rollavg_242,rollavg_243,rollavg_244,rollavg_245,rollavg_246,y1,train_nearby,subway_nearby
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-02-16 11:33:18,0.007812,0.039062,0.023438,0.0,0.0,0.023438,0.007812,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,False,False
2013-02-16 11:33:52,0.022814,0.0,0.030418,0.0,0.0,0.0,0.019011,0.007605,0.019011,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,False,False
2013-02-16 11:34:35,0.011811,0.0,0.019685,0.03937,0.023622,0.035433,0.003937,0.003937,0.003937,0.003937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42,False,False
2013-02-16 11:34:55,0.019417,0.0,0.019417,0.0,0.0,0.009709,0.0,0.0,0.009709,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121,False,False
2013-02-16 11:35:17,0.028571,0.031169,0.023377,0.015584,0.0,0.01039,0.01039,0.0,0.01039,0.005195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,False,False


In [13]:
# Persist the Layer 1 matrix (the index is written as the first CSV column).
df_X1.to_csv("df_X1.csv")