# Predicting Check-ins of Foursquare Users in Tokyo

## 5 - Feature Engineering II

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib import rcParams

from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

import itertools

%matplotlib inline

In [2]:
# Setup default matplotlib defaults for later use
def defaultsettings():
    rcParams.update(matplotlib.rcParamsDefault)
    sns.set_style()
    sns.set_context()
    %matplotlib inline

In [3]:
# Seaborn settings for bar plots
def postersettings():
    """Apply seaborn's "whitegrid" style at "poster" context.

    matplotlib's rcParams are reset to their defaults before the style and
    context are applied.
    """
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    sns.set_style(style = "whitegrid")
    sns.set_context(context = "poster")

In [4]:
# Load wrangled data.
# The first CSV column is parsed as dates and used as the index.
# `date_parser` and `infer_datetime_format` are intentionally not passed:
# both are deprecated since pandas 2.0 (they emit FutureWarning) and
# `parse_dates` alone already sends the column through pandas' datetime parser.
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0])
# Station table: first CSV column becomes the index, no date parsing.
df_stations = pd.read_csv('stations.csv', index_col = 0)

In [5]:
# Load the saved level-0 feature matrix; the first CSV column is parsed as
# dates and used as the index (the date-string slicing below relies on that).
# The deprecated `date_parser` / `infer_datetime_format` arguments (pandas 2.0
# FutureWarning) are dropped: `parse_dates` alone performs the parsing.
df_X0 = pd.read_csv('df_X0.csv',
                    index_col = 0,
                    parse_dates = [0])

In [6]:
# Run Logistic Regression

# Date-based split on the index: July 2012 through January 2013 is used for
# fitting, February 2013 onwards for evaluation.
# ytrain0 / ytest0 keep every column of their period; the label is read from
# them as the `y0` attribute.
ytrain0 = df_X0.loc['2012-July':'2013-Jan']
ytest0 = df_X0.loc['2013-Feb':]

# Feature matrices are the same rows with both label columns removed.
Xtrain0 = ytrain0.drop(['y0', 'y1'], axis = 1)
Xtest0 = ytest0.drop(['y0', 'y1'], axis = 1)

# fit() returns the estimator itself, so the fitted model is bound directly.
clf_LR0 = LogisticRegression().fit(Xtrain0, ytrain0.y0)
ypred0 = clf_LR0.predict(Xtest0)
print("Accuracy: {}%".format(round(accuracy_score(ytest0.y0, ypred0) * 100, 2)))

Accuracy: 83.01%


In [7]:
# Restrict both the predictions and the true labels to the test rows whose
# level-0 prediction is below 2.
calc_accuracy_ypred0 = [pred for pred in ypred0 if pred < 2]
calc_accuracy_ytest0 = list(ytest0.y0.values[ypred0 < 2])

### Stacked Model, Level 1: All Other Venue Categories

In [8]:
# One dummy column per venue category ("rollavg_<code>"), then, per user, the
# expanding (cumulative) mean of each dummy: the share of that user's rows so
# far that fall in each category. The userid index level added by the groupby
# is dropped again and the rows are re-sorted by the original index.
# NOTE(review): an expanding mean includes the current row, and `y1` is later
# taken from the same `venuecat_encoded` column -- confirm whether these
# features should be shifted by one row per user to avoid target leakage.
df_who1 = (
    pd.concat(
        [df["userid"], pd.get_dummies(df["venuecat_encoded"], prefix = "rollavg")],
        axis = 1,
    )
    .groupby(["userid"])
    .expanding()
    .mean()
    .reset_index(level = "userid", drop = True)
    .sort_index()
)

In [9]:
# True venue category for every row (assigned by index alignment with df).
df_who1["y1"] = df.venuecat_encoded
# Hybrid label column: the true category for the leading rows, followed by the
# level-0 predictions for the final len(ypred0) rows.
# The stop index is computed explicitly instead of slicing with
# `[: -len(ypred0)]`, because `[: -0]` is `[:0]` and would silently drop every
# true label when ypred0 is empty, producing a column-length mismatch.
# NOTE(review): this is positional, not index-aligned -- it assumes df_who1 has
# the same row order as df and that the last len(ypred0) rows of df are exactly
# the rows ypred0 was predicted for; confirm against how df_X0 was built.
df_who1["y1_and_ypred0"] = (
    list(df.venuecat_encoded.values[: max(len(df) - len(ypred0), 0)]) + list(ypred0)
)

In [10]:
# Keep only the rows whose hybrid label (true category or level-0 prediction)
# is greater than 1, then discard the helper column used for the selection.
df_X1 = df_who1[df_who1["y1_and_ypred0"] > 1].drop("y1_and_ypred0", axis = 1)

In [11]:
# Persist the level-1 feature matrix (index included) to the working directory.
df_X1.to_csv(path_or_buf = "df_X1.csv")