# Predicting Check-ins of Foursquare Users in Tokyo

## 6 - Machine Learning II

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib import rcParams

from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

import itertools

%matplotlib inline

In [2]:
# Restore the default matplotlib look for later use
def defaultsettings():
    """Reset plot styling to matplotlib's defaults and re-enable inline figures.

    Overwrites the global ``rcParams`` with ``matplotlib.rcParamsDefault``,
    invokes seaborn's style/context setters without naming a theme, and then
    re-runs the ``%matplotlib inline`` magic.
    """
    rcParams.update(matplotlib.rcParamsDefault)
    # No style or context name is passed, so no named seaborn theme is
    # selected here.
    # NOTE(review): confirm these two calls are still needed after the reset.
    sns.set_style(style = None)
    sns.set_context(context = None)
    # Programmatic spelling of the `%matplotlib inline` line magic.
    get_ipython().run_line_magic("matplotlib", "inline")

In [3]:
# Seaborn settings used for bar plots
def postersettings():
    """Switch plotting to seaborn's "whitegrid" style at "poster" context.

    The matplotlib parameters are first reset to ``rcParamsDefault`` so the
    seaborn settings are applied on top of a clean baseline.
    """
    rcParams.update(matplotlib.rcParamsDefault)
    sns.set_style(style = "whitegrid")
    sns.set_context(context = "poster")

In [4]:
# Load wrangled data. The first CSV column is parsed as dates and used as
# the index.
# parse_dates already converts that column with pandas' own datetime parser,
# so the redundant date_parser / infer_datetime_format arguments (both
# deprecated since pandas 2.0) are not passed.
df = pd.read_csv('df_afterwrangling.csv',
                 index_col = 0,
                 parse_dates = [0])
# Station table, indexed by its first column.
df_stations = pd.read_csv('stations.csv', index_col = 0)

In [5]:
# Feature table for the first-stage model; the first CSV column is parsed as
# dates and used as the index.
# parse_dates already handles the conversion, so the redundant date_parser /
# infer_datetime_format arguments (both deprecated since pandas 2.0) are not
# passed.
df_X0 = pd.read_csv('df_X0.csv',
                    index_col = 0,
                    parse_dates = [0])

In [6]:
# Feature table for the second-stage model; the first CSV column is parsed as
# dates and used as the index.
# parse_dates already handles the conversion, so the redundant date_parser /
# infer_datetime_format arguments (both deprecated since pandas 2.0) are not
# passed.
df_X1 = pd.read_csv('df_X1.csv',
                    index_col = 0,
                    parse_dates = [0])

In [7]:
# Run Logistic Regression
#
# Chronological split on the datetime index: July 2012 through January 2013
# is used for fitting, February 2013 onwards for evaluation.
# ytrain0 / ytest0 deliberately keep every column of the window; the target
# is read from them as `.y0`.
ytrain0 = df_X0.loc['2012-July':'2013-Jan']
ytest0 = df_X0.loc['2013-Feb':]

# The feature matrices are the same windows with both label columns removed.
Xtrain0 = ytrain0.drop(['y0', 'y1'], axis = 1)
Xtest0 = ytest0.drop(['y0', 'y1'], axis = 1)

clf_LR0 = LogisticRegression()
clf_LR0.fit(Xtrain0, ytrain0.y0)
ypred0 = clf_LR0.predict(Xtest0)

print("Accuracy: {}%".format(str(round(accuracy_score(ytest0.y0, ypred0) * 100, 2))))

Accuracy: 83.01%


In [8]:
# Keep only the test rows whose first-stage prediction is below 2, as two
# parallel lists (predictions, true y0 labels). A later cell concatenates
# these with the second-stage predictions to score the combined result.
# NOTE(review): presumably rows predicted as class 2 are the ones handed to
# the second-stage model - confirm.
calc_accuracy_ypred0, calc_accuracy_ytest0 = (
    list(labels[ypred0 < 2]) for labels in (ypred0, ytest0.y0.values)
)

In [9]:
# Second-stage model: project the df_X1 features with PCA, cluster the
# projection with KMeans, then fit one logistic regression per cluster and
# score the pooled predictions together with the first-stage results.
#
# The features are passed as a plain ndarray: np.matrix is discouraged by
# NumPy and is rejected as estimator input by recent scikit-learn releases.
x_cols = df_X1.drop(["userid", "y1"], axis = 1).values

# One column per PCA setting, one row per KMeans setting.
df_accuracy = pd.DataFrame()

for ii in [110]:

    # Seeded like KMeans below so that a re-run reproduces the projection
    # even when PCA picks its randomized SVD solver.
    # NOTE(review): PCA and KMeans are fitted on every row of df_X1, including
    # the rows from the evaluation period - confirm this is intended.
    pca = PCA(n_components = ii, random_state = 1)
    pca_features = pca.fit_transform(x_cols)

    accuracy = []

    for jj in [5]:

        clusterer = KMeans(n_clusters = jj, random_state = 1)
        cluster_labels = clusterer.fit_predict(pca_features)

        # Re-attach the datetime index so the rows can be split by date.
        df_pca = pd.DataFrame(pca_features, index = df_X1.index)
        df_pca["cluster"] = cluster_labels
        df_pca["y1"] = df_X1.y1

        # Predictions / true labels pooled over all clusters.
        ypred1_jj, ytest1_jj = [], []

        for kk in range(jj):

            # Select the cluster once, then split it chronologically.
            df_cluster = df_pca.loc[df_pca.cluster == kk, :]
            df_cluster_train = df_cluster['2012-July':'2013-Jan']
            df_cluster_test = df_cluster['2013-Feb':]

            Xtrain1 = df_cluster_train.drop(["cluster", "y1"], axis = 1)
            ytrain1 = df_cluster_train.y1

            Xtest1 = df_cluster_test.drop(["cluster", "y1"], axis = 1)
            ytest1 = df_cluster_test.y1

            # A cluster without rows in the evaluation period contributes
            # nothing to the score, and predict() cannot be called on an
            # empty feature matrix.
            if len(Xtest1) == 0:
                continue

            clf_LR1 = LogisticRegression()
            clf_LR1.fit(Xtrain1, ytrain1)
            ypred1_jj += list(clf_LR1.predict(Xtest1))
            ytest1_jj += list(ytest1)

        # Combined accuracy over the first-stage rows kept in
        # calc_accuracy_* and the second-stage rows gathered above.
        accuracy_jj = accuracy_score(calc_accuracy_ytest0 + ytest1_jj,
                                     calc_accuracy_ypred0 + ypred1_jj) * 100

        accuracy.append(accuracy_jj)

        print("PCA Components: " + str(ii) + ", KMeans Clusters: " + str(jj))
        print('Accuracy: ' + str(round(accuracy_jj, 2)) + "%")
        print("========================================")

    df_accuracy["PCA_components_" + str(ii)] = accuracy

PCA Components: 110, KMeans Clusters: 5
Accuracy: 46.03%
