<a href="https://colab.research.google.com/github/laurence-lin/Retail-Store-Location-Ranking/blob/master/Final_Project_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

import gc

# Raise pandas' display limits so wide frames and long cell values are shown untruncated.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, 500)

print('Library imported.')

from google.colab import files



Library imported.


In [None]:
# Open the Colab upload dialog, then read the New York table from the working directory.
files.upload()
df_ny_data = pd.read_csv(filepath_or_buffer='df_ny_data.csv', index_col=False)
df_ny_data.head(5)

Unnamed: 0,Density,neighbor_entropy,competitiveness,other_food_neighbor,food_neighbor_rate,residence_venue,comments,ratings,total_compet_rate,weight_comment
0,41.0,2.466631,-0.268293,25,0.609756,3,318,3.3,0.878049,1049.4
1,43.0,3.150678,-0.186047,29,0.674419,2,622,3.7,0.860465,2301.4
2,26.0,2.971377,-0.230769,20,0.769231,5,1644,3.8,1.0,6247.2
3,26.0,2.971377,-0.230769,20,0.769231,5,1644,3.8,1.0,6247.2
4,15.0,2.488328,-0.2,8,0.533333,5,652,3.9,0.733333,2542.8


In [None]:
# Open the Colab upload dialog, then read the Toronto table from the working directory.
files.upload()
df_to_data = pd.read_csv(filepath_or_buffer='df_to_data.csv', index_col=False)
df_to_data.head(5)

Saving df_to_data.csv to df_to_data.csv


Unnamed: 0,Density,neighbor_entropy,competitiveness,other_food_neighbor,food_neighbor_rate,residence_venue,comments,ratings,weight_comment,total_compet_rate
0,4.0,1.386294,-0.25,1,0.25,0,2472,3.5,8652.0,0.5
1,2.0,0.693147,-0.5,0,0.0,0,1347,3.1,4175.7,0.5
2,10.0,2.302585,-0.2,4,0.4,0,1816,3.6,6537.6,0.6
3,9.0,2.197225,-0.222222,4,0.444444,0,160,2.9,464.0,0.666667
4,12.0,2.369382,-0.333333,8,0.666667,0,898,3.6,3232.8,1.0


In [None]:
# This cell imports the candidate regressors, the scaler and the ranking metric,
# and defines the preprocessing function used by the evaluation cells.
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ndcg_score

print('Candidate models loaded.')

# Helpers for ranking and random baselines.
# NOTE(review): rankdata, random and permutation are not used in the cells visible here - confirm before removing.
from scipy.stats import rankdata
import random
from numpy.random import permutation

# Module-level scaler. preprocessing() creates its own local scaler, so it does not use this instance.
scaler = MinMaxScaler()

def preprocessing(df, feature, degree, interaction_only = False):
    """Build scaled polynomial features and the scaled target from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the selected feature columns and a ``'comments'``
        column, which is used as the regression target.
    feature : str or list of str
        Column name(s) used as model inputs. A single column may be given
        either as a one-element list or as a plain string.
    degree : int
        Degree passed to ``PolynomialFeatures``.
    interaction_only : bool, default False
        Passed through to ``PolynomialFeatures``.

    Returns
    -------
    poly_x : ndarray, shape (n_samples, n_polynomial_terms)
        Min-max scaled features expanded into polynomial terms. With the
        default ``include_bias`` of ``PolynomialFeatures`` the first column
        is the constant term, so there is one column more than ``feature``
        has entries even for ``degree = 1``.
    y_data : ndarray, shape (n_samples, 1)
        Min-max scaled ``'comments'`` values.

    Notes
    -----
    Both scalers are fitted on ``df`` itself, so two different frames
    (e.g. two cities) are each scaled to their own min/max.
    """
    # Normalise the selection to a list so that indexing always yields a
    # 2-D block, also for a single column or a bare column name.
    columns = [feature] if isinstance(feature, str) else list(feature)

    df_x = df[columns].values
    df_y = df['comments'].values.reshape(-1, 1)

    # Independent scalers for inputs and target: each keeps its own fit
    # instead of one instance being refitted and overwritten.
    df_x = MinMaxScaler().fit_transform(df_x)
    df_y = MinMaxScaler().fit_transform(df_y)

    # Polynomial expansion requires 2-D input, which df_x already is.
    poly = PolynomialFeatures(degree = degree, interaction_only = interaction_only)
    poly_x = poly.fit_transform(df_x)

    return poly_x, df_y



Candidate models loaded.


In [None]:
# Combine multiple geographic features and score a ridge regression with
# NDCG@3, averaged over repeated random train/test splits.
features = ['Density', 'competitiveness', 'other_food_neighbor', 'food_neighbor_rate', 'total_compet_rate']
poly_x, y_data = preprocessing(df_ny_data, features, 20, False)

lr = Ridge(alpha = 0.01)
mean_score = 0
mean_rand_score = 0
iterations = 1000

test_size = 0.12
print('Training size:', int(poly_x.shape[0]*(1 - test_size) ))
print('Testing size:', int(len(y_data)*test_size))

# The splits are not seeded, so the means vary slightly from run to run.
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = test_size)

    lr.fit(x_train, y_train)
    predict_score = lr.predict(x_test)

    # ndcg_score takes (n_queries, n_items) arrays; each split is one query.
    # The cutoff k must be passed by keyword.
    y_true = y_test.reshape(1, -1)
    score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 3)
    mean_score += score

    # Baseline: uniformly random relevance scores over the range of the test targets.
    rand_score = np.random.uniform(y_test.min(), y_test.max(), size = (1, len(y_test)))
    rand_score = ndcg_score(y_true, rand_score, k = 3)
    mean_rand_score += rand_score

mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

Training size: 44
Testing size: 6
Mean score =  0.7894367780541077
Random rank score =  0.6959919375642509


In [None]:
# One-row summary table: mean NDCG per model for the combined features.
columns = ['LR', 'DecisionTree', 'SVR', 'Neural Net']
df_combine_perform = pd.DataFrame(index = ['NDCG@k of combine features'], columns = columns)
# Write through a single .loc[row, column] call. A chained write such as
# .iloc[0]['LR'] = ... assigns into an intermediate Series and is not
# guaranteed to reach the frame.
df_combine_perform.loc['NDCG@k of combine features', 'LR'] = mean_score
df_combine_perform

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
NDCG@k of combine features,0.789437,,,


### The combined geographic features do not significantly improve performance over the best result of an individual feature.

## We test the model on location data from another city, Toronto, to see whether the features in another city carry similar information.

In [None]:
# Test on Toronto data: fit on all New York samples, rank the Toronto venues.
lr = Ridge(alpha = 0.01)
lr.fit(poly_x, y_data) # fit on whole new york data

df_to_data['total_compet_rate'] = df_to_data['food_neighbor_rate'] - df_to_data['competitiveness']
# preprocessing() fits its scalers on the frame it is given, so the Toronto
# features are scaled to Toronto's own min/max, not New York's.
poly_x_to, y_data_to = preprocessing(df_to_data, features, 20)
y_pred = lr.predict(poly_x_to)

# ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
y_true_to = y_data_to.reshape(1, -1)
score = ndcg_score(y_true_to, y_pred.reshape(1, -1), k = 10)
print('Predict score on toronto:', score)

# Baseline: uniformly random relevance scores in [0, 1).
rand_score = np.random.uniform(0, 1, size = (1, len(y_data_to)))
rand_score = ndcg_score(y_true_to, rand_score, k = 10)
print('Random score:', rand_score)  

Predict score on toronto: 0.5699437154752105
Random score: 0.5591066913600066


### The model trained on New York data does not work well on Toronto data. It seems that venue information from one city cannot be applied directly to another city.

### Now we compare the performance of different models on the combined features.

In [None]:
# Decision tree on five combined features, scored with NDCG@3 over repeated random splits.
features = ['total_compet_rate', 'residence_venue', 'Density', 'competitiveness', 'neighbor_entropy']
poly_x, y_data = preprocessing(df_ny_data, features, 1)

regressor = DecisionTreeRegressor(max_depth = 30)

mean_score = 0
mean_rand_score = 0

print('Train size:', int(poly_x.shape[0]*(1- test_size) ))
print('Test size: ', int(len(y_data)*test_size))

iterations = 1000
for iterate in range(iterations):
    # Split with the same test_size that the printed sizes are based on.
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = test_size)

    regressor.fit(x_train, y_train)
    predict_score = regressor.predict(x_test)

    # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
    y_true = y_test.reshape(1, -1)
    score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 3)
    mean_score += score

    # Baseline: uniformly random relevance scores over the range of the test targets.
    rand_score = np.random.uniform(y_test.min(), y_test.max(), size = (1, len(y_test)))
    rand_score = ndcg_score(y_true, rand_score, k = 3)
    mean_rand_score += rand_score


mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

# Address the row by its label. The index holds the string label, so
# .loc[0, ...] would append a new row named 0 instead of filling the table.
df_combine_perform.loc['NDCG@k of combine features', 'DecisionTree'] = mean_score

# Column 0 of poly_x is the constant bias term added by PolynomialFeatures,
# so the importances of the real features start at index 1. Indexing from 0
# would shift every value onto the wrong feature name.
# The importances come from the tree fitted on the last random split only.
feature_importance = pd.DataFrame(
    [regressor.feature_importances_[1:]],
    index = ['Feature Importance'],
    columns = features
)

print('Feature importance defined by decision tree: ')
print(feature_importance.transpose().sort_values(by = 'Feature Importance', ascending = False))

Train size: 44
Test size:  6
Mean score =  0.7133253262088121
Random rank score =  0.704634887048053
Feature importance defined by decision tree: 
                  Feature Importance
neighbor_entropy             0.36491
Density                     0.327644
residence_venue            0.0877319
competitiveness            0.0671991
total_compet_rate                  0


### The combined features do not improve the score significantly. However, the feature importances reported by the decision tree show an interesting result: we expected total_compet_rate and Density to have higher importance, but neighbor_entropy and residence_venue get good scores.

### The factors that influence a retail store's location are consumer behavior, area popularity, and competitiveness.
### Popularity: Density and residence_venue
### Competitiveness: total_compet_rate and competitiveness

In [None]:
# Decision tree on one competitiveness and one popularity feature,
# scored with NDCG@3 over repeated random splits.
features = ['competitiveness', 'residence_venue']
poly_x, y_data = preprocessing(df_ny_data, features, 1)

regressor = DecisionTreeRegressor(max_depth = 30)

mean_score = 0
mean_rand_score = 0

print('Train size:', int(poly_x.shape[0]*(1- test_size) ))
print('Test size: ', int(len(y_data)*test_size))

iterations = 1000
for iterate in range(iterations):
    # Split with the same test_size that the printed sizes are based on.
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = test_size)

    regressor.fit(x_train, y_train)
    predict_score = regressor.predict(x_test)

    # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
    y_true = y_test.reshape(1, -1)
    score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 3)
    mean_score += score

    # Baseline: uniformly random relevance scores over the range of the test targets.
    rand_score = np.random.uniform(y_test.min(), y_test.max(), size = (1, len(y_test)))
    rand_score = ndcg_score(y_true, rand_score, k = 3)
    mean_rand_score += rand_score


mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

df_combine_perform['DecisionTree'] = mean_score

# Column 0 of poly_x is the constant bias term added by PolynomialFeatures,
# so the importances of the real features start at index 1. Indexing from 0
# would shift every value onto the wrong feature name.
# The importances come from the tree fitted on the last random split only.
feature_importance = pd.DataFrame(
    [regressor.feature_importances_[1:]],
    index = ['Feature Importance'],
    columns = features
)

print('Feature importance defined by decision tree: ')
print(feature_importance.transpose())

Train size: 44
Test size:  6
Mean score =  0.7676976716344307
Random rank score =  0.6976118379336727
Feature importance defined by decision tree: 
                Feature Importance
competitiveness                  0
residence_venue           0.617949


### Interestingly, among the features representing popularity, residence_venue works better than Density. This is the opposite of the result obtained with linear regression.
### As for decision tree performance, the best result is obtained from the single total_compet_rate feature. Although this feature alone might lack information about popularity, adding a popularity feature reduces the overall performance. We attribute this to the small size of the dataset.
### In addition, polynomial features seem to be of no use for the decision tree model: degree-1 polynomial features give the best result.

In [None]:
from sklearn.svm import SVR

# RBF support-vector regression on neighbor_entropy alone,
# scored with NDCG@4 over repeated random splits.
svr = SVR(kernel = 'rbf', C = 0.01)

poly_x, y_data = preprocessing(df_ny_data, ['neighbor_entropy'], 1)

mean_score = 0
mean_rand_score = 0

iterations = 1000
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

    # SVR expects a 1-D target.
    y_train = y_train.flatten()
    svr.fit(x_train, y_train)
    predict_score = svr.predict(x_test)

    # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
    y_true = y_test.reshape(1, -1)
    score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
    mean_score += score

    # Baseline: uniformly random relevance scores over the range of the test targets.
    rand_score = np.random.uniform(y_test.min(), y_test.max(), size = (1, len(y_test)))
    rand_score = ndcg_score(y_true, rand_score, k = 4)
    mean_rand_score += rand_score


mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

Mean score =  0.7348566085441478
Random rank score =  0.7542253012144078


In [None]:
# Store the current mean_score in the 'SVR' column of the summary table and display it.
# mean_score holds whatever the most recently executed evaluation cell computed.
df_combine_perform['SVR'] = mean_score
df_combine_perform

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
NDCG@k of combine features,0.789437,0.767698,0.813482,


In [None]:
# Define the neural network regressor and score it with NDCG@4 over repeated random splits.
from sklearn.neural_network import MLPRegressor

# One hidden layer of 100 units, written as a real one-element tuple.
hidden_size = (100,)

feature = ['total_compet_rate', 'Density']

# Pass every option by keyword: solver is keyword-only, so a third positional
# argument is rejected. The former module alias `nn` was rebound to this
# estimator straight away and has been dropped.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

poly_x, y_data = preprocessing(df_ny_data, feature, 10)

mean_score = 0
mean_rand_score = 0

iterations = 1000
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

    # MLPRegressor expects a 1-D target for single-output regression.
    y_train = y_train.flatten()
    nn.fit(x_train, y_train)
    predict_score = nn.predict(x_test)

    # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
    y_true = y_test.reshape(1, -1)
    score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
    mean_score += score

    # Baseline: uniformly random relevance scores in [-1, 1).
    rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
    rand_score = ndcg_score(y_true, rand_score, k = 4)
    mean_rand_score += rand_score


mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)


Mean score =  0.8176415739073725
Random rank score =  0.7380976633281107


In [None]:
# Store the current mean_score in the 'Neural Net' column of the summary table and display it.
# mean_score holds whatever the most recently executed evaluation cell computed.
df_combine_perform['Neural Net'] = mean_score
df_combine_perform

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
NDCG@k of combine features,0.789437,0.767698,0.813482,0.812825


In [None]:
# Models as rows, best mean NDCG first.
print(df_combine_perform.T.sort_values('NDCG@k of combine features', ascending = False))

             NDCG@k of combine features
SVR                            0.813482
Neural Net                     0.812825
LR                             0.789437
DecisionTree                   0.767698


In [None]:
# Empty result grid: one row per feature combination, one column per model.
index = [
    'total_compet_rate',
    'total_compet + residence_venue',
    'total_compet + Density',
    'total_compet + residence_venue + Density',
    'total_compet + residence_venue + Density + competitiveness',
]
df_performance = pd.DataFrame(columns = df_combine_perform.columns, index = index)
df_performance

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
total_compet_rate,,,,
total_compet + residence_venue,,,,
total_compet + Density,,,,
total_compet + residence_venue + Density,,,,
total_compet + residence_venue + Density + competitiveness,,,,


In [None]:
# Scratch check on a copy: write a value into the first column of one row.
df = df_performance.copy()
# A single .loc[row, column] call writes into the frame itself. The chained
# form .loc[row][0] = ... assigns into an intermediate Series instead.
df.loc['total_compet_rate', df.columns[0]] = 22
df

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
total_compet_rate,22.0,,,
total_compet + residence_venue,,,,
total_compet + Density,,,,
total_compet + residence_venue + Density,,,,
total_compet + residence_venue + Density + competitiveness,,,,


In [None]:
# Score all four models on total_compet_rate alone (NDCG@4, repeated random splits).
feature = ['total_compet_rate']

hidden_size = (100, 50)

lr = Ridge(alpha = 0.01)
tree = DecisionTreeRegressor(max_depth=30)
svr = SVR(kernel='rbf', C = 0.01)
# Pass every option by keyword: solver is keyword-only.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

models = [lr, tree, svr, nn]

# The design matrix is the same for every model here, so build it once.
poly_x, y_data = preprocessing(df_ny_data, feature, 10)
iterations = 1000

# The columns of df_performance are in the same order as `models`.
for column, model in zip(df_performance.columns, models):
    mean_score = 0
    mean_rand_score = 0

    for iterate in range(iterations):
        x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

        y_train = y_train.flatten()
        model.fit(x_train, y_train)
        predict_score = model.predict(x_test)

        # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
        y_true = y_test.reshape(1, -1)
        score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
        mean_score += score

        # Baseline: uniformly random relevance scores in [-1, 1).
        rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
        rand_score = ndcg_score(y_true, rand_score, k = 4)
        mean_rand_score += rand_score

    mean_score = mean_score / iterations
    mean_rand_score = mean_rand_score/ iterations
    print('Mean score = ', mean_score)
    print('Random rank score = ', mean_rand_score)

    # Single .loc[row, column] write; a chained .loc[row][pos] = ... may not reach the frame.
    df_performance.loc['total_compet_rate', column] = mean_score


Mean score =  0.8282827131598964
Random rank score =  0.7544181196461998
Mean score =  0.8274825439332425
Random rank score =  0.7495161336059888
Mean score =  0.8443677051929939
Random rank score =  0.7412178764721918
Mean score =  0.8402199257893767
Random rank score =  0.7421685751666126


In [None]:
# Score all four models on total_compet_rate + residence_venue (NDCG@4, repeated random splits).
feature = ['total_compet_rate', 'residence_venue']

hidden_size = (100, 50)

lr = Ridge(alpha = 0.01)
tree = DecisionTreeRegressor(max_depth=30)
svr = SVR(kernel='rbf', C = 0.01)
# Pass every option by keyword: solver is keyword-only.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

models = [lr, tree, svr, nn]
# Decision tree and SVR don't need polynomial degree
degrees = [10, 1, 1, 10]
iterations = 1000

# Bind `model` in the loop header. Without this the loop body would fit
# whatever estimator a previously run cell left in `model`, and every column
# would report that one model instead of models[i].
for i, model in enumerate(models):
    poly_x, y_data = preprocessing(df_ny_data, feature, degrees[i])

    mean_score = 0
    mean_rand_score = 0

    for iterate in range(iterations):
        x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

        y_train = y_train.flatten()
        model.fit(x_train, y_train)
        predict_score = model.predict(x_test)

        # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
        y_true = y_test.reshape(1, -1)
        score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
        mean_score += score

        # Baseline: uniformly random relevance scores in [-1, 1).
        rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
        rand_score = ndcg_score(y_true, rand_score, k = 4)
        mean_rand_score += rand_score

    mean_score = mean_score / iterations
    mean_rand_score = mean_rand_score/ iterations
    print('Mean score = ', mean_score)
    print('Random rank score = ', mean_rand_score)

    # Single .loc[row, column] write; a chained .loc[row][pos] = ... may not reach the frame.
    df_performance.loc['total_compet + residence_venue', df_performance.columns[i]] = mean_score


Mean score =  0.7866047375935289
Random rank score =  0.7398403934682085
Mean score =  0.8037788101272282
Random rank score =  0.7484029342621792
Mean score =  0.8068595761175302
Random rank score =  0.7542074242264938
Mean score =  0.7835834681646515
Random rank score =  0.743268594184471


In [None]:
# Score all four models on total_compet_rate + Density (NDCG@4, repeated random splits).
feature = ['total_compet_rate', 'Density']

hidden_size = (100, 50)

lr = Ridge(alpha = 0.01)
tree = DecisionTreeRegressor(max_depth=30)
svr = SVR(kernel='rbf', C = 0.01)
# Pass every option by keyword: solver is keyword-only.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

models = [lr, tree, svr, nn]
# Decision tree and SVR don't need polynomial degree
degrees = [10, 1, 1, 10]
iterations = 1000

# Bind `model` in the loop header. Without this the loop body would fit
# whatever estimator a previously run cell left in `model`, and every column
# would report that one model instead of models[i].
for i, model in enumerate(models):
    poly_x, y_data = preprocessing(df_ny_data, feature, degrees[i])

    mean_score = 0
    mean_rand_score = 0

    for iterate in range(iterations):
        x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

        y_train = y_train.flatten()
        model.fit(x_train, y_train)
        predict_score = model.predict(x_test)

        # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
        y_true = y_test.reshape(1, -1)
        score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
        mean_score += score

        # Baseline: uniformly random relevance scores in [-1, 1).
        rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
        rand_score = ndcg_score(y_true, rand_score, k = 4)
        mean_rand_score += rand_score

    mean_score = mean_score / iterations
    mean_rand_score = mean_rand_score/ iterations
    print('Mean score = ', mean_score)
    print('Random rank score = ', mean_rand_score)

    # Single .loc[row, column] write; a chained .loc[row][pos] = ... may not reach the frame.
    df_performance.loc['total_compet + Density', df_performance.columns[i]] = mean_score


Mean score =  0.7990040067897394
Random rank score =  0.7407407730962983
Mean score =  0.8196790537944082
Random rank score =  0.7548993204502346
Mean score =  0.8290416231124533
Random rank score =  0.7453334425068492
Mean score =  0.8068697665689095
Random rank score =  0.7364991243809696


In [None]:
# Display the result grid as filled in so far.
df_performance

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
total_compet_rate,0.828283,0.827483,0.844368,0.84022
total_compet + residence_venue,0.786605,0.803779,0.80686,0.783583
total_compet + Density,0.799004,0.819679,0.829042,0.80687
total_compet + residence_venue + Density,,,,
total_compet + residence_venue + Density + competitiveness,,,,


In [None]:
# Score all four models on total_compet_rate + Density + residence_venue
# (NDCG@4, repeated random splits).
feature = ['total_compet_rate', 'Density', 'residence_venue']

hidden_size = (100, 50)

lr = Ridge(alpha = 0.01)
tree = DecisionTreeRegressor(max_depth=30)
svr = SVR(kernel='rbf', C = 0.01)
# Pass every option by keyword: solver is keyword-only.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

models = [lr, tree, svr, nn]
# Polynomial degree per model, in the order of `models`.
# Decision tree and SVR don't need polynomial degree
degrees = [10, 1, 1, 5]
iterations = 1000

# Bind `model` in the loop header. Without this the loop body would fit
# whatever estimator a previously run cell left in `model`, and every column
# would report that one model instead of models[i].
for i, model in enumerate(models):
    poly_x, y_data = preprocessing(df_ny_data, feature, degrees[i])

    mean_score = 0
    mean_rand_score = 0

    for iterate in range(iterations):
        x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

        y_train = y_train.flatten()
        model.fit(x_train, y_train)
        predict_score = model.predict(x_test)

        # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
        y_true = y_test.reshape(1, -1)
        score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
        mean_score += score

        # Baseline: uniformly random relevance scores in [-1, 1).
        rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
        rand_score = ndcg_score(y_true, rand_score, k = 4)
        mean_rand_score += rand_score

    mean_score = mean_score / iterations
    mean_rand_score = mean_rand_score/ iterations
    print('Mean score = ', mean_score)
    print('Random rank score = ', mean_rand_score)

    # Single .loc[row, column] write; a chained .loc[row][pos] = ... may not reach the frame.
    df_performance.loc['total_compet + residence_venue + Density', df_performance.columns[i]] = mean_score


Mean score =  0.8032006906274504
Random rank score =  0.7497006025095458
Mean score =  0.8042025231178748
Random rank score =  0.7503936651988291
Mean score =  0.7986871836282052
Random rank score =  0.74464447172594
Mean score =  0.8043781516523937
Random rank score =  0.7507944888615818


In [None]:
# Display the result grid as filled in so far.
df_performance

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
total_compet_rate,0.828283,0.827483,0.844368,0.84022
total_compet + residence_venue,0.786605,0.803779,0.80686,0.783583
total_compet + Density,0.799004,0.819679,0.829042,0.80687
total_compet + residence_venue + Density,0.803201,0.804203,0.798687,0.804378
total_compet + residence_venue + Density + competitiveness,,,,


In [None]:
# Score all four models on total_compet_rate + Density + residence_venue +
# competitiveness (NDCG@4, repeated random splits).
feature = ['total_compet_rate', 'Density', 'residence_venue', 'competitiveness']

hidden_size = (100, 50)

lr = Ridge(alpha = 0.01)
tree = DecisionTreeRegressor(max_depth=30)
svr = SVR(kernel='rbf', C = 0.01)
# Pass every option by keyword: solver is keyword-only.
nn = MLPRegressor(
    hidden_layer_sizes = hidden_size,
    activation = 'relu',
    solver = 'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

models = [lr, tree, svr, nn]
# Polynomial degree per model, in the order of `models`.
# Decision tree and SVR don't need polynomial degree
degrees = [10, 1, 1, 5]
iterations = 1000

# Bind `model` in the loop header. Without this the loop body would fit
# whatever estimator a previously run cell left in `model`, and every column
# would report that one model instead of models[i].
for i, model in enumerate(models):
    poly_x, y_data = preprocessing(df_ny_data, feature, degrees[i])

    mean_score = 0
    mean_rand_score = 0

    for iterate in range(iterations):
        x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

        y_train = y_train.flatten()
        model.fit(x_train, y_train)
        predict_score = model.predict(x_test)

        # ndcg_score takes (n_queries, n_items) arrays and a keyword-only cutoff k.
        y_true = y_test.reshape(1, -1)
        score = ndcg_score(y_true, predict_score.reshape(1, -1), k = 4)
        mean_score += score

        # Baseline: uniformly random relevance scores in [-1, 1).
        rand_score = np.random.uniform(-1, 1, size = (1, len(y_test)))
        rand_score = ndcg_score(y_true, rand_score, k = 4)
        mean_rand_score += rand_score

    mean_score = mean_score / iterations
    mean_rand_score = mean_rand_score/ iterations
    print('Mean score = ', mean_score)
    print('Random rank score = ', mean_rand_score)

    # Single .loc[row, column] write; a chained .loc[row][pos] = ... may not reach the frame.
    df_performance.loc['total_compet + residence_venue + Density + competitiveness', df_performance.columns[i]] = mean_score


Mean score =  0.7866789589438442
Random rank score =  0.7463344985591893
Mean score =  0.7984666577545134
Random rank score =  0.7541347084799783
Mean score =  0.7953331946823621
Random rank score =  0.7479079721411954
Mean score =  0.7856345351144364
Random rank score =  0.7497688063884342


In [None]:
# Display the result grid as filled in so far.
df_performance

Unnamed: 0,LR,DecisionTree,SVR,Neural Net
total_compet_rate,0.828283,0.827483,0.844368,0.84022
total_compet + residence_venue,0.786605,0.803779,0.80686,0.783583
total_compet + Density,0.799004,0.819679,0.829042,0.80687
total_compet + residence_venue + Density,0.803201,0.804203,0.798687,0.804378
total_compet + residence_venue + Density + competitiveness,0.786679,0.798467,0.795333,0.785635


In [None]:
import folium

# Open the Colab upload dialog, then read the New York venue table from the working directory.
files.upload()
df = pd.read_csv(filepath_or_buffer='df_ny_feature.csv', index_col=False)
df.head(5)

Saving df_ny_feature.csv to df_ny_feature.csv


Unnamed: 0,name_x,latitude,longitude,venue_id,city,ratings,comments,Density,neighbor_entropy,competitiveness,other_food_neighbor,food_neighbor_rate,residence_venue
0,McDonald's,40.7578,-73.9854,5cf87778b399f7002cf1071a,New York,3.3,318,41.0,2.466631,-0.268293,25,0.609756,3
1,McDonald's,40.7266,-74.0386,4bf5553ecad2c928a9e49c99,Jersey City,3.7,622,43.0,3.150678,-0.186047,29,0.674419,2
2,McDonald's,40.7609,-73.9673,4f7f4df2754a7bf483d814c8,New York,3.8,1644,26.0,2.971377,-0.230769,20,0.769231,5
3,McDonald's,40.7609,-73.9673,52e9a22e498e55622108b7d9,New York,3.8,1644,26.0,2.971377,-0.230769,20,0.769231,5
4,McDonald's,40.7595,-73.9183,4b304c35f964a520b8f824e3,Astoria,3.9,652,15.0,2.488328,-0.2,8,0.533333,5
