<a href="https://colab.research.google.com/github/laurence-lin/Coursera_Capstone/blob/master/Final_Project_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

import gc

print('Library imported.')

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)

from google.colab import files



Library imported.


In [7]:
files.upload()
df_ny_data = pd.read_csv('df_ny_data.csv', index_col = False)
df_ny_data.head()

Saving df_ny_data.csv to df_ny_data.csv


Unnamed: 0,Density,neighbor_entropy,competitiveness,other_food_neighbor,food_neighbor_rate,residence_venue,comments,ratings,total_compet_rate,weight_comment
0,41.0,2.466631,-0.268293,25,0.609756,3,318,3.3,0.878049,1049.4
1,43.0,3.150678,-0.186047,29,0.674419,2,622,3.7,0.860465,2301.4
2,26.0,2.971377,-0.230769,20,0.769231,5,1644,3.8,1.0,6247.2
3,26.0,2.971377,-0.230769,20,0.769231,5,1644,3.8,1.0,6247.2
4,15.0,2.488328,-0.2,8,0.533333,5,652,3.9,0.733333,2542.8


In [9]:
# In this cell, define the models and preprocessing function
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import ndcg_score

print('Candidate models loaded.')

#Test the efficiency of each feature, assess the performance
# Define metrics
from scipy.stats import rankdata
import random
from numpy.random import permutation

scaler = MinMaxScaler()

def preprocessing(df, feature, degree, interaction_only = False):
    # create training features, scaling, and target value
    # df: dataframe containing all features and target
    # degree: degree for polynomial features
    # feature: list containing selected feature for training
    # return: poly_feature, y_true (2D shape)

    df_y = df['comments'].values.reshape(-1, 1)

    if len(feature) == 1:
       df_x = df[feature].values.reshape(-1, 1)
    else:
       df_x = df[feature]
    
    #scaling
    scaler = MinMaxScaler()
    df_x = scaler.fit_transform(df_x)
    df_y = scaler.fit_transform(df_y)

    # Create polynomial feature
    x_data = df_x
    poly = PolynomialFeatures(degree = degree, interaction_only = interaction_only) # require 2D input
    poly_x = poly.fit_transform(x_data)
    y_data = df_y.reshape(-1, 1)

    return poly_x, y_data



Candidate models loaded.


In [10]:
# Combine multiple geographic features
features = ['Density', 'competitiveness', 'other_food_neighbor', 'food_neighbor_rate', 'total_compet_rate']
poly_x, y_data = preprocessing(df_ny_data, features, 20, False)

lr = Ridge(alpha = 0.01)
mean_score = 0
mean_rand_score = 0
iterations = 1000

test_size = 0.12
print('Training size:', int(poly_x.shape[0]*(1 - test_size) ))
print('Testing size:', int(len(y_data)*test_size))

for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = test_size)

    lr.fit(x_train, y_train)
    predict_score = lr.predict(x_test)
    
    score = ndcg_score([list(y_test.flatten())], [list(predict_score.flatten())], 3)
    mean_score += score

    rand_score = [np.random.uniform(y_test.min(), y_test.max()) for i in range(len(y_test))]
    rand_score = ndcg_score([list(y_test.flatten())], [rand_score], 3)
    mean_rand_score += rand_score  
    
mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

Training size: 44
Testing size: 6
Mean score =  0.7960497095235736
Random rank score =  0.7009245684213908


In [11]:
columns = ['LR', 'DecirionTree', 'SVR', 'Neural Net']
df_combine_perform = pd.DataFrame(index = ['NDCG@k of combine features'], columns = columns)
df_combine_perform.loc[0, 'LR'] = mean_score

###The combined geographic features do not increase the performance significantly than the best result of individual feature.

##We test the model on location data of another city: Toronto, to see if data features in another city represents similar information.

In [None]:
# test on toronto data
lr = Ridge(alpha = 0.01)
lr.fit(poly_x, y_data) # fit on whole new york data

df_to_data['total_compet_rate'] = df_to_data['food_neighbor_rate'] - df_to_data['competitiveness']
poly_x_to, y_data_to = preprocessing(df_to_data, features, 20)
y_pred = lr.predict(poly_x_to)
score = ndcg_score([list(y_data_to.flatten())], [list(y_pred.flatten())], 10)
print('Predict score on toronto:', score)

rand_score = [np.random.uniform(0, 1) for i in range(len(y_data_to))]
rand_score = ndcg_score([list(y_data_to.flatten())], [rand_score], 10)
print('Random score:', rand_score)  

###Model in New york data don't work well on Toronto data. It seems venue data information within one city couldn't be applied on another city directly.

###Now, we compare the performance of different models on the combined features

In [None]:
poly_x, y_data = preprocessing(df_ny_data, features, 1)

regressor = DecisionTreeRegressor(max_depth = 30)

mean_score = 0
mean_rand_score = 0

print('Train size:', int(poly_x.shape[0]*(1- test_size) ))
print('Test size: ', int(len(y_data)*test_size))

iterations = 1000
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

    regressor.fit(x_train, y_train)
    predict_score = regressor.predict(x_test)
    
    score = ndcg_score([list(y_test.flatten())], [list(predict_score.flatten())], 3)
    mean_score += score

    rand_score = [np.random.uniform(y_test.min(), y_test.max()) for i in range(len(y_test))]
    rand_score = ndcg_score([list(y_test.flatten())], [rand_score], 3)
    mean_rand_score += rand_score
    
    
mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

df_combine_perform.loc[0, 'DecisionTree'] = mean_score

In [None]:
feature_importance = pd.DataFrame(index = ['Feature Importance'], columns = features)

for i in range(len(features)):
  feature_importance.iloc[0, i] = regressor.feature_importances_[i]

print(feature_importance.transpose())

In [None]:
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf', degree = 10)

poly_x, y_data = preprocessing(df_ny_data, ['Density', 'competitiveness'], 10)

mean_score = 0
mean_rand_score = 0

iterations = 1000
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

    y_train = y_train.flatten()    
    svr.fit(x_train, y_train)
    predict_score = svr.predict(x_test)
    
    score = ndcg_score([list(y_test.flatten())], [list(predict_score.flatten())], 4)
    mean_score += score

    rand_score = [np.random.uniform(y_test.min(), y_test.max()) for i in range(len(y_test))]
    rand_score = ndcg_score([list(y_test.flatten())], [rand_score], 4)
    mean_rand_score += rand_score
    

mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)

In [None]:

#define nn
from sklearn.neural_network import MLPRegressor
import sklearn.neural_network as nn

hidden_size = (100, 100, 50)

feature = ['Density', 'competitiveness', 'total_compet_rate']

nn = MLPRegressor(
    hidden_size,
    'relu',
    'adam',
    learning_rate_init = 0.01,
    max_iter = 1000
)

poly_x, y_data = preprocessing(df_ny_data, feature, 10)

mean_score = 0
mean_rand_score = 0

iterations = 1000
for iterate in range(iterations):
    x_train, x_test, y_train, y_test = train_test_split(poly_x, y_data, test_size = 0.12)

    y_train = y_train.flatten()    
    nn.fit(x_train, y_train)
    predict_score = nn.predict(x_test)
    
    score = ndcg_score([list(y_test.flatten())], [list(predict_score.flatten())], 4)
    mean_score += score

    rand_score = [np.random.uniform(-1, 1) for i in range(len(y_test))]
    rand_score = ndcg_score([list(y_test.flatten())], [rand_score], 4)
    mean_rand_score += rand_score
    

mean_score = mean_score / iterations
mean_rand_score = mean_rand_score/ iterations
print('Mean score = ', mean_score)
print('Random rank score = ', mean_rand_score)
