# Ranking Model Notebook
This notebook contains code for a baseline (non-ML) model and several ML models that rank venues using venue metadata and user behavior from user sessions.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.utils import resample

In [2]:
import warnings
# Suppress FutureWarning messages so they do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# loading data
# Both CSVs use their first column as the index. venue_id is read as `object`
# instead of letting pandas infer a dtype for it.
# NOTE(review): the paths are relative to the notebook's working directory.
session_df = pd.read_csv('../data/sessions.csv', index_col=0, dtype={'venue_id':object})
venue_df = pd.read_csv('../data/venues.csv', index_col=0, dtype={'venue_id':object})

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# drop duplicates from session data as observed in EDA
session_df.drop_duplicates(inplace=True)

# missing value imputation - real-valued columns with their mean, and the categorical
# attribute in session data with its mode
# numeric_only=True: venue_df also holds the object-typed venue_id column, which has no
# meaningful mean (pandas >= 2.0 raises a TypeError for it instead of skipping it).
venue_df.fillna(venue_df.mean(numeric_only=True), inplace=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the selected
# column: the in-place call may act on a temporary copy and leave session_df unchanged.
session_df['has_seen_venue_in_this_session'] = session_df['has_seen_venue_in_this_session'].fillna(
    session_df['has_seen_venue_in_this_session'].mode()[0]
)

In [5]:
# relavent_list = session_df.venue_id[session_df.purchased==True].unique().tolist()

## Metrics and Evaluation
This section contains the ranking metric (Mean Reciprocal Rank) and the helper functions used to evaluate the rankings produced by the baseline and ML models.

In [6]:
def calculate_reciprocal_rank_at_k(relavent_list, predicted_list, k=None):
    """
    Calculates the Reciprocal Rank given relevant items and a ranked list

    relavent_list: collection of relevant items
    predicted_list: generated ranked list of items, best first
    k: number of top-ranked items to evaluate; None (or 0) evaluates the whole list
    returns: 1 / rank of the first relevant item within the top k (ranks start at 1),
             or 0.0 when none of the top k items is relevant
    """
    if not k:
        k = len(predicted_list)

    # Only the top k items are eligible; ranks are 1-based.
    for rank, item in enumerate(predicted_list[:k], start=1):
        if item in relavent_list:
            return 1.0 / rank
    return 0.0

def calculate_mrr(relavent_list_of_list, ranked_list_of_list):
    """
    Calculates the Mean Reciprocal Rank over a collection of ranked lists

    relavent_list_of_list: list of lists of relevant items, one inner list per ranked list
    ranked_list_of_list: list of generated ranked lists of items
    returns: sum of the reciprocal ranks divided by the number of ranked lists,
             or 0.0 when that sum is zero
    """
    ranking_count = len(ranked_list_of_list)
    reciprocal_rank_sum = sum(
        calculate_reciprocal_rank_at_k(relevant_items, ranking)
        for ranking, relevant_items in zip(ranked_list_of_list, relavent_list_of_list)
    )
    # A zero sum short-circuits, which also avoids dividing by zero for empty input.
    if not reciprocal_rank_sum:
        return 0.0
    return reciprocal_rank_sum / ranking_count

In [7]:
def get_ranked_list(model, predict_rank_function, data_df, cols):
    """
    Rank the venues in data_df by model score, best first

    model: model to use for getting scores
    predict_rank_function: callable (model, float32 feature array) -> one score per row
    data_df: data for which ranks are to be produced; must contain 'venue_id' and cols
    cols: list of feature columns passed to the model
    return: list of venue_id values ordered by descending score
    """
    features = data_df[cols].values.astype(np.float32)
    # assign() builds a new frame, so the caller's data_df keeps its columns and row
    # order, and no SettingWithCopyWarning is raised when data_df is a slice of a
    # larger frame (e.g. a groupby group).
    scored = data_df.assign(prob=predict_rank_function(model, features))
    return scored.sort_values('prob', ascending=False).venue_id.values.tolist()

def evaluate_ranking(session_data, venue_data, model, predict_rank_function, cols):
    """
    Evaluate ranking for ML models: ranks the venues of every session by model
    score and prints the Mean Reciprocal Rank over all sessions

    session_data: session rows; must contain 'session_id', 'venue_id' and 'purchased'
    venue_data: venue attributes keyed by 'venue_id'
    model: trained model to be used
    predict_rank_function: prediction function to use over model scores
    cols: list of feature columns passed to the model
    """
    merged = session_data.merge(venue_data, on='venue_id', how='right')
    # The right join also emits one row per venue that appears in no session, with a
    # missing session_id. Drop those rows before filling: fillna(False) would otherwise
    # turn the missing id into the key False and score them as one extra "session".
    data_df = merged[merged['session_id'].notna()].fillna(False)

    user_session_based_ranks = []
    user_session_relavent_list = []
    for _, df in data_df.groupby('session_id'):
        # Relevant items are the venues purchased in this session.
        rel = df[df.purchased==True].venue_id.values.tolist()
        user_session_relavent_list.append(rel)
        ranked = get_ranked_list(model, predict_rank_function, df, cols)
        user_session_based_ranks.append(ranked)
    print('MRR Score: ', calculate_mrr(user_session_relavent_list, user_session_based_ranks))
    return

def evaluate_ranking_baseline(session_data, ranked_list):
    """
    Evaluate ranking for the baseline model and print the Mean Reciprocal Rank

    session_data: session rows; must contain 'session_id', 'venue_id' and 'purchased'
    ranked_list: baseline ranked list of venues (common for all sessions)
    """
    # One list of purchased venues per session, in groupby order.
    purchased_per_session = [
        group[group.purchased == True].venue_id.values.tolist()
        for _, group in session_data.groupby('session_id')
    ]
    # Every session is scored against the same global ranking.
    shared_rankings = [ranked_list] * len(purchased_per_session)
    print('MRR Score: ', calculate_mrr(purchased_per_session, shared_rankings))
    return

# Baseline Model (Non ML)

In [8]:
def get_baseline_ranking(data):
    """
    Baseline ranking method - sorting based

    data: frame with 'venue_id', 'popularity' and 'rating' columns
    return: list of venue_id values sorted by popularity, then rating, both descending
    """
    sort_keys = ['popularity', 'rating']
    return data.sort_values(by=sort_keys, ascending=False)['venue_id'].tolist()

def get_baseline_ranking_non_linear(data):
    """
    Baseline ranking method - weighted sum of nonlinear functions of attributes

    data: frame with 'venue_id', 'popularity', 'rating' and 'price_range' columns
    return: list of venue_id values sorted by descending score

    Side effect: the computed score is stored on `data` as a 'score' column.
    NOTE(review): when called on venue_df, that column is later selected by
    `venue_df.columns[1:]` and so becomes a model feature - confirm this is intended.
    """
    popularity_term = np.log(data['popularity'] + 1)
    rating_term = np.sqrt(data['rating'] + 1)
    # 1 - sqrt(...) decreases as price_range grows, so pricier venues score lower.
    price_term = 1 - np.sqrt(data['price_range'] + 1)
    data['score'] = popularity_term * 0.4 + rating_term * 0.3 + price_term * 0.3
    return data.sort_values(by='score', ascending=False)['venue_id'].tolist()

In [9]:
# Build both baseline rankings and report the MRR of each over all sessions.

# Baseline - sorting (ids are cast to str before comparison with session venue ids)
ranked = [str(venue) for venue in get_baseline_ranking(venue_df)]
print('Baseline - sorting: ')
evaluate_ranking_baseline(session_data=session_df, ranked_list=ranked)

# Baseline - nonlinear score
ranked_nl = [str(venue) for venue in get_baseline_ranking_non_linear(venue_df)]
print('\nBaseline - nonlinear: ')
evaluate_ranking_baseline(session_data=session_df, ranked_list=ranked_nl)


Baseline - sorting: 
MRR Score:  0.00927935757757337

Baseline - nonlinear: 
MRR Score:  0.011061915583722565


# ML Model

- Train a classifier on the venue features plus the session flags `is_new_user`, `is_recommended` and `is_from_order_again`
- Test the classifier and note its precision, recall and accuracy
- Get a ranked list for each user session from the model's predicted purchase probabilities
- Compute the MRR over all user sessions (baseline and ML model)

In [12]:
# utilities

def get_resampled_data(df):
    """
    Upsample the purchased rows (treated as the minority class) to match the
    number of non-purchased rows

    df: frame with a 'purchased' column
    return: new frame holding all non-purchased rows followed by purchased rows
            drawn with replacement until both groups have the same size
    """
    negatives = df[df.purchased == False]
    positives = df[df.purchased == True]

    # Draw with replacement; the fixed seed keeps the draw repeatable.
    upsampled_positives = resample(
        positives,
        replace=True,
        n_samples=len(negatives),
        random_state=99,
    )
    balanced = pd.concat([negatives, upsampled_positives])
    print('Resampled distribution', balanced.purchased.value_counts())
    return balanced

def scale_data(df, cols):
    """
    Standardize the given columns with a freshly fitted StandardScaler

    df: frame holding the columns to scale
    cols: column labels to standardize
    return: a copy of df with `cols` replaced by their standardized values;
            the input frame is left untouched
    """
    # Work on a copy so the caller's frame keeps its raw values and re-running the
    # calling cell does not re-scale data that was already scaled.
    scaled = df.copy()
    scaled[cols] = StandardScaler().fit_transform(scaled[cols])
    return scaled

In [13]:
def get_train_test_data(session_data, session_cols, venue_data, venue_cols, test_size=0.3, random_state=42):
    """
    Generate train and test sets from session rows joined with venue attributes

    session_data: session frame; must contain 'venue_id', 'purchased' and session_cols
    session_cols: list of feature cols from session data
    venue_data: venue frame; must contain 'venue_id' and venue_cols
    venue_cols: list of feature cols from venue data
    test_size: fraction of rows held out for testing (default 0.3)
    random_state: seed for the train/test split (default 42)
    return: [X_train, X_test, y_train, y_test] as float32 arrays
    """
    cols = session_cols + venue_cols

    # Upsampling is currently disabled:
    #     session_data = get_resampled_data(session_data)
    merged = session_data.merge(venue_data, on='venue_id', how='left')
    X = merged[cols].values.astype(np.float32)
    # Labels are read from the merged frame so they stay row-aligned with X even if
    # the join changes the row count (e.g. duplicate venue_id rows in venue_data).
    y = merged['purchased'].values.astype(np.float32).ravel()
    print('X Shape: ', X.shape)
    print('Y Shape: ', y.shape)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print('training input shape: ' + str(X_train.shape))
    print('training output shape: ' + str(y_train.shape))

    print('testing input shape: ' + str(X_test.shape))
    print('testing output shape: ' + str(y_test.shape))

    return [X_train, X_test, y_train, y_test]

In [14]:
# Scale every venue column after the first one (assumed to be venue_id - TODO confirm).
# NOTE(review): if the baseline cell has already run, this selection also contains the
# 'score' column that get_baseline_ranking_non_linear writes onto venue_df.
cols_to_scale = venue_df.columns[1:]
venue_df_scaled = scale_data(venue_df, cols_to_scale)

In [15]:
# generating train and test sets

# Session-level flags used as features.
session_cols = ['is_new_user', 'is_from_order_again', 'is_recommended']
# Venue features: every venue_df column after the first one.
venue_cols = venue_df.columns[1:].values.tolist()

X_train, X_test, y_train, y_test = get_train_test_data(session_data=session_df, session_cols=session_cols, venue_data=venue_df_scaled, venue_cols=venue_cols)

X Shape:  (1350658, 9)
Y Shape:  (1350658,)
training input shape: (945460, 9)
training output shape: (945460,)
testing input shape: (405198, 9)
testing output shape: (405198,)


In [16]:
def train_model(model, prediction_function, X_train, y_train, X_test, y_test):
    """
    Fit a classification model and print its precision, recall and accuracy
    on the training split and then on the test split

    model: estimator exposing fit(X, y)
    prediction_function: callable (model, X) -> predicted labels
    X_train, y_train: training features and labels
    X_test, y_test: held-out features and labels
    return: the fitted model
    """
    model.fit(X_train, y_train)

    metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('accuracy', accuracy_score),
    )
    splits = (
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    )
    for split_name, features, labels in splits:
        predicted = prediction_function(model, features)
        for metric_name, metric in metrics:
            print(split_name + ' ' + metric_name + ': ' + str(metric(labels, predicted)))

    return model

In [17]:
def get_predicted_outcome(model, data):
    """
    Prediction module returning the hard class decision for each row

    model: fitted model exposing predict_proba
    data: feature rows to classify
    return: float32 array holding, per row, the column index of the highest
            predicted probability
    """
    class_probabilities = model.predict_proba(data)
    winning_index = np.argmax(class_probabilities, axis=1)
    return winning_index.astype(np.float32)

def get_predicted_rank(model, data):
    """
    Module returning the score used to build the ranked list

    model: fitted model exposing predict_proba
    data: feature rows to score
    return: second column (index 1) of the predicted probabilities, one value per row
    """
    class_probabilities = model.predict_proba(data)
    return class_probabilities[:, 1]

## The following cells train different sklearn models, report their classification scores, and then evaluate their rankings

#### Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# Fit a logistic regression with default settings and print train/test classification metrics.
model_logistic_reg = train_model(LogisticRegression(), get_predicted_outcome, X_train, y_train, X_test, y_test)

train precision: 0.6360153256704981
train recall: 0.1641938674579624
train accuracy: 0.9970173249000487
test precision: 0.5945205479452055
test recall: 0.1594415870683321
test accuracy: 0.9968114353970158


In [20]:
# evaluate: rank the venues of each session by the model's predicted probability and print the MRR
# NOTE(review): session_df also contains the rows the model was trained on, so this
# score is not a held-out estimate.
evaluate_ranking(session_data=session_df, venue_data=venue_df_scaled, model=model_logistic_reg, predict_rank_function=get_predicted_rank, cols=session_cols+venue_cols)

MRR Score:  0.4822350373966331


#### Decision Trees

In [34]:
from sklearn import tree

In [35]:
# Fit a decision tree with default settings and print train/test classification metrics.
# NOTE(review): no random_state is set, so results may not be exactly reproducible.
model_decision_tree = train_model(tree.DecisionTreeClassifier(), get_predicted_outcome, X_train, y_train, X_test, y_test)

train precision: 0.7737737737737738
train recall: 0.25486317177711837
train accuracy: 0.9973705920927379
test precision: 0.6081081081081081
test recall: 0.19838354151359294
test accuracy: 0.9968780694870162


In [36]:
# evaluate: rank the venues of each session by the decision tree's predicted probability and print the MRR
# NOTE(review): session_df also contains the training rows, so this is not a held-out score.
evaluate_ranking(session_data=session_df, venue_data=venue_df_scaled, model=model_decision_tree, predict_rank_function=get_predicted_rank, cols=session_cols+venue_cols)

MRR Score:  0.5367197714992566


#### GBoost

In [40]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
# Keyword arguments passed to GradientBoostingClassifier in the training cell below.
params_gb = {
    'n_estimators': 300,
    'max_depth': 4,
}

In [52]:
# Fit gradient boosting with params_gb and print train/test classification metrics.
# NOTE(review): params_gb sets no random_state, so results may not be exactly reproducible.
model_gboost = train_model(GradientBoostingClassifier(**params_gb), get_predicted_outcome, X_train, y_train, X_test, y_test)

train precision: 0.5705128205128205
train recall: 0.05868776788658094
train accuracy: 0.9968385759312927
test precision: 0.5
test recall: 0.05878030859662013
test accuracy: 0.9966411482781258


In [53]:
# evaluate: rank the venues of each session by the gradient boosting model's predicted probability and print the MRR
# NOTE(review): session_df also contains the training rows, so this is not a held-out score.
evaluate_ranking(session_data=session_df, venue_data=venue_df_scaled, model=model_gboost, predict_rank_function=get_predicted_rank, cols=session_cols+venue_cols)

MRR Score:  0.38722955862096486


#### Neural Nets

In [44]:
from sklearn.neural_network import MLPClassifier

In [45]:
# Keyword arguments passed to MLPClassifier in the training cell below.
params_nn = {
    'random_state': 99,
    'max_iter': 300,
    'solver': 'sgd',
    'hidden_layer_sizes': (50, 100, 100, 50),
}

In [46]:
# Fit the multi-layer perceptron with params_nn and print train/test classification metrics.
model_nn = train_model(MLPClassifier(**params_nn), get_predicted_outcome, X_train, y_train, X_test, y_test)

train precision: 0.6646525679758308
train recall: 0.1450708869106495
train accuracy: 0.9970226133310769
test precision: 0.6385135135135135
test recall: 0.13886847905951505
test accuracy: 0.9968435184773864


In [47]:
# evaluate: rank the venues of each session by the neural net's predicted probability and print the MRR
# NOTE(review): session_df also contains the training rows, so this is not a held-out score.
evaluate_ranking(session_data=session_df, venue_data=venue_df_scaled, model=model_nn, predict_rank_function=get_predicted_rank, cols=session_cols+venue_cols)

MRR Score:  0.48771859319948674


#### Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Fit a random forest with default settings and print train/test classification metrics.
# NOTE(review): no random_state is set, so results may not be exactly reproducible.
model_rf = train_model(RandomForestClassifier(), get_predicted_outcome, X_train, y_train, X_test, y_test)

train precision: 0.7470641373080398
train recall: 0.272667326079789
train accuracy: 0.9973705920927379
test precision: 0.5966735966735967
test recall: 0.21087435709037472
test accuracy: 0.9968706656992384


In [39]:
# evaluate: rank the venues of each session by the random forest's predicted probability and print the MRR
# NOTE(review): session_df also contains the training rows, so this is not a held-out score.
evaluate_ranking(session_data=session_df, venue_data=venue_df_scaled, model=model_rf, predict_rank_function=get_predicted_rank, cols=session_cols+venue_cols)

MRR Score:  0.5423520574954974
