# Pairwise Learning To Rank Methods


In [None]:
import math
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
#from tf_utils import load_dataset, random_mini_batches, predict

%matplotlib inline
np.random.seed(1)

In [None]:
#import h5py
#import numpy as np
#import tensorflow as tf
#import math

def random_mini_batches(X, Y, mini_batch_size = 62, seed = 0):
    """
    Shuffle the examples of (X, Y) and cut them into consecutive mini-batches.

    The global NumPy random generator is re-seeded with `seed` on every call, so the
    same seed always produces the same partition.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- label matrix, of shape (number of outputs, number of examples)
    mini_batch_size -- number of examples per mini-batch, integer
    seed -- value handed to np.random.seed before the permutation is drawn

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples with aligned columns;
                    the last tuple is smaller when the number of examples is not a
                    multiple of mini_batch_size
    """
    n_examples = X.shape[1]
    np.random.seed(seed)

    # One shared column permutation keeps every example paired with its label.
    order = list(np.random.permutation(n_examples))
    X_perm = X[:, order]
    Y_perm = Y[:, order].reshape((Y.shape[0], n_examples))

    # Column boundaries of the full-size batches ...
    n_full = math.floor(n_examples / mini_batch_size)
    bounds = [(i * mini_batch_size, i * mini_batch_size + mini_batch_size)
              for i in range(0, n_full)]

    # ... followed by the leftover examples, if any.
    if n_examples % mini_batch_size != 0:
        bounds.append((n_full * mini_batch_size, n_examples))

    return [(X_perm[:, lo:hi], Y_perm[:, lo:hi]) for lo, hi in bounds]

def predict(X, parameters):
    """
    Predict hard 0/1 labels for X with a trained three-layer network.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    parameters -- dictionary holding the trained "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    prediction -- array produced by evaluating round(sigmoid(Z3)) on X
    """
    params = {name: tf.convert_to_tensor(parameters[name])
              for name in ("W1", "b1", "W2", "b2", "W3", "b3")}

    # The example dimension is left open so one or many columns can be scored
    # (the previous fixed width of 1 rejected anything but a single example).
    x = tf.placeholder("float", [X.shape[0], None])

    z3 = forward_propagation_for_predict(x, params)
    p = tf.round(tf.sigmoid(z3))

    # The context manager releases the session's resources once the result is fetched.
    with tf.Session() as sess:
        prediction = sess.run(p, feed_dict = {x: X})

    return prediction

def predict_proba(X, parameters):
    """
    Return the sigmoid scores of a trained three-layer network for X.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    parameters -- dictionary holding the trained "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    prediction -- array produced by evaluating sigmoid(Z3) on X
    """
    params = {name: tf.convert_to_tensor(parameters[name])
              for name in ("W1", "b1", "W2", "b2", "W3", "b3")}

    x = tf.placeholder("float", [X.shape[0], X.shape[1]])

    z3 = forward_propagation_for_predict(x, params)
    p = tf.sigmoid(z3)

    # The context manager releases the session's resources once the result is fetched.
    with tf.Session() as sess:
        prediction = sess.run(p, feed_dict = {x: X})

    return prediction

def forward_propagation_for_predict(X, parameters):
    """
    Build the inference-time forward pass: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR.

    No dropout and no output activation are applied here; callers apply the sigmoid.

    Arguments:
    X -- input tensor/placeholder, of shape (input size, number of examples)
    parameters -- dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"

    Returns:
    Z3 -- the output of the last LINEAR unit
    """
    # Look everything up first, in the same order as the layers are applied.
    hidden_layers = [(parameters['W1'], parameters['b1']),
                     (parameters['W2'], parameters['b2'])]
    out_W, out_b = parameters['W3'], parameters['b3']

    activation = X
    for W, b in hidden_layers:
        # numpy equivalent: activation = relu(np.dot(W, activation) + b)
        activation = tf.nn.relu(tf.add(tf.matmul(W, activation), b))

    # numpy equivalent: Z3 = np.dot(W3, A2) + b3
    return tf.add(tf.matmul(out_W, activation), out_b)


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

def load_XY(file='D:/GHDataset/dataset/splits/dataSetPropuestaCSVTrainEval'):
    """
    Load a CSV dataset and return min-max scaled features and labels as arrays.

    Every column except 'label' is scaled to [0, 1] per column. Constant columns
    produce 0/0 = NaN during scaling and are therefore filled with 0.

    Arguments:
    file -- path of the CSV file to read (defaults to the path that used to be hard-coded)

    Returns:
    X -- feature array of shape (number of features, number of examples)
    y -- label array of shape (1, number of examples)
    """
    dataset = pd.read_csv(file)
    X = dataset.loc[:, dataset.columns != 'label']
    X = (X - X.min()) / (X.max() - X.min())
    X = X.fillna(0)
    y = dataset['label']

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    return X.values.transpose(), y.values.reshape((1, y.shape[0]))

def load_data_normal(file='D:/GHDataset/dataset/splits/dataSetPropuestaCSVTrainEval'):
    """
    Load a CSV dataset and return min-max scaled features plus the label column.

    Unlike the array-returning loaders, the pandas objects are returned as-is.

    Arguments:
    file -- path of the CSV file to read

    Returns:
    X -- DataFrame with every column except 'label', scaled per column to [0, 1]
         (NaNs, e.g. from constant columns, replaced by 0)
    y -- Series holding the 'label' column
    """
    dataset = pd.read_csv(file)

    feature_mask = dataset.columns != 'label'
    raw_features = dataset.loc[:, feature_mask]

    # Per-column min-max scaling; 0/0 on constant columns yields NaN, replaced below.
    scaled = (raw_features - raw_features.min()) / (raw_features.max() - raw_features.min())
    X = scaled.fillna(0)
    y = dataset['label']

    return X,y

def load_XY_shuffle(random_state=0, file='D:/GHDataset/dataset/splits/dataSetPropuestaCSVTrainEval'):
    """
    Load a CSV dataset, shuffle its rows, and return scaled features and labels as arrays.

    Arguments:
    random_state -- seed passed to DataFrame.sample for the row shuffle
    file -- path of the CSV file to read

    Returns:
    X -- feature array of shape (number of features, number of examples); every
         column except 'label' is min-max scaled and NaNs are replaced by 0
    y -- label array of shape (1, number of examples), in the same shuffled order
    """
    dataset = pd.read_csv(file)
    # frac=1 draws every row once, i.e. a full shuffle; the index is rebuilt afterwards.
    dataset = dataset.sample(random_state=random_state,frac=1).reset_index(drop=True)
    X = dataset.loc[:, dataset.columns != 'label']
    X = (X - X.min()) / (X.max() - X.min())
    X = X.fillna(0)
    y = dataset['label']

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    return X.values.transpose(), y.values.reshape((1, y.shape[0]))

def load_technology_dataset(test_size=0.25, random_state=0, file='D:/GHDataset/datasets/splits/dataSetPropuestaCSVTrainEval'):
    """
    Load a CSV dataset and return a shuffled, scaled train/test split as arrays.

    Arguments:
    test_size -- fraction (or count) of examples for the test set, passed to train_test_split
    random_state -- seed passed to train_test_split
    file -- path of the CSV file to read
            NOTE(review): this default uses '.../datasets/...' whereas the other loaders
            default to '.../dataset/...' — confirm which directory is correct.

    Returns:
    train_set_x -- training features, of shape (number of features, number of train examples)
    train_set_y -- training labels, of shape (1, number of train examples)
    test_set_x -- test features, of shape (number of features, number of test examples)
    test_set_y -- test labels, of shape (1, number of test examples)
    """
    dataset = pd.read_csv(file)
    # NOTE(review): the row shuffle always uses seed 0; the random_state argument only
    # reaches train_test_split below. Left unchanged so existing results stay reproducible.
    dataset = dataset.sample(random_state=0,frac=1).reset_index(drop=True)

    # Min-max scale every feature column; constant columns give NaN and are zero-filled.
    X = dataset.loc[:, dataset.columns != 'label']
    X = (X - X.min()) / (X.max() - X.min())
    X = X.fillna(0)
    y = dataset[['label']]

    train_set_x_orig, test_set_x_orig, train_set_y_orig, test_set_y_orig = train_test_split(X, y, test_size=test_size, random_state=random_state)

    train_set_y_orig = train_set_y_orig.values.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.values.reshape((1, test_set_y_orig.shape[0]))

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    return train_set_x_orig.values.transpose(), train_set_y_orig, test_set_x_orig.values.transpose(), test_set_y_orig

In [None]:
# Load the default dataset with a 25% test split (split seed 0).
# The four arrays are notebook globals that the following cells read.
X_train, Y_train, X_test, Y_test = load_technology_dataset(0.25,0)
print(X_train.shape,X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
# Report the example counts (second array dimension) and the shapes of the split.
for _caption, _value in (
        ("number of training examples = ", X_train.shape[1]),
        ("number of test examples = ", X_test.shape[1]),
        ("X_train shape: ", X_train.shape),
        ("Y_train shape: ", Y_train.shape),
        ("X_test shape: ", X_test.shape),
        ("Y_test shape: ", Y_test.shape)):
    print(_caption + str(_value))

**The model** is *LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID*.

In [None]:
# Graph inputs for the network: one placeholder for features, one for labels.

def create_placeholders(n_x, n_y):
    """
    Create the float32 placeholders that feed the TensorFlow graph.

    Arguments:
    n_x -- number of rows of the feature placeholder (input size)
    n_y -- number of rows of the label placeholder (output size)

    Returns:
    X -- placeholder named "X" of shape [n_x, None]
    Y -- placeholder named "Y" of shape [n_y, None]
    (the second dimension is None so any number of examples can be fed)
    """
    features = tf.placeholder(tf.float32, [n_x, None], name="X")
    labels = tf.placeholder(tf.float32, [n_y, None], name="Y")
    return features, labels

In [None]:
# Smoke test: build placeholders sized from the loaded data and show them.
# X and Y become notebook globals that the next demo cell reads.
X, Y = create_placeholders(X_train.shape[0], Y_train.shape[0])
print("X = " + str(X))
print("Y = " + str(Y))

In [None]:
# Trainable weights and biases of the 50 -> 25 -> output network.

def initialize_parameters(X,Y):
    """
    Create the variables of the three-layer network in the default graph.

    Weights use Xavier initialisation with a fixed seed, biases start at zero.

    Arguments:
    X -- object whose .shape[0] is the input size (e.g. the feature placeholder)
    Y -- object whose .shape[0] is the output size (e.g. the label placeholder)

    Returns:
    parameters -- dictionary with the variables
                  W1 [50, input size], b1 [50, 1],
                  W2 [25, 50],         b2 [25, 1],
                  W3 [output size, 25], b3 [output size, 1]
    """
    tf.set_random_seed(1)

    n_x = X.shape[0]
    n_y = Y.shape[0]

    W1 = tf.get_variable("W1", [50, n_x], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b1 = tf.get_variable("b1", [50, 1], initializer = tf.zeros_initializer())
    W2 = tf.get_variable("W2", [25, 50], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b2 = tf.get_variable("b2", [25, 1], initializer = tf.zeros_initializer())
    # W3 now has as many rows as b3. It used to be fixed at 1 row, which is identical
    # for a single output but silently broadcast one logit over all outputs otherwise.
    W3 = tf.get_variable("W3", [n_y, 25], initializer = tf.contrib.layers.xavier_initializer(seed=1))
    b3 = tf.get_variable("b3", [n_y, 1], initializer = tf.zeros_initializer())

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3}

    return parameters

In [None]:
# Smoke test: create the parameters in a fresh graph and show the variables.
tf.reset_default_graph()
with tf.Session() as sess:
    # Recreate the placeholders after the reset so they belong to the same graph as
    # the variables, as the later demo cells do (previously stale ones were reused).
    X, Y = create_placeholders(X_train.shape[0], Y_train.shape[0])
    parameters = initialize_parameters(X,Y)
    print("W1 = " + str(parameters["W1"]))
    print("b1 = " + str(parameters["b1"]))
    print("W2 = " + str(parameters["W2"]))
    print("b2 = " + str(parameters["b2"]))
    print("W3 = " + str(parameters["W3"]))
    print("b3 = " + str(parameters["b3"]))

In [None]:
def forward_propagation(X, parameters, keep_prob=1):
    """
    Build the training-time forward pass:
    LINEAR -> RELU -> DROPOUT -> LINEAR -> RELU -> DROPOUT -> LINEAR.

    Arguments:
    X -- input placeholder, of shape (input size, number of examples)
    parameters -- dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
    keep_prob -- keep probability handed to tf.nn.dropout after each hidden layer

    Returns:
    Z3 -- the output of the last LINEAR unit (no activation applied)
    """
    # Look everything up first, in the same order as the layers are applied.
    hidden_layers = [(parameters['W1'], parameters['b1']),
                     (parameters['W2'], parameters['b2'])]
    out_W, out_b = parameters['W3'], parameters['b3']

    activation = X
    for W, b in hidden_layers:
        # numpy equivalent: relu(np.dot(W, activation) + b), then dropout
        pre_activation = tf.add(tf.matmul(W, activation), b)
        activation = tf.nn.dropout(tf.nn.relu(pre_activation), keep_prob)

    # numpy equivalent: Z3 = np.dot(W3, A2) + b3
    return tf.add(tf.matmul(out_W, activation), out_b)

In [None]:
# Smoke test: build placeholders, parameters and the forward pass in a fresh graph
# (keep_prob=1) and print the resulting output tensor.
tf.reset_default_graph()

with tf.Session() as sess:
    X, Y = create_placeholders(X_train.shape[0], Y_train.shape[0])
    parameters = initialize_parameters(X,Y)
    Z3 = forward_propagation(X, parameters,1)
    print("Z3 = " + str(Z3))

In [None]:
def compute_cost(Z3, Y, parameters, beta=0.0):
    """
    Build the training cost: mean sigmoid cross-entropy plus an L2 penalty.

    Arguments:
    Z3 -- output of the last LINEAR unit, of shape (output size, number of examples)
    Y -- label placeholder, same shape as Z3
    parameters -- dictionary containing "W1", "b1", "W2", "b2", "W3", "b3"
    beta -- weight of the L2 penalty (0 disables it)

    Returns:
    cost -- scalar tensor: cross-entropy + beta * (sum of tf.nn.l2_loss over all
            weights AND biases)
    """
    # Examples go on the first axis for the cross-entropy op.
    logits = tf.transpose(Z3)
    labels = tf.transpose(Y)
    cross_entropy = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))

    # Accumulate the penalty in the fixed order W1, W2, W3, b1, b2, b3.
    penalty = tf.nn.l2_loss(parameters['W1'])
    for key in ('W2', 'W3', 'b1', 'b2', 'b3'):
        penalty = penalty + tf.nn.l2_loss(parameters[key])

    return tf.reduce_mean(cross_entropy + beta * penalty)

In [None]:
# Smoke test: build the whole graph up to the cost (no dropout, no L2 penalty)
# in a fresh graph and print the cost tensor.
tf.reset_default_graph()

with tf.Session() as sess:
    X, Y = create_placeholders(X_train.shape[0], Y_train.shape[0])
    parameters = initialize_parameters(X, Y)
    Z3 = forward_propagation(X, parameters,1)
    cost = compute_cost(Z3, Y, parameters,0)
    print("cost = " + str(cost))

In [None]:
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.00004,
          num_epochs = 500, minibatch_size = 35, print_cost = False, keep_prob=0.80, beta=0.005,
          optimizer='gradient'):
    """
    Train the three-layer network (LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID) and
    report its accuracy on the training and test data.

    Arguments:
    X_train -- training features, of shape (input size, number of training examples)
    Y_train -- training labels, of shape (output size, number of training examples)
    X_test -- test features, of shape (input size, number of test examples)
    Y_test -- test labels, of shape (output size, number of test examples)
    learning_rate -- learning rate of the optimizer
    num_epochs -- number of passes over the training data
    minibatch_size -- number of examples per mini-batch
    print_cost -- if True, print the epoch cost every 200 epochs
    keep_prob -- dropout keep probability used while training (evaluation uses 1.0)
    beta -- weight of the L2 penalty in the cost
    optimizer -- 'adam', 'gradient' or 'momentum'

    Returns:
    parameters -- dictionary of the trained parameter values (evaluated arrays)
    acc_train -- accuracy on (X_train, Y_train)
    acc_test -- accuracy on (X_test, Y_test)

    Raises:
    ValueError -- if `optimizer` is not one of the supported names
    """
    ops.reset_default_graph()                         # allows re-running without variable name clashes
    tf.set_random_seed(1)                             # to keep consistent results
    seed = 3                                          # base seed for the per-epoch shuffles
    (n_x, m) = X_train.shape                          # (n_x: input size, m: number of training examples)
    n_y = Y_train.shape[0]                            # n_y: output size
    costs = []                                        # epoch costs sampled every 50 epochs

    X, Y = create_placeholders(n_x, n_y)
    parameters = initialize_parameters(X, Y)

    # Dropout must only be active while training. Feeding keep_prob through a
    # placeholder that defaults to 1.0 lets the accuracy evaluation below run the
    # same graph without randomly dropping units.
    keep_prob_ph = tf.placeholder_with_default(1.0, shape=(), name="keep_prob")
    Z3 = forward_propagation(X, parameters, keep_prob_ph)
    cost = compute_cost(Z3, Y, parameters, beta)

    if optimizer == 'adam':
        trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif optimizer == 'gradient':
        trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    elif optimizer == 'momentum':
        # MomentumOptimizer has no default for `momentum`; 0.9 is the customary value.
        trainer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.9)
    else:
        raise ValueError("Unknown optimizer %r; expected 'adam', 'gradient' or 'momentum'" % (optimizer,))
    train_step = trainer.minimize(cost)

    # A prediction is correct when the rounded sigmoid output equals the rounded label.
    correct_prediction = tf.equal(tf.round(tf.sigmoid(Z3)), tf.round(Y))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(num_epochs):
            seed = seed + 1                       # new shuffle every epoch, still reproducible
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)

            cost_sum = 0.
            for (minibatch_X, minibatch_Y) in minibatches:
                _ , minibatch_cost = sess.run([train_step, cost],
                                              feed_dict={X: minibatch_X, Y: minibatch_Y,
                                                         keep_prob_ph: keep_prob})
                cost_sum += minibatch_cost

            # Average over the batches actually run: this counts the trailing partial
            # batch and cannot divide by zero when minibatch_size exceeds m.
            epoch_cost = cost_sum / len(minibatches) if minibatches else 0.

            if print_cost == True and epoch % 200 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if print_cost == True and epoch % 50 == 0:
                costs.append(epoch_cost)

        # Pull the trained values out of the graph so they outlive the session.
        parameters = sess.run(parameters)

        # keep_prob_ph is not fed here, so dropout is disabled for evaluation.
        acc_train = accuracy.eval({X: X_train, Y: Y_train})
        acc_test = accuracy.eval({X: X_test, Y: Y_test})

        return parameters, acc_train, acc_test

In [None]:
import time

from sklearn.model_selection import KFold, cross_val_score
import pandas as pd
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from datetime import date
from datetime import datetime
import numpy as np


In [None]:
# Feature-set variants evaluated by every experiment below.
# The order matters: trained models are stored in lists in this order and the ROC
# cell looks models and display names up by the same index.
array_datasets = ['datasets/features/pairwise_values/train-eval/dataSetPropuestaCSVOnlyFrontTrainEval',
                  'datasets/features/pairwise_values/train-eval/dataSetPropuestaCSVOnlyBackTrainEval', 
                  'datasets/features/pairwise_values/train-eval/dataSetPropuestaCSVFrontTrainEval',
                  'datasets/features/pairwise_values/train-eval/dataSetPropuestaCSVBackTrainEval', 
                  'datasets/features/pairwise_values/train-eval/dataSetPropuestaCSVTrainEval']

In [None]:
# Cross-validation of RankNet: for every dataset, train the network on each of the
# 5 folds and print the mean train/test accuracy together with the elapsed time.


for file in array_datasets:
    
    start_time = time.time()
    
    print("Dataset: " + file)
    print()

    # Rows are shuffled once here (seed 100); KFold itself then splits sequentially.
    X, Y = load_XY_shuffle(random_state=100, file=file)
    n_splits=5
    k_fold = KFold(n_splits=n_splits)

    #print(X.shape, Y.shape)

    count = 0
    #optimizers = ['gradient','adam']
    # Hyper-parameters shared by all folds.
    beta = 0.005
    optimizer = 'adam'
    keep_prob = 0.80

    acc_train_sum = 0
    acc_test_sum = 0
    # X is (features, examples), so it is transposed for KFold, which splits rows.
    for train_indices, test_indices in k_fold.split(X.transpose()):
        count = count +1;
        #print("KFold number " + str(count) )

        # Select the fold's examples by column.
        xtrain = X[:,train_indices.tolist()]
        ytrain = Y[:,train_indices.tolist()]
        xtest = X[:,test_indices.tolist()]
        ytest = Y[:,test_indices.tolist()]
        # Batch size is a tenth of the training fold (n_splits*2 == 10).
        minibatch_size=(int)(np.floor(train_indices.size/(n_splits*2)))

        parameters, acc_train, acc_test = model(xtrain, ytrain, xtest, ytest, learning_rate = 0.00004,
                                                minibatch_size=minibatch_size,keep_prob=keep_prob,
                                                beta=beta, num_epochs = 7000, optimizer=optimizer)
        acc_train_sum = acc_train_sum + acc_train
        acc_test_sum = acc_test_sum + acc_test
    # Mean accuracy over the folds.
    print("Train Accuracy KFold:", acc_train_sum/n_splits)
    print("Test Accuracy KFold:", acc_test_sum/n_splits)
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    print()

**Rank Net**

**Best parameters set found on development set:**

* learning_rate = 0.00004
* keep_prob=0.80 
* beta=4e-05  
* epoch=7000

<table>
    <tr> 
        <td>
            **Accuracy/DataSet**
        </td>
        <td>
            **Only Front**
        </td>
        <td>
            **Only Back**
        </td>
        <td>
            **Front**
        </td>
        <td>
            **Back**
        </td>
        <td>
            **ALL**
        </td>
    </tr>
    <tr> 
        <td>
            **Train Accuracy**
        </td>
        <td>
            0.9926710009574891
        </td>
        <td>
            0.9378761649131775
        </td>
        <td>
            0.9655063390731812
        </td>
        <td>
            0.8848104119300843
        </td>
        <td>
            0.8794930934906006
        </td>
    </tr>
    <tr> 
        <td>
            **Test Accuracy**
        </td>
        <td>
            0.8379211902618409
        </td>
        <td>
            0.7746713280677795
        </td>
        <td>
            0.8417721629142761
        </td>
        <td>
            0.7934044241905213
        </td>
        <td>
            0.8018433213233948
        </td>
    </tr>

</table>



In [None]:
# Train one RankNet per dataset on a single 75/25 split and keep the trained
# parameters (in array_datasets order) so they can be saved afterwards.

array_parameters = []

for file in array_datasets:
    
    start_time = time.time()
    
    print("Dataset: " + file)
    print()

    X_train, Y_train, X_test, Y_test = load_technology_dataset(0.25,0,file=file)

    beta = 0.005
    optimizer = 'adam'
    keep_prob = 0.80

    # learning_rate and minibatch_size are left at model()'s defaults here.
    parameters, acc_train, acc_test = model(X_train, Y_train, X_test, Y_test, keep_prob=keep_prob,
                                            beta=beta, num_epochs = 7000, optimizer=optimizer)
    array_parameters.append(parameters)
    
    # NOTE(review): the labels say "KFold", but these are the accuracies of a single
    # hold-out split, not a cross-validation average.
    print("Train Accuracy KFold:", acc_train)
    print("Test Accuracy KFold:", acc_test)
    
    print("--- %s seconds ---" % (time.time() - start_time))
    
    print()
    
    

In [None]:
import pickle

# Persist the RankNet parameter dictionaries (one per dataset, in array_datasets order).
filename = 'RankNet_parameters'
with open(filename, 'wb') as model_file:
    pickle.dump(array_parameters, model_file)

# some time later...

# Load the parameters back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_models = pickle.load(model_file)

# Round-trip smoke test on one example. X_train still holds the LAST dataset of the
# training loop, so it is paired with the last parameter set; the first parameter set
# belongs to a different feature file.
print(X_train[:,0:1].shape)
print(predict(X_train[:,0:1], loaded_models[-1]))

In [None]:
### GRADIENT BOOST ###
### AKA GBRank ###
# Fit a gradient-boosting classifier per dataset (80/20 split), print its train/test
# accuracy and keep the fitted searches and feature importances in array_datasets order.
np.random.seed(0)

feature_importances = []

GBRank_models = []

for file in array_datasets:
    
    start_time = time.time()
    
    print("Dataset: " + file)
    print()

    X, y = load_data_normal(file=file)

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    X = X.values


    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100,test_size=0.20)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

    scores = ['accuracy']

    # Single-point grid: GridSearchCV is used here for its 5-fold CV score and refit.
    # The grid values override the estimator's constructor values below
    # (e.g. learning_rate 0.004 replaces 0.005).
    # Previously tried: n_estimators 4500 / learning_rate 0.004,
    #                   n_estimators 3000 / learning_rate 0.005.
    param_test1 = {'n_estimators':[2375],'learning_rate':[0.004], 'max_depth':[50],
                   'min_samples_split':[50], 'min_samples_leaf':[10]}
    # NOTE(review): `iid` was deprecated and later removed from GridSearchCV; drop the
    # argument when running on a scikit-learn release that no longer accepts it.
    clf = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.005, 
                                                              min_samples_split=50,min_samples_leaf=10,
                                                              max_depth=50,max_features='log2',subsample=0.8,random_state=200),  
                                    param_grid = param_test1, scoring='accuracy',n_jobs=4,iid=False, cv=5)
    
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    
    GBRank_models.append(clf)
    
    # Accuracy of the refitted best estimator on the data it was trained on ...
    y_true, y_pred = y_train, clf.predict(X_train)
    print("Training Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))
    # ... and on the held-out 20%.
    y_true, y_pred = y_test, clf.predict(X_test)
    print("Test Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))
    print("Feature_Importances_ ")
    feature_importances.append(clf.best_estimator_.feature_importances_)
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

**GradientBoostingClassifier**

**Best parameters set found on development set:**

{'min_samples_leaf': 10, 'min_samples_split': 50, 'max_depth': 50, 'learning_rate': 0.005, 'n_estimators': 3000}

<table>
    <tr> 
        <td>
            **Accuracy/DataSet**
        </td>
        <td>
            **Only Front**
        </td>
        <td>
            **Only Back**
        </td>
        <td>
            **Front**
        </td>
        <td>
            **Back**
        </td>
        <td>
            **ALL**
        </td>
    </tr>
    <tr> 
        <td>
            **Train Accuracy**
        </td>
        <td>
            1
        </td>
        <td>
            1
        </td>
        <td>
            1
        </td>
        <td>
            1
        </td>
        <td>
            1
        </td>
    </tr>
    <tr> 
        <td>
            **Test Accuracy**
        </td>
        <td>
            0.8906250000000000
        </td>
        <td>
            0.8554216867469879
        </td>
        <td>
            0.8787878787878788
        </td>
        <td>
            0.8595041322314050
        </td>
        <td>
            0.8747697974217311
        </td>
    </tr>

</table>



In [None]:
#GBRank save model
import pickle

# Persist the fitted GBRank searches (one per dataset, in array_datasets order).
filename = 'GBRank_models'
with open(filename, 'wb') as model_file:
    pickle.dump(GBRank_models, model_file)

# some time later...

# Load the models back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_models = pickle.load(model_file)

# Round-trip smoke test. X_test still holds the LAST dataset of the training loop, so
# it is scored with the last model; the first model was fitted on a different feature file.
result = loaded_models[-1].predict(X_test)
print(X_test.shape)
print(result)

In [None]:
### Linear SVC ###
### AKA RankSVM ###
# Grid-search the C parameter of a linear SVM per dataset (default 75/25 split),
# print train/test accuracy and keep the fitted searches in array_datasets order.

from sklearn.svm import LinearSVC

RankSVM_Linear_models = []

for file in array_datasets:

    start_time = time.time()

    print("Dataset: " + file)
    print()

    X, y = load_data_normal(file=file)

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    X = X.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

    scores = ['accuracy']

    param_test1 = {'C': [5,10,20,30,40,50,70,90,150]}

    clf = GridSearchCV(estimator = LinearSVC(max_iter=10000) , param_grid = param_test1, cv=5)

    clf.fit(X_train, y_train)
    
    RankSVM_Linear_models.append(clf)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Accuracy of the refitted best estimator on the data it was trained on ...
    y_true, y_pred = y_train, clf.predict(X_train)
    print("Training Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))
    
    # ... and on the held-out split.
    y_true, y_pred = y_test, clf.predict(X_test)
    print("Test Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))                   

    print()
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

**LinearSVC**

**Best parameters set found on development set:**

{'C': 10}

<table>
    <tr> 
        <td>
            **Accuracy/DataSet**
        </td>
        <td>
            **Only Front**
        </td>
        <td>
            **Only Back**
        </td>
        <td>
            **Front**
        </td>
        <td>
            **Back**
        </td>
        <td>
            **ALL**
        </td>
    </tr>
    <tr> 
        <td>
            **Train Accuracy**
        </td>
        <td>
            0.8932291666666666
        </td>
        <td>
            0.8375838926174497
        </td>
        <td>
            0.8597972972972973
        </td>
        <td>
            0.8328741965105602
        </td>
        <td>
            0.8236017209588199
        </td>
    </tr>
    <tr> 
        <td>
            **Test Accuracy**
        </td>
        <td>
            0.8125
        </td>
        <td>
            0.7670682730923695
        </td>
        <td>
            0.8333333333333334
        </td>
        <td>
            0.8071625344352618
        </td>
        <td>
            0.7974217311233885
        </td>
    </tr>

</table>



In [None]:
#RankSVM save model
import pickle

# Persist the fitted RankSVM searches (one per dataset, in array_datasets order).
filename = 'RankSVM_Linear_models'
with open(filename, 'wb') as model_file:
    pickle.dump(RankSVM_Linear_models, model_file)

# some time later...

# Load the models back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_models = pickle.load(model_file)

# Round-trip smoke test. X_test still holds the LAST dataset of the training loop, so
# it is scored with the last model; the first model was fitted on a different feature file.
result = loaded_models[-1].predict(X_test)
print(result)

In [None]:
### AdaBoostClassifier ###
# Fit an AdaBoost classifier per dataset (default 75/25 split), print train/test
# accuracy and keep the fitted searches in array_datasets order.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# NOTE: despite the name, the models below use AdaBoost's default base estimator;
# the random-forest variant is not enabled.
RankBoost_ada_randomforest_models=[]

for file in array_datasets:

    start_time = time.time()

    print("Dataset: " + file)
    print()

    X, y = load_data_normal(file=file)

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    X = X.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)


    scores = ['accuracy']
    
    # Single-point grid; these values override the constructor values below.
    param_test1 = {'n_estimators':[3500],'learning_rate':[0.006]}
    
    # NOTE(review): `iid` was deprecated and later removed from GridSearchCV; drop the
    # argument when running on a scikit-learn release that no longer accepts it.
    clf = GridSearchCV(estimator = AdaBoostClassifier(random_state=200,n_estimators=1000,learning_rate=0.005),  
                                    param_grid = param_test1, scoring='accuracy',
                       n_jobs=4,iid=False, cv=5)

    clf.fit(X_train, y_train)
    RankBoost_ada_randomforest_models.append(clf)
    
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    # Accuracy of the refitted best estimator on the data it was trained on ...
    y_true, y_pred = y_train, clf.predict(X_train)
    print("Training Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))
    
    # ... and on the held-out split.
    y_true, y_pred = y_test, clf.predict(X_test)
    print("Test Accuracy - KFold")
    print(accuracy_score(y_true, y_pred))                   

    print()
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

In [None]:
#RankBoost ADA save model
import pickle

# Persist the fitted AdaBoost searches (one per dataset, in array_datasets order).
filename = 'RankBoost_models'
with open(filename, 'wb') as model_file:
    pickle.dump(RankBoost_ada_randomforest_models, model_file)

# some time later...

# Load the models back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_models = pickle.load(model_file)

# Round-trip smoke test. X still holds the LAST dataset of the training loop, so it is
# scored with the last model; the first model was fitted on a different feature file.
result = loaded_models[-1].predict(X)
print(result)

In [None]:
# ROC curves: for every stored model family, plot one ROC curve per dataset plus the
# mean curve with a +/- 1 standard deviation band, and save the figure as a PDF.
import pickle
import matplotlib.pyplot as plt
import numpy as np
try:
    from scipy import interp
except ImportError:
    # scipy.interp was only an alias of numpy.interp and is gone from current SciPy.
    interp = np.interp
from sklearn.metrics import roc_curve, auc
from sklearn.calibration import CalibratedClassifierCV

# Display names, index-aligned with array_datasets.
configurations = ['OnlyWeb', 
                  'OnlyNode',
                  'Web', 
                  'Node',
                  'All']

stored_models = ['GBRank_models','RankSVM_Linear_models', 'RankBoost_models', 'RankNet_parameters']

for filename in stored_models:

    # NOTE(review): the save cells write these files to the working directory, while
    # they are read from "Models/" here — presumably they were moved by hand; confirm.
    # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
    with open("Models/"+filename, 'rb') as model_file:
        loaded_models = pickle.load(model_file)

    tprs = []
    base_fpr = np.linspace(0, 1, 101)       # common FPR grid for averaging the curves

    plt.figure(figsize=(5, 5))

    for i, file in enumerate(array_datasets):

        X, y = load_data_normal(file=file)

        # DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values is the
        # equivalent accessor and works on old and new pandas alike.
        X = X.values

        # NOTE(review): only GBRank was trained on exactly this split. RankNet in
        # particular was trained on a differently shuffled split, so this test set may
        # contain rows it has seen during training — confirm before citing the AUC.
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100,test_size=0.20)

        if(filename == 'RankSVM_Linear_models'):
            # LinearSVC has no predict_proba, so probabilities come from a calibration wrapper.
            clf = CalibratedClassifierCV(loaded_models[i])
            clf.fit(X_train, y_train)
            y_score = clf.predict_proba(X_test)
            fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])

        elif(filename == 'RankNet_parameters'):
            # The network expects (features, examples); flatten the (1, examples) scores.
            y_score = predict_proba(X_test.T, loaded_models[i])
            y_score = np.ravel(y_score)
            fpr, tpr, _ = roc_curve(y_test, y_score)
            
        else:
            y_score = loaded_models[i].predict_proba(X_test)
            fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
             
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, lw=1, alpha=0.5, label='ROC %s (AUC = %0.2f)' % (configurations[i], roc_auc))
        # Resample the curve on the common grid so the curves can be averaged.
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0.0
        tprs.append(tpr)


    tprs = np.array(tprs)
    mean_tprs = tprs.mean(axis=0)
    std = tprs.std(axis=0)

    # Keep the band inside the valid rate range [0, 1].
    tprs_upper = np.minimum(mean_tprs + std, 1)
    tprs_lower = np.maximum(mean_tprs - std, 0)


    plt.plot(base_fpr, mean_tprs, 'b')
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3, label=r'$\pm$ 1 std. dev.')

    plt.plot([0, 1], [0, 1],'r--', label='Random')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # gca() returns the axes that was just drawn on; on current Matplotlib plt.axes()
    # would create a new, empty axes on top of the plot instead.
    plt.gca().set_aspect('equal', 'datalim')
    plt.legend(loc="lower right")
    plt.title('ROC-AUC in pairwise ranking ' + filename.split('_')[0])
    plt.savefig('ROC' + filename.split('_')[0] + '.pdf' , bbox_inches='tight', pad_inches=0.1)
    plt.show()