In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("feature_selection") \
    .getOrCreate()

sc=spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/27 11:06:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/27 11:06:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/27 11:06:49 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/27 11:06:49 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


# Feature selection

As we have many features (especially after the one-hot encoding of many categorical features), feature selection should be performed in order to reduce the computational load in further steps.

Two feature selection algorithms will be used

1) A Feature Ranking algorithm using MRMR (Maximum relevance - Minimum redundency)
2) A foward-selection algorithm using cross-validation. With a machine learning model, features will iteratively added as they increase the performance of the model.

In [2]:
train_sessions_engineered = spark.read.csv('../Data/session_engineered_features.csv',header=True,
                                          inferSchema=True)

train_purchases = spark.read.load('../Data/train_purchases.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

                                                                                

In [3]:
train_sessions_engineered.printSchema()

[1, 2, 4, 10, 13, 17, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 35, 36, 37, 42]

root
 |-- session_id: double (nullable = true)
 |-- session_time: double (nullable = true)
 |-- season: double (nullable = true)
 |-- day_period: double (nullable = true)
 |-- month: double (nullable = true)
 |-- year: double (nullable = true)
 |-- item_most_time_spent: double (nullable = true)
 |-- most_time_spent_on_item: double (nullable = true)
 |-- most_frequently_bought_for_time_spent: double (nullable = true)
 |-- least_frequently_bought_for_time_spent: double (nullable = true)
 |-- mean_time: double (nullable = true)
 |-- std_time: double (nullable = true)
 |-- item_most_visited: double (nullable = true)
 |-- number_o_visit: double (nullable = true)
 |-- number_o_revisited_items: double (nullable = true)
 |-- most_frequently_bought_for_most_revisited: double (nullable = true)
 |-- first_item_visited: double (nullable = true)
 |-- last_item_visited: double (nullable = true)
 |-- normalized_features_vector: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double 

In [4]:
# Sets the values inside a numpy array

X = train_sessions_engineered.orderBy('session_id') 
X = X.drop('session_id')

X_np = np.array(X.collect())

22/05/27 11:06:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
X_np.shape

(1000000, 44)

## Value binning

The following values must be binned in order to be compatible with MRMR algorithms.

* Session duration
* Most time spent on a single item (float)
* Mean time
* Standard deviation time
* The 25 item value features cluster per session

In [6]:
# The data will be binned using percentile values, so that every feature is categorical

def assign_value_bins(x, n_bins=10):
    bin_array = np.zeros(x.shape[0], dtype=np.int32)
    prev_percentile = None
    
    for bin_idx in range(n_bins):
        percentile = (100.0 / n_bins) * (bin_idx + 1)
        
        percentile_value = np.percentile(x, percentile)
        
        if prev_percentile is None:
            mask = (x <= percentile_value)
            bin_array[mask] = bin_idx
        else:
            mask = np.bitwise_and(x <= percentile_value, x > prev_percentile)
            bin_array[mask] = bin_idx
                    
        prev_percentile = percentile_value
    
    return bin_array

columns_to_bin = [0, 6, 9, 10, 12, 13, *range(19, 44)]

for col in columns_to_bin:
    array_to_bin = X_np[:, col].astype(np.float32)
    binned_array = assign_value_bins(array_to_bin)
    X_np[:, col] = binned_array

X_np = X_np.astype(np.int32)  # Sets the type to int (should work well as everything is binned)

In [31]:
t_X = np.transpose(X_np)
n_total_features = t_X.shape[0]

# we want the nb partition to be between 2 or 3 times more than the number of core in our computer
Nb_partition = 10
X_RDD = sc.parallelize(t_X, Nb_partition)

## Item label clustering

As there is a high number of possible output items, the corellations migh be very low. We will use put them inside clusters in order to obtain better ranking values.

In [8]:
import pickle

# Loading the item to cluster dictionnary
with open('../Data/item_dict.pd', 'rb') as file:
    item_cluster_dict = pickle.load(file)
    
len(item_cluster_dict)

23691

In [9]:
train_purchases_clustered = (
    train_purchases.rdd
    .map(lambda x: (x.session_id, item_cluster_dict[x.item_id]))
)

Y_numpy = np.array(train_purchases_clustered.collect())[:, 1]
Y_numpy.shape

                                                                                

(1000000,)

# mRMR algorithm

The most relevant - minimum redundancy algorithm will select the features with the most corellation with the output values by computing the Pearson statistical test, but also with as less redundancy with previously selectionned features.

In many cases, this algorithm showed to prune unecessary features while keeping good model performances.

In [10]:
def get_score_mrmr(x, y):
    """
    return the correlation value between two variable in absolute value
    """
    return np.abs(scipy.stats.pearsonr(x, y)[0])


def get_mrmr_score_spark(x,selected_features):
    """
    x : the feature to evaluate to add into the model
    y : output value
    selected_features : the features already selected
    return the score for x with the rest of selected variables
    """
    
    y = broadcast_Y.value
    # Get correlation score between feature x and output y (relevance)
    score_x_y_s = get_score_mrmr(x, y)
    
    
    nb_selected_features = selected_features.shape[0]
    # If some features have already been selected
    if nb_selected_features > 0:
        
        # Get corrrelation scores between x and each feature already selected (redundancy)
        score_features_x_s = np.zeros(nb_selected_features, dtype=float)
        
        for j in range(nb_selected_features):
            
            score_x_s_j = get_score_mrmr(x, selected_features[j,:])
            
            # if score is nan considering that we want to calculate the mean
            # we transform it in 0 ?
            if np.isnan(score_x_s_j):
                score_x_s_j=0
            
                
            score_features_x_s[j] = score_x_s_j
                        
        # Final score is relevance to output feature - average redundancy with already selected features
        score_x_y_s = score_x_y_s - np.mean(score_features_x_s)
        
    return score_x_y_s


def mrmr_spark(n_total_features, K, sc, X_RDD):
    """
    n_total_features : total number of features
    K : number of feature to select
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    :return: the indice of selected features and time execution using mrmr
    """
    time_execution = []
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []
    

    for k in range(K):
        print("Step: "+str(k))
    
        start_time=time.time()
    
        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
    
        # mRMR scores are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `get_mrmr_score_spark` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:get_mrmr_score_spark(x[0],selected_features)).collect()
    
        # Once all mRMR scores are computed, the index of the feature with the highest score is selected
        scores = np.array(scores)
        
    
        index_max_score_features = np.argmax(scores)
    
        selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
        del(remaining_features_indices[index_max_score_features])
    
        print('Took time:', time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
    return selected_features_indices, time_execution



In [11]:
broadcast_Y = sc.broadcast(Y_numpy)
selected_features_indices, execution_time = mrmr_spark(n_total_features, 20, sc, X_RDD)

Step: 0


22/05/27 11:07:32 WARN TaskSetManager: Stage 9 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:33 WARN TaskSetManager: Stage 10 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:33 WARN TaskSetManager: Stage 11 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:33 WARN TaskSetManager: Stage 12 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:35 WARN TaskSetManager: Stage 13 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 2.1447408199310303
Step: 1


22/05/27 11:07:35 WARN TaskSetManager: Stage 14 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:35 WARN TaskSetManager: Stage 15 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:36 WARN TaskSetManager: Stage 16 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:37 WARN TaskSetManager: Stage 17 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 2.134049892425537
Step: 2


22/05/27 11:07:37 WARN TaskSetManager: Stage 18 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:37 WARN TaskSetManager: Stage 19 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:38 WARN TaskSetManager: Stage 20 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:39 WARN TaskSetManager: Stage 21 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 2.5261402130126953
Step: 3


22/05/27 11:07:39 WARN TaskSetManager: Stage 22 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:40 WARN TaskSetManager: Stage 23 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:40 WARN TaskSetManager: Stage 24 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Took time: 2.8091843128204346
Step: 4


22/05/27 11:07:42 WARN TaskSetManager: Stage 25 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:42 WARN TaskSetManager: Stage 26 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:43 WARN TaskSetManager: Stage 27 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:43 WARN TaskSetManager: Stage 28 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:46 WARN TaskSetManager: Stage 29 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 3.506394624710083
Step: 5


22/05/27 11:07:46 WARN TaskSetManager: Stage 30 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:46 WARN TaskSetManager: Stage 31 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:46 WARN TaskSetManager: Stage 32 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Took time: 3.5136046409606934
Step: 6


22/05/27 11:07:49 WARN TaskSetManager: Stage 33 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:49 WARN TaskSetManager: Stage 34 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:50 WARN TaskSetManager: Stage 35 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:50 WARN TaskSetManager: Stage 36 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:53 WARN TaskSetManager: Stage 37 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 3.891164779663086
Step: 7


22/05/27 11:07:53 WARN TaskSetManager: Stage 38 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:54 WARN TaskSetManager: Stage 39 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:54 WARN TaskSetManager: Stage 40 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:57 WARN TaskSetManager: Stage 41 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 4.081225872039795
Step: 8


22/05/27 11:07:57 WARN TaskSetManager: Stage 42 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:58 WARN TaskSetManager: Stage 43 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:07:58 WARN TaskSetManager: Stage 44 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Took time: 4.222369909286499
Step: 9


22/05/27 11:08:01 WARN TaskSetManager: Stage 45 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:02 WARN TaskSetManager: Stage 46 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:02 WARN TaskSetManager: Stage 47 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:02 WARN TaskSetManager: Stage 48 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:06 WARN TaskSetManager: Stage 49 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 4.567854642868042
Step: 10


22/05/27 11:08:06 WARN TaskSetManager: Stage 50 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:06 WARN TaskSetManager: Stage 51 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:07 WARN TaskSetManager: Stage 52 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:10 WARN TaskSetManager: Stage 53 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 4.696666717529297
Step: 11


22/05/27 11:08:11 WARN TaskSetManager: Stage 54 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:11 WARN TaskSetManager: Stage 55 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:12 WARN TaskSetManager: Stage 56 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:16 WARN TaskSetManager: Stage 57 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.251950263977051
Step: 12


22/05/27 11:08:16 WARN TaskSetManager: Stage 58 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:16 WARN TaskSetManager: Stage 59 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:17 WARN TaskSetManager: Stage 60 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:21 WARN TaskSetManager: Stage 61 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.462134122848511
Step: 13


22/05/27 11:08:21 WARN TaskSetManager: Stage 62 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:22 WARN TaskSetManager: Stage 63 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:22 WARN TaskSetManager: Stage 64 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:26 WARN TaskSetManager: Stage 65 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.255688905715942
Step: 14


22/05/27 11:08:27 WARN TaskSetManager: Stage 66 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:27 WARN TaskSetManager: Stage 67 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:28 WARN TaskSetManager: Stage 68 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:32 WARN TaskSetManager: Stage 69 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.87001895904541
Step: 15


22/05/27 11:08:33 WARN TaskSetManager: Stage 70 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:33 WARN TaskSetManager: Stage 71 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:34 WARN TaskSetManager: Stage 72 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:38 WARN TaskSetManager: Stage 73 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.998129367828369
Step: 16


22/05/27 11:08:39 WARN TaskSetManager: Stage 74 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:39 WARN TaskSetManager: Stage 75 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:39 WARN TaskSetManager: Stage 76 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:44 WARN TaskSetManager: Stage 77 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.596835374832153
Step: 17


22/05/27 11:08:44 WARN TaskSetManager: Stage 78 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:45 WARN TaskSetManager: Stage 79 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:45 WARN TaskSetManager: Stage 80 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:50 WARN TaskSetManager: Stage 81 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 5.772828817367554
Step: 18


22/05/27 11:08:50 WARN TaskSetManager: Stage 82 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:50 WARN TaskSetManager: Stage 83 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:51 WARN TaskSetManager: Stage 84 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:57 WARN TaskSetManager: Stage 85 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


Took time: 6.967563152313232
Step: 19


22/05/27 11:08:57 WARN TaskSetManager: Stage 86 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:58 WARN TaskSetManager: Stage 87 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:08:58 WARN TaskSetManager: Stage 88 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.

Took time: 6.410759210586548


                                                                                

In [12]:
selected_features_indices = sorted(selected_features_indices)
selected_features_indices

[1, 2, 7, 8, 16, 17, 18, 21, 23, 24, 25, 26, 32, 34, 36, 37, 38, 40, 41, 42]

# Forward feature selection algorithm

Using a machine learning model, we will iteratively add new features into the training and compare at each iteration which feature provided the best results using test set validation. The performance metric is computed by taking the position of the label in the 100 most probable outputs.

As Naïve Bayes does not require training and can directly output prediction probabilities, it is considered the best for this problem as many different results are possible. 

In [39]:
# Need to replaced -1 by 0 in the training dataset as bayes don't support negative values

def replace_negative_by_zero(array):
    negative_element_mask = (array == -1)
    array[negative_element_mask] = 0
    return array
    

X_RDD_no_negative = X_RDD.map(replace_negative_by_zero)

In [40]:
from sklearn import tree
from sklearn import linear_model
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split


def score_model(model, x_test, y_test):
    y_pred = model.predict_proba(x_test)
    
    # Takes the 100 most probable indices (sorted)
    most_probable = np.argsort(y_pred, axis=1)[-100:]
    
    # That's all the labels inside the model
    labels = model.labels
    
    top_100[:] = 
    top_100 = y_pred[:, most_probable]
    
    # Take the most 100 probable values for each instance
    

def fit_get_score(x, selected_features):
    """
    train an model to get the score with the addition of a specified x feature
    """    
    
    y = broadcast_Y.value
    n_selected_features = selected_features.shape[0]
    
    if n_selected_features>0:
        # need to merge x and the already selected features
        x = np.vstack((selected_features,x))
        x = x.transpose()
    else:
        x = x.reshape(-1, 1)
        
    
    #split data
    x_train,x_test,y_train,y_test = train_test_split(x,y, test_size= 0.3)
    
    # train on training data
    model = CategoricalNB()
    model = model.fit(x_train,y_train)
    
    # get score on testing set
    return model.score(x_test, y_test)


# Forward feature selection based 
def forward_feature_selection(n_total_features, K, sc, X_RDD):
    """
    n_total_features : number of total features
    K : number of feature to select
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    return the indice of selected features and time execution
    by using a decision tree as model to calculate the score
    """
    time_execution = []
    
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []
    
    for k in range(K):
        print("Step: "+str(k))
    
        start_time=time.time()

        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
        
    
        #  scores for a certain model are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `fit_get_score` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:fit_get_score(x[0],selected_features)).collect()
    
        # Once all scores are computed, the index of the feature with the highest value is chosen
        scores = np.array(scores)
        
        print("best_score :", np.max(scores))
    
        index_max_score_features = np.argmax(scores)
    
        selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
        del(remaining_features_indices[index_max_score_features])
    
        print(time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
    return selected_features_indices, time_execution



# Forward feature selection based on a accuracy increase. 
# Stops adding features when the accuracy stops increasing by a fixed amount.
def forward_feature_selection_diff(n_total_features, inc_accuracy, sc, X_RDD):
    """
    n_total_features : number of total features
    inc_accuracy : increase of accuracy needed to continue the algorithm
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    return the indice of selected features and time execution
    by using a decision tree as model to calculate the score
    """
    time_execution = []
    
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []
    
    last_best_score = 0
    diff_accuracy = 1
    k = 0
    while diff_accuracy > inc_accuracy:
        print("Step: "+str(k))
    
        start_time=time.time()

        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
        
    
        #  scores for a certain model are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `fit_get_score` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:fit_get_score(x[0],selected_features)).collect()
    
        # Once all scores are computed, the index of the feature with the highest value is chosen
        scores = np.array(scores)
        
        # compute the difference between last result and new result
        best_score = np.max(scores)
        diff_accuracy = best_score - last_best_score
        
        
        print("best_accuracy:", best_score)
        
        if best_score > last_best_score:
        
            index_max_score_features = np.argmax(scores)
    
            selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
            del(remaining_features_indices[index_max_score_features])
        
        last_best_score = best_score
        # print(time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
        k += 1
        
    return selected_features_indices, time_execution



selected_features_indices_forward, execution_time_forward = forward_feature_selection(n_total_features,
                                                                                      20,
                                                                                      sc,
                                                                                      X_RDD_no_negative)

Step: 0


22/05/27 11:28:42 WARN TaskSetManager: Stage 100 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:28:42 WARN TaskSetManager: Stage 101 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:28:42 WARN TaskSetManager: Stage 102 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:28:43 WARN TaskSetManager: Stage 103 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
Exception in thread "serve RDD 127" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
2

best_score : 0.47807333333333335
13.276906967163086
Step: 1


22/05/27 11:28:55 WARN TaskSetManager: Stage 105 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:28:56 WARN TaskSetManager: Stage 106 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:28:56 WARN TaskSetManager: Stage 107 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:15 WARN TaskSetManager: Stage 108 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.48411
19.84473943710327
Step: 2


22/05/27 11:29:15 WARN TaskSetManager: Stage 109 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:16 WARN TaskSetManager: Stage 110 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:16 WARN TaskSetManager: Stage 111 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:41 WARN TaskSetManager: Stage 112 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.48728
26.119744777679443
Step: 3


22/05/27 11:29:41 WARN TaskSetManager: Stage 113 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:42 WARN TaskSetManager: Stage 114 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:29:42 WARN TaskSetManager: Stage 115 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:13 WARN TaskSetManager: Stage 116 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.48944666666666664
31.955032348632812
Step: 4


22/05/27 11:30:13 WARN TaskSetManager: Stage 117 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:14 WARN TaskSetManager: Stage 118 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:14 WARN TaskSetManager: Stage 119 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:49 WARN TaskSetManager: Stage 120 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49080666666666667
36.063637018203735
Step: 5


22/05/27 11:30:49 WARN TaskSetManager: Stage 121 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:50 WARN TaskSetManager: Stage 122 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:30:50 WARN TaskSetManager: Stage 123 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:31:30 WARN TaskSetManager: Stage 124 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49342
40.73336052894592
Step: 6


22/05/27 11:31:30 WARN TaskSetManager: Stage 125 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:31:31 WARN TaskSetManager: Stage 126 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:31:31 WARN TaskSetManager: Stage 127 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:32:15 WARN TaskSetManager: Stage 128 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.4942666666666667
45.23266100883484
Step: 7


22/05/27 11:32:15 WARN TaskSetManager: Stage 129 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:32:16 WARN TaskSetManager: Stage 130 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:32:16 WARN TaskSetManager: Stage 131 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:03 WARN TaskSetManager: Stage 132 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49457666666666666
48.07389187812805
Step: 8


22/05/27 11:33:03 WARN TaskSetManager: Stage 133 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:04 WARN TaskSetManager: Stage 134 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:04 WARN TaskSetManager: Stage 135 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:55 WARN TaskSetManager: Stage 136 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49365
51.99153399467468
Step: 9


22/05/27 11:33:55 WARN TaskSetManager: Stage 137 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:56 WARN TaskSetManager: Stage 138 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:33:56 WARN TaskSetManager: Stage 139 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:34:50 WARN TaskSetManager: Stage 140 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49368666666666666
54.713850259780884
Step: 10


22/05/27 11:34:50 WARN TaskSetManager: Stage 141 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:34:51 WARN TaskSetManager: Stage 142 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:34:51 WARN TaskSetManager: Stage 143 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:35:48 WARN TaskSetManager: Stage 144 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49310333333333334
58.303258180618286
Step: 11


22/05/27 11:35:48 WARN TaskSetManager: Stage 145 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:35:49 WARN TaskSetManager: Stage 146 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:35:49 WARN TaskSetManager: Stage 147 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:36:50 WARN TaskSetManager: Stage 148 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.4928533333333333
62.2854278087616
Step: 12


22/05/27 11:36:51 WARN TaskSetManager: Stage 149 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:36:51 WARN TaskSetManager: Stage 150 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:36:52 WARN TaskSetManager: Stage 151 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:37:54 WARN TaskSetManager: Stage 152 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49344333333333334
63.08950757980347
Step: 13


22/05/27 11:37:54 WARN TaskSetManager: Stage 153 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:37:54 WARN TaskSetManager: Stage 154 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:37:55 WARN TaskSetManager: Stage 155 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:38:59 WARN TaskSetManager: Stage 156 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49378666666666665
65.68100547790527
Step: 14


22/05/27 11:39:00 WARN TaskSetManager: Stage 157 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:39:00 WARN TaskSetManager: Stage 158 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:39:00 WARN TaskSetManager: Stage 159 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:40:07 WARN TaskSetManager: Stage 160 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49251
67.3301157951355
Step: 15


22/05/27 11:40:07 WARN TaskSetManager: Stage 161 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:40:07 WARN TaskSetManager: Stage 162 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:40:08 WARN TaskSetManager: Stage 163 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:41:16 WARN TaskSetManager: Stage 164 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49239333333333335
69.6287112236023
Step: 16


22/05/27 11:41:16 WARN TaskSetManager: Stage 165 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:41:17 WARN TaskSetManager: Stage 166 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:41:17 WARN TaskSetManager: Stage 167 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:42:25 WARN TaskSetManager: Stage 168 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.4923133333333333
68.77218794822693
Step: 17


22/05/27 11:42:25 WARN TaskSetManager: Stage 169 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:42:26 WARN TaskSetManager: Stage 170 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:42:26 WARN TaskSetManager: Stage 171 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:43:38 WARN TaskSetManager: Stage 172 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.49066333333333334
73.03322410583496
Step: 18


22/05/27 11:43:38 WARN TaskSetManager: Stage 173 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:43:39 WARN TaskSetManager: Stage 174 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:43:39 WARN TaskSetManager: Stage 175 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:44:53 WARN TaskSetManager: Stage 176 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.


best_score : 0.48976333333333333
75.3446478843689
Step: 19


22/05/27 11:44:54 WARN TaskSetManager: Stage 177 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:44:54 WARN TaskSetManager: Stage 178 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.
22/05/27 11:44:55 WARN TaskSetManager: Stage 179 contains a task of very large size (15629 KiB). The maximum recommended task size is 1000 KiB.

best_score : 0.48872666666666664
77.79691362380981


                                                                                

In [42]:
sorted(selected_features_indices_forward)

[1, 2, 4, 10, 13, 17, 21, 22, 23, 26, 27, 28, 29, 30, 31, 33, 35, 36, 37, 42]