In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("feature_selection") \
    .getOrCreate()

sc=spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/26 15:42:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/26 15:42:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Feature selection

As we have many features (especially after the one-hot encoding of many categorical features), feature selection should be performed in order to reduce the computational load in further steps.

Two feature selection algorithms will be used

1) A Feature Ranking algorithm using MRMR (Maximum relevance - Minimum redundency)
2) A foward-selection algorithm using cross-validation. With a machine learning model, features will iteratively added as they increase the performance of the model.

In [2]:
train_sessions_engineered = spark.read.csv('../Data/session_engineered_features.csv',header=True,
                                          inferSchema=True)

train_purchases = spark.read.load('../Data/train_purchases.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true')

                                                                                

In [5]:
train_sessions_engineered.printSchema()

root
 |-- session_id: double (nullable = true)
 |-- session_time: double (nullable = true)
 |-- season: double (nullable = true)
 |-- day_period: double (nullable = true)
 |-- month: double (nullable = true)
 |-- year: double (nullable = true)
 |-- item_most_time_spent: double (nullable = true)
 |-- most_time_spent_on_item: double (nullable = true)
 |-- most_frequently_bought_for_time_spent: double (nullable = true)
 |-- least_frequently_bought_for_time_spent: double (nullable = true)
 |-- mean_time: double (nullable = true)
 |-- std_time: double (nullable = true)
 |-- item_most_visited: double (nullable = true)
 |-- number_o_visit: double (nullable = true)
 |-- number_o_revisited_items: double (nullable = true)
 |-- most_frequently_bought_for_most_revisited: double (nullable = true)
 |-- first_item_visited: double (nullable = true)
 |-- last_item_visited: double (nullable = true)
 |-- normalized_features_vector: double (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double 

In [7]:
# Sets the values inside a numpy array

X = train_sessions_engineered.orderBy('session_id') 
X = X.drop('session_id')

X_np = np.array(X.collect())

                                                                                

In [9]:
X_np.shape

(1000000, 44)

## Value binning

The following values must be binned in order to be compatible with MRMR algorithms.

* Session duration
* Most time spent on a single item (float)
* Mean time
* Standard deviation time
* The 25 item value features cluster per session

In [11]:
# The data will be binned using percentile values

def assign_value_bins(x, n_bins=10):
    bin_array = np.zeros(x.shape[0], dtype=np.int32)
    prev_percentile = None
    
    for bin_idx in range(n_bins):
        percentile = (100.0 / n_bins) * (bin_idx + 1)
        
        percentile_value = np.percentile(x, percentile)
        
        if prev_percentile is None:
            mask = (x <= percentile_value)
            bin_array[mask] = bin_idx
        else:
            mask = np.bitwise_and(x <= percentile_value, x > prev_percentile)
            bin_array[mask] = bin_idx
                    
        prev_percentile = percentile_value
    
    return bin_array

columns_to_bin = [0, 6, 9, 10, 12, 13, *range(19, 44)]

for col in columns_to_bin:
    array_to_bin = X_np[:, col].astype(np.float32)
    binned_array = assign_value_bins(array_to_bin)
    X_np[:, col] = binned_array

X_np = X_np.astype(np.int32)  # Sets the type to int (should work well as everything is binned)

In [13]:
t_X = np.transpose(X_np)
n_total_features = t_X.shape[0]

# we want the nb partition to be between 2 or 3 times more than the number of core in our computer
Nb_partition = 10
X_RDD = sc.parallelize(t_X, Nb_partition)

## Item label clustering

As there is a high number of possible output items, the corellations migh be very low. We will use put them inside clusters in order to obtain better ranking values.

In [14]:
import pickle

# Loading the item to cluster dictionnary
with open('../Data/item_dict.pd', 'rb') as file:
    item_cluster_dict = pickle.load(file)
    
len(item_cluster_dict)

23691

In [15]:
train_purchases_clustered = (
    train_purchases.rdd
    .map(lambda x: (x.session_id, item_cluster_dict[x.item_id]))
)

Y_numpy = np.array(train_purchases_clustered.collect())[:, 1]
Y_numpy.shape

                                                                                

(1000000,)

# mRMR algorithm

The most relevant - minimum redundancy algorithm will select the features with the most corellation with the output values by computing the Pearson statistical test, but also with as less redundancy with previously selectionned features.

In many cases, this algorithm showed to prune unecessary features while keeping good model performances.

In [20]:
def get_score_mrmr(x, y):
    """
    return the correlation value between two variable in absolute value
    """
    return np.abs(scipy.stats.pearsonr(x, y)[0])


def get_mrmr_score_spark(x,selected_features):
    """
    x : the feature to evaluate to add into the model
    y : output value
    selected_features : the features already selected
    return the score for x with the rest of selected variables
    """
    
    y = broadcast_Y.value
    # Get correlation score between feature x and output y (relevance)
    score_x_y_s = get_score_mrmr(x, y)
    
    
    nb_selected_features = selected_features.shape[0]
    # If some features have already been selected
    if nb_selected_features > 0:
        
        # Get corrrelation scores between x and each feature already selected (redundancy)
        score_features_x_s = np.zeros(nb_selected_features, dtype=float)
        
        for j in range(nb_selected_features):
            
            score_x_s_j = get_score_mrmr(x, selected_features[j,:])
            
            # if score is nan considering that we want to calculate the mean
            # we transform it in 0 ?
            if np.isnan(score_x_s_j):
                score_x_s_j=0
            
                
            score_features_x_s[j] = score_x_s_j
                        
        # Final score is relevance to output feature - average redundancy with already selected features
        score_x_y_s = score_x_y_s - np.mean(score_features_x_s)
        
    return score_x_y_s


def mrmr_spark(n_total_features, K, sc, X_RDD):
    """
    n_total_features : total number of features
    K : number of feature to select
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    :return: the indice of selected features and time execution using mrmr
    """
    time_execution = []
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []
    

    for k in range(K):
        print("Step: "+str(k))
    
        start_time=time.time()
    
        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
    
        # mRMR scores are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `get_mrmr_score_spark` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:get_mrmr_score_spark(x[0],selected_features)).collect()
    
        # Once all mRMR scores are computed, the index of the feature with the highest score is selected
        scores = np.array(scores)
        
    
        index_max_score_features = np.argmax(scores)
    
        selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
        del(remaining_features_indices[index_max_score_features])
    
        print('Took time:', time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
    return selected_features_indices, time_execution



In [None]:
broadcast_Y = sc.broadcast(Y_numpy)
selected_features_indices, execution_time = mrmr_spark(n_total_features, 20, sc, X_RDD)

In [23]:
selected_features_indices = sorted(selected_features_indices)
selected_features_indices

[1, 2, 5, 11, 15, 16, 17, 19, 23, 26, 27, 28, 29, 33, 34, 36, 37, 39, 40, 42]

# Forward feature selection algorithm

Using a machine learning model, we will iteratively add new features into the training and compare at each iteration which feature provided the best results using cross-validation.

As Naïve Bayes does not require training and can directly output prediction probabilities, it is considered the best for this problem as many different results are possible. 