In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# launch this cell if you have issues on windows with py4j (think about updating your PATH)

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# starts a spark session from notebook

os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=4g  pyspark-shell"


spark = SparkSession \
    .builder \
    .master("local[*]") \
    .config("spark.dynamicAllocation.enabled", "false") \
    .config("spark.executor.instances", "1") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "8G") \
    .appName("load_explore") \
    .getOrCreate()

sc=spark.sparkContext

22/05/20 15:54:43 WARN Utils: Your hostname, pierre-hp resolves to a loopback address: 127.0.1.1; using 192.168.0.194 instead (on interface eno1)
22/05/20 15:54:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 15:54:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Feature Selection

After the feature engineering, we could have a lot of features which are not really worth to give to the predicting model. Considering this problem, we want to use some feature selection algorithms to take the feature which are the most interesting to the model.

We were asked to implement two scalable feature selection algorithms, a ranking algorithm and a forward feature selection.

We could be interested in Minimum-redundancy-maximum-relevance (mRMR) feature selection.


In [2]:

def generate_dataset(n_samples=100, n_informative=1, n_noisy=2, n_redundant=1, random_seed=0):
    """
    generate a dataset to test
    """
    # Set random seed
    np.random.seed(random_seed)
    
    # Use sklearn.datasets.make_regression to generate an artificial dataset where the output y 
    # is correlated with a subset of of the input features
    X, Y = sklearn.datasets.make_classification(n_samples=n_samples, 
                                            n_features=n_informative+n_noisy, 
                                            n_informative=n_informative)
    
    # Create a random mixing matrix for generating redundant features from informative ones
    mixing_matrix = np.random.random((n_informative, n_redundant))
    
    # Create redundant features by taking random linear combinations of informative features
    redundant_features = np.dot(X[:,0:n_informative], mixing_matrix)
    
    # Add redundant features to the input data
    X = np.concatenate((X, redundant_features), axis=1)
    
    # Return input data X, output data Y
    return X, Y


# Let us generate the dataset
N = 1000
n_informative = 100
n_noisy = 100
n_redundant = 100

X, Y = generate_dataset(n_samples=N, 
                        n_informative=n_informative, 
                        n_noisy=n_noisy, 
                        n_redundant=n_redundant)

t_X = np.transpose(X)

B=3
X_RDD=sc.parallelize(t_X,B).cache()

# Counting the number of rows will allow to implicitly cache the data
X_RDD.count()
X_RDD.take(10)

                                                                                

[array([ 1.21782194e+00,  2.27308898e-01,  4.78922861e-01, -5.98619710e-01,
        -2.03298501e-01, -9.84658459e-01, -4.84817384e-01, -2.08561189e-01,
        -6.07027982e-01, -4.63621210e-01, -2.41059611e+00,  9.51778930e-02,
         1.24609709e+00,  1.28501379e+00,  1.28421083e+00, -1.37839955e+00,
         1.63752739e+00,  9.34260552e-01,  4.60666555e-01, -7.92357073e-01,
        -1.05829140e+00,  1.52056772e+00, -6.38608571e-01, -7.18202569e-01,
        -6.21151701e-02, -8.44960314e-02,  1.32914734e+00,  2.57969245e-01,
        -1.71458546e+00,  1.03819732e+00,  1.36846626e+00,  1.19721125e+00,
         1.03852108e+00,  1.12728385e+00,  3.88930762e-01,  7.65791373e-01,
         1.07506702e-01,  1.74941304e-01, -2.36691339e-01,  2.70355204e-02,
        -6.96437250e-01, -1.95220574e+00,  5.65569767e-01,  1.17440847e+00,
         1.11334437e+00,  2.15557170e+00, -7.51360689e-01, -9.98068830e-01,
         1.71916805e-01, -2.66052160e-03,  9.80126999e-01, -2.07541819e+00,
        -6.1

In [3]:
def get_score_mrmr(x, y):
    """
    return the correlation value between two variable in absolute value
    """
    
    return np.abs(scipy.stats.pearsonr(x, y)[0])

In [4]:
def get_mrmr_score_spark(x,y,selected_features):
    """
    x : the feature to evaluate to add into the model
    y : output value
    selected_features : the features already selected
    return the score for x with the rest of selected variables
    """
    
    # Get correlation score between feature x and output y (relevance)
    score_x_y_s = get_score_mrmr(x, y)
    
    nb_selected_features = selected_features.shape[0]
    # If some features have already been selected
    if nb_selected_features > 0:
        
        # Get corrrelation scores between x and each feature already selected (redundancy)
        score_features_x_s = np.zeros(nb_selected_features, dtype=float)
        
        for j in range(nb_selected_features):
                
            score_x_s_j = get_score_mrmr(x, selected_features[j,:])
                
            score_features_x_s[j] = score_x_s_j
                
        # Final score is relevance to output feature - average redundancy with already selected features
        score_x_y_s = score_x_y_s - np.mean(score_features_x_s)
        
    return score_x_y_s

In [5]:
def mrmr_spark(n_total_features, K, sc, X_RDD, Y):
    """
    n_total_features : number of total features
    K : number of feature to select
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    return the indice of selected features and time execution using mrmr
    """
    time_execution = []
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []

    for k in range(K):
        print("Step: "+str(k))
    
        start_time=time.time()

        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
        
    
        # mRMR scores are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `get_mrmr_score_spark` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:get_mrmr_score_spark(x[0],Y,selected_features)).collect()
    
        # Once all mRMR scores are computed, the index of the feature with the highest score is selected
        scores = np.array(scores)
    
        index_max_score_features = np.argmax(scores)
    
        selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
        del(remaining_features_indices[index_max_score_features])
    
        print(time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
    return selected_features_indices, time_execution

In [6]:
n_total_features = t_X.shape[0]
selected_features_indices, execution_time = mrmr_spark(n_total_features, 5, sc,X_RDD, Y)

Step: 0
1.0627903938293457
Step: 1
0.8347511291503906
Step: 2
0.763155460357666
Step: 3
0.8669402599334717
Step: 4
0.7057387828826904


In [7]:
selected_features_indices

[90, 50, 140, 5, 38]

Then, we will look a the forward feature Selection by training on a decision tree with every feature and add the features which get the best results.

In [8]:
from sklearn import tree
from sklearn.model_selection import train_test_split

def fit_get_score(x,y,selected_features):
    """
    train an model to get the score with the addition of a specified x feature
    """    
    
    n_selected_features = selected_features.shape[0]
    
    if n_selected_features>0:
        # need to merge x and the already selected features
        x = np.vstack((selected_features,x))
        x = x.transpose()
    else:
        x = x.reshape(-1, 1)
    
    #split data
    x_train,x_test,y_train,y_test = train_test_split(x,y, test_size= 0.3)
    
    # train on training data
    dt = tree.DecisionTreeClassifier()
    dt = dt.fit(x_train,y_train)
    
    # get score on testing set
    return dt.score(x_test, y_test)

In [9]:
def forward_feature_selection(n_total_features, K, sc, X_RDD, Y):
    """
    n_total_features : number of total features
    K : number of feature to select
    sc : spark context
    X_RDD : RDD of the variable X
    Y: Output data
    
    return the indice of selected features and time execution
    by using a decision tree as model to calculate the score
    """
    time_execution = []
    
    remaining_features_indices = list(range(n_total_features))
    selected_features_indices = []
    
    for k in range(K):
        print("Step: "+str(k))
    
        start_time=time.time()

        # Get the subset of selected features values, and cast as an array
        selected_features = X_RDD.zipWithIndex().filter(lambda x: x[1] in selected_features_indices).map(lambda x: x[0]).collect()
        selected_features = np.array(selected_features)
        
    
        #  scores for a certain model are computed by first filtering `t_X` to remove already selected features, and then mapping 
        # each remaining feature using the `fit_get_score` function
        scores = X_RDD.zipWithIndex().filter(lambda x: x[1] in remaining_features_indices).map(lambda x:fit_get_score(x[0],Y,selected_features)).collect()
    
        # Once all scores are computed, the index of the feature with the highest value is chosen
        scores = np.array(scores)
    
        index_max_score_features = np.argmax(scores)
    
        selected_features_indices.append(remaining_features_indices[index_max_score_features])
    
        del(remaining_features_indices[index_max_score_features])
    
        print(time.time()-start_time)
        time_execution.append(time.time()-start_time)
        
    return selected_features_indices, time_execution
    

In [10]:
n_total_features = t_X.shape[0]
selected_features_indices_forward, execution_time_forward = forward_feature_selection(n_total_features, 5, sc,X_RDD, Y)

Step: 0
0.9933044910430908
Step: 1
0.9432191848754883
Step: 2
1.001812219619751
Step: 3
0.9598202705383301
Step: 4
1.1405706405639648


[Stage 41:>                                                         (0 + 3) / 3]                                                                                

In [11]:
selected_features_indices_forward

[247, 56, 269, 229, 31]

In [12]:
spark.stop()