# 4.2 Feature Selection

##### Description

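Select the top features using a correlation matrix: features are ranked by their correlation with the label, and any feature that is highly collinear with an already-selected feature is skipped.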

##### Notebook Steps

## 1. Connect Spark

In [1]:
%load_ext sparkmagic.magics

In [2]:
%manage_spark

MagicsControllerWidget(children=(Tab(children=(ManageSessionWidget(children=(HTML(value='<br/>'), HTML(value='…

Added endpoint http://ec2-54-91-225-25.compute-1.amazonaws.com:8998/
Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1612113777859_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.
Cleaned up endpoint http://ec2-54-91-225-25.compute-1.amazonaws.com:8998/


## 2. Load Dataset

In [3]:
%%spark
# Read the feature CSV from S3; the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv("s3://jolfr-capstone3/clean/mem-features.csv")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## 3. Calculate Correlation Matrix

In [4]:
%%spark

from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType, DoubleType
import pyspark.sql.functions as f
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import pandas as pd

# Start from the loaded frame. withColumn returns a new DataFrame each time,
# so `df` itself is never modified by the casts below.
dfl = df

# Boolean columns must become numeric before they can be assembled into a vector.
bool_cols = [field.name for field in df.schema.fields if isinstance(field.dataType, BooleanType)]

# Chain every cast onto `dfl` (not `df`): rebuilding from `df` on each iteration
# would discard the previous casts and leave only the last boolean column converted.
for column in bool_cols:
    dfl = dfl.withColumn(column, col(column).cast(DoubleType()))

# The label is cast as well so that it is part of the correlation matrix.
dfl = dfl.withColumn("label", col("label").cast(DoubleType()))

# Only DoubleType columns take part in the correlation.
cols = [field.name for field in dfl.schema.fields if isinstance(field.dataType, DoubleType)]

# convert to vector column first
# NOTE: handleInvalid="skip" drops rows containing invalid (null/NaN) values
# before the correlation is computed.
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=cols, outputCol=vector_col, handleInvalid="skip")
df_vector = assembler.transform(dfl).select(vector_col)

# get correlation matrix
matrix = Correlation.corr(df_vector, vector_col)

# get column and row mappings (position in the vector -> column name).
# Built without a `for col in ...` loop so the imported `col` function is not
# rebound to a string for the rest of the session.
mapper = dict(enumerate(cols))

# reshape and add metadata
corr = matrix.collect()[0]["pearson({})".format(vector_col)].values.reshape(len(cols), len(cols))
corr = pd.DataFrame(corr).rename(mapper, axis=0).rename(mapper, axis=1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## 4. Aggregate Top Features
The top 20 features will be selected based on the highest correlation with the label, skipping any feature whose correlation with an already-selected feature exceeds 0.90.

In [5]:
%%spark
# Order the rows so that those most positively correlated with the label come first.
corr = corr.sort_values("label", ascending=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
%%spark

features = []  # stores column names

features_count = 20  # number of features to gather
threshold = 0.90  # threshold to ignore collinear columns

# Walk the rows of `corr` in their current order (expected to be sorted by
# correlation with the label) and greedily keep features that are not
# redundant with one that was already kept.
for candidate in corr.index:
    # exits loop once enough features are gathered
    if len(features) >= features_count:
        break
    # the label is the target, not a feature
    if candidate == 'label':
        continue
    # A candidate is redundant when it is strongly correlated -- positively or
    # negatively -- with a kept feature, hence the absolute value.
    collinear = any(abs(corr.loc[kept, candidate]) > threshold for kept in features)
    if not collinear:
        features.append(candidate)
features

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['MEAN(transactions-price_difference WHERE is_auto_renew = 0)', 'SUM(transactions-planned_daily_price WHERE is_auto_renew = 0)', 'PERCENT_TRUE(transactions-WEEKEND(transaction_date) WHERE is_auto_renew = 0)', 'PERCENT_TRUE(transactions-WEEKEND(membership_expire_date) WHERE is_auto_renew = 0)', 'NUM_UNIQUE(transactions-DAY(membership_expire_date))', 'MIN(transactions-price_difference)', 'TOTAL_PREVIOUS_MONTH(transactions-price_difference, transaction_date)', 'MODE(transactions-MONTH(membership_expire_date))', 'STD(transactions-planned_daily_price)', 'MEAN(transactions-price_difference WHERE is_cancel = 0)', 'MODE(transactions-MONTH(transaction_date))', 'LAST(transactions-price_difference)', 'AVG_TIME_BETWEEN(transactions-transaction_date)', 'NUM_UNIQUE(transactions-payment_method_id)', 'STD(transactions-daily_price)', 'MAX(transactions-planned_daily_price)', 'LAST(transactions-planned_daily_price)', 'PERCENT_TRUE(logs-WEEKEND(date))', 'MEAN(transactions-planned_daily_price WHERE is_auto

## 5. Transform Dataset using Aggregation

In [7]:
%%spark
# Keep the msno, time and label columns alongside the selected features.
leading_cols = ["msno", "time", "label"]
cols = leading_cols + features

# Project the dataset down to just those columns.
df = df.select(*cols)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## 6. Output to File

In [8]:
%%spark
# Write the reduced dataset as CSV with a header row, replacing any previous output.
(
    df.write
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .mode('overwrite')
    .save('s3://jolfr-capstone3/clean/features.csv')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…