# Script for selecting the best features to train the model.

Features that are not relevant to training the model must be removed in 
order to improve performance. First, the correlation matrix of the 
independent variables is computed, and variables that are highly correlated 
with one another (above 95%) are eliminated. Then the most significant of the 
remaining features are selected using recursive feature elimination with 
cross-validation (RFECV).

In [3]:
# Import libraries 
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.model_selection import  GroupKFold 
import lightgbm as ltb
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import time 
import os
import sys

# Record the start time; the elapsed run time is printed at the end of the script.
t1 = time.time()

In [4]:
# Define INPUT and OUTPUT files
# INPUT: pipe-separated training sample that is loaded into `data`.
INPUT = '../../F3.Data Preparation/02_Data/sampled_train.csv'
# INPUT_DATA: not referenced anywhere in the visible part of this script.
INPUT_DATA = '../../F2.Data Understanding/02_Data/01_TT_Sales_and_Invest_10d_by_brand.csv'
# OUTPUT_FEATS: .npy file that receives the names of the selected features.
OUTPUT_FEATS = '../02_Data/features.npy'
# OUTPUT: pipe-separated CSV file that receives the prepared training data.
OUTPUT = '../02_Data/prepared_train.csv'

In [5]:
# Load the pipe-separated input file, then renumber the rows from 0
data = pd.read_csv(INPUT, sep='|')
data = data.reset_index(drop=True)

FileNotFoundError: [Errno 2] File b'../../F3.Data Preparation/02_Data/sampled_train.csv' does not exist: b'../../F3.Data Preparation/02_Data/sampled_train.csv'

In [None]:
# Columns excluded from the feature matrix; the last one, QUOTA_SELLOUT,
# is the target variable
to_delete = [
    'CUSTOMER_ID',
    'BRANDFAMILY_ID',
    'R',
    'CAL_DATE',
    'CAL_DATE_end',
    'SO_ITG_WSE',
    'SO_MRKT_WSE',
    'QUOTA_SELLOUT',
]

# y holds the target; X is a copy of the data without the excluded columns
y = data['QUOTA_SELLOUT']
X = data.drop(columns=to_delete)

In [None]:
# Pairwise (Pearson) correlation matrix of the independent variables,
# displayed as a heatmap
correlations = X.corr(method='pearson')
sns.heatmap(data=correlations)

In [None]:
# Eliminate redundant independent variables: whenever two features have an
# absolute correlation above 95%, keep the first one and drop the other.
#
# The previous loop iterated over the rows of the *original* matrix, so
# dropping rows/columns from the rebound `correlations` had no effect on the
# rows being inspected and both members of a correlated pair were removed.
# It also looked only at the second-largest value of each row, so strong
# negative correlations were effectively never detected.
threshold = 0.95

# Absolute values, so strong negative correlations count as well.
abs_correlations = correlations.abs()

kept_fields = []
ignored_fields = []
for field in abs_correlations.columns:
    # Compare only against features that are still kept, so that one member
    # of every highly correlated pair survives. NaN correlations compare as
    # False and therefore never cause a feature to be dropped.
    if (abs_correlations.loc[kept_fields, field] > threshold).any():
        ignored_fields.append(field)
    else:
        kept_fields.append(field)

# Keep the correlation matrix in sync with the reduced feature set.
correlations = correlations.drop(index=ignored_fields, columns=ignored_fields)
X = X.drop(columns=ignored_fields)

In [None]:
# Keep the feature names (they are needed later to name the selected
# features), then standardise every feature with StandardScaler
columns = X.columns.tolist()
sc_X = StandardScaler()
sc_X.fit(X)
X = sc_X.transform(X)

In [None]:
# Cross-validation is grouped by customer: the CUSTOMER_ID column supplies
# the group labels for a 5-fold GroupKFold splitter
group_KFold = GroupKFold(n_splits=5)
groups = data['CUSTOMER_ID']

In [None]:
# Recursive feature elimination with cross-validation (RFECV) around a
# LightGBM regressor; the fitted selector provides the feature ranking
estimator = ltb.LGBMRegressor()

# Folds are produced by the customer-grouped splitter
cv_splits = group_KFold.split(X, y, groups)

rfe = RFECV(
    estimator=estimator,
    step=5,
    min_features_to_select=30,
    cv=cv_splits,
    n_jobs=4,
    verbose=1,
)
rfe.fit(X, y)

In [None]:
# Get best features and save to file
features = rfe.get_support(indices=True)
best_features = [columns[i] for i in features]

# NOTE(review): `date_cols` was never defined in this script, so this cell
# raised a NameError. The two date columns excluded from X earlier are
# assumed to be the intended ones -- confirm.
date_cols = ['CAL_DATE', 'CAL_DATE_end']

# Re-attach the identifier, QUOTA_SELLOUT_5 and date columns, skipping any
# that were already selected so the output never contains a duplicated column.
extra_cols = ['CUSTOMER_ID', 'BRANDFAMILY_ID', 'QUOTA_SELLOUT_5'] + date_cols
output_cols = best_features + [c for c in extra_cols if c not in best_features]
X = data[output_cols]

np.save(OUTPUT_FEATS, best_features)

In [None]:
# Write the prepared data to the output file, pipe-separated and without
# the index column
X.to_csv(
    path_or_buf=OUTPUT,
    sep='|',
    index=False,
    mode='w',
)

In [None]:
# Report the total wall-clock time taken by the script
t2 = time.time()
elapsed = t2 - t1
print("Time to execute script: " + str(elapsed))