### Feature Selection in Python 

This notebook demonstrates the usage of the most common Feature selection methods viz:
a. Recursive Feature selection with the option to select the number of features
b. Recursive Feature selection using Cross-Validation which automaticall tunes the number of features to select
c. Feature Ranking using algorithms such as SVM, Random Forest, Lasso and Ridge

We will also compare and contrasts the features selected through eac of the methods. Specifically, we want to determine the consensus among diffrent algorithms to select the signal features.


In [1]:
# Import the basic libraries
import pandas as pd
import os
import numpy as np

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE,RFECV
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV,Lasso,LassoCV,RandomizedLasso,RidgeClassifierCV
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC,LinearSVC

In [2]:
os.chdir("c:\\analytics\\data")

In [3]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [8]:
# clean the data
labels = df.Churn.map(lambda x: 1 if x=='Yes' else 0)
labels = labels.values

nominal_cols =[]
numeric_cols =[]
drop_cols = ['customerID','Churn','TotalCharges']


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')
df.TotalCharges.isnull().sum()
df['TotalCharges'] = df['TotalCharges'].fillna(value=df.TotalCharges.median)
df.TotalCharges.isnull().sum()
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'],errors='coerce')

In [9]:
# transform the variables as requ`ired by sklearn

def transform_frame_sklearn(features,drop_cols=[]):
    """ Transforms the dataframe to columns """
    numeric_cols = []
    nominal_cols = []
    
    if len(drop_cols) is not 0:
        features = features.drop(drop_cols,axis=1)
    
    for col in features.columns:
        if features[col].dtype == 'O': # and features[col].nunique() <=10:
            nominal_cols.append(col)
            
        else:
            numeric_cols.append(col)

    features_t = pd.get_dummies(data=features,columns=nominal_cols,drop_first=True)
       
    #print(features_t.columns)
    
    return(features_t) #,nominal_cols,numeric_cols)         
         

In [23]:
# transform the data frame in to the form as required by skelarn libraries

feature_trans = transform_frame_sklearn(df,drop_cols)
feature_names = feature_trans.columns
features = np.array(feature_trans)
labels = labels

In [11]:
df.Churn.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [12]:
# split the data into train ans test data

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,recall_score,precision_score,accuracy_score

In [24]:
features_train,features_test,labels_train,labels_test = train_test_split(features,labels,random_state=42)


### Recursive Feature Elimination first

We will perform the recursive feature elination using multiple algorithms. We are interested in the following:

* What are the Top Features that each algorithm selects ?

* What is the validation score for each subset of features selected by the algorithm since we wish to select those features ?



In [17]:
# determines some hyper parameters
kfold=10
score_metric = 'recall'

In [49]:
# instantiate the classfiers
lr = LogisticRegressionCV(penalty='l1',n_jobs=-1)
lasso = LassoCV(n_jobs=-1,cv=kfold,random_state=42)
ridge = RidgeClassifierCV(cv=kfold)
rf = RandomForestClassifier(criterion='gini',n_jobs=-1,n_estimators=100,max_features='sqrt',random_state=42)
svc = SVC(C=0.1,kernel='linear')

#estimators = ['lasso','ridge','random_forest','svc']
rfe_estimators = {'lasso':lasso,'ridge':ridge,'svc':svc,'lr':'','rf':rf}   

# nunmber of features to select
#num_features = round(np.sqrt(len(feature_names)))
num_features = round(len(feature_names)/2)

In [50]:
features_rankscore = pd.DataFrame(feature_names,columns=['feature_name'])

print(" Total Features: {}".format(len(feature_names)))

for clf,value in rfe_estimators.items():
    print("\n{}".format(clf))
    if value != '':
        rfe = RFE(estimator=value,step=2,n_features_to_select=num_features)
        rfe = rfe.fit(features_train, labels_train)
        print(" Estimator {} selected features = {}".format(str.upper(clf),rfe.n_features_))
        print(" Features selected are: {}".format(feature_names[rfe.support_]))
        features_rankscore[clf]=rfe.ranking_
        
        #Select the features and predict on the validation data
        #val_transform = rfe.transform(features_test)
        #predproba_validation = rfe.predict_proba(features_test)
        pred_labels = rfe.predict(features_test)
        val_roc_score = roc_auc_score(labels_test,pred_labels)
        #val_recall_score = recall_score(labels_test,pred_labels) 
        # Print the results
        print("Using the model selected {} features, the validation scores are: \n".format(rfe.n_features_))
        print(" ROC: {}, Recall:  ".format(round(val_roc_score,3)))
    else:
        continue
              
print(features_rankscore)

 Total Features: 29

lasso
 Estimator LASSO selected features = 14
 Features selected are: Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'Dependents_Yes',
       'PhoneService_Yes', 'InternetService_Fiber optic', 'OnlineSecurity_Yes',
       'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Electronic check'],
      dtype='object')
Using the model selected 14 features, the validation scores are: 

 ROC: 0.857, Recall:  

ridge
 Estimator RIDGE selected features = 14
 Features selected are: Index(['PhoneService_Yes', 'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_Yes', 'OnlineBackup_Yes',
       'DeviceProtection_No internet service',
       'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No internet service',
       'StreamingMovies_No internet service', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling

In [51]:
# perform the ranking based on the algorithm

              

 Total Features: 29


In [None]:
features_rank

In [56]:
from sklearn.linear_model import RidgeClassifier

lasso = LassoCV(n_jobs=-1,cv=kfold,random_state=42)#,alphas=[0.01,0.001,0.1,1.0,10,100])
ridge = RidgeClassifierCV(alphas=[0.01,0.001,0.1, 1.0, 10.0], cv=3, fit_intercept=True)
rf = RandomForestClassifier(criterion='gini',n_jobs=-1,n_estimators=100,max_features='sqrt',random_state=42)
svc = SVC(C=0.1,kernel='linear')

#estimators = ['lasso','ridge','random_forest','svc']
estimators = {'lasso':lasso,'ridge':ridge,'svc':svc,'lr':'','rf':rf,'rand_lasso':''}

In [60]:
#%%timeit
features_from_model = pd.DataFrame(feature_names,columns=['feature_name'])
coeffs = []

print(" Total Features: {}".format(len(feature_names)))

for clf,value in estimators.items():
    print("\n{}".format(clf))
    if value != '':
        print(eval(str(value)))
        model = eval(str(value))
        model = model.fit(features_train, labels_train)
        #print(model.coef_)
        #print(" Estimator {} selected features = {}".format(clf,rfe.n_features_))
        #print(" Features selected are: {}".format(feature_names[rfe.support_]))
        if clf in ['lasso']:
            coeffs = pd.Series(np.abs(model.coef_))
            coeffs = [str(x)[0:4] for x in coeffs]
            print(coeffs)
            #print("alpha is : {}".format(model.alpha_))
            #list = list(np.abs(model.coef_[0]))
            features_from_model[clf]= pd.Series(coeffs)
        elif clf in ['ridge','svc']:
            coeffs = pd.Series(np.abs(model.coef_[0]))
            coeffs = [str(x)[0:4] for x in coeffs]
            #print(coeffs)
            #list = list(np.abs(model.coef_[0]))
            features_from_model[clf]= pd.Series(coeffs)
        
        else: 
            coeffs = pd.Series(np.abs(model.feature_importances_))
            coeffs = [str(x)[0:4] for x in coeffs]
            features_from_model[clf]= pd.Series(coeffs) 
    else: 
        continue

 Total Features: 29

lasso
LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=-1, normalize=False,
    positive=False, precompute='auto', random_state=42, selection='cyclic',
    tol=0.0001, verbose=False)
['0.02', '0.00', '0.00', '0.0', '0.0', '0.01', '0.08', '0.0', '0.0', '0.02', '0.0', '0.0', '0.07', '0.0', '0.03', '0.0', '0.01', '0.0', '0.07', '0.0', '0.0', '0.0', '0.0', '0.07', '0.04', '0.03', '0.0', '0.07', '0.00']

ridge
RidgeClassifierCV(alphas=[0.01, 0.001, 0.1, 1.0, 10.0], class_weight=None,
         cv=3, fit_intercept=True, normalize=False, scoring=None)

svc
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

lr

rf
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', 

In [72]:
from sklearn.preprocessing import MinMaxScaler

In [74]:
MinMaxScaler(feature_range=features_from_model.lasso.values,self=True)

TypeError: __init__() got multiple values for argument 'self'

In [109]:
trans = scaler.fit_transform(features_from_model.iloc[:,1:].values)
temp = pd.DataFrame(trans)
temp



Unnamed: 0,0,1,2,3
0,0.25,0.25,0.189873,0.086957
1,0.0,0.0,0.037975,1.0
2,0.0,0.0,0.012658,0.956522
3,0.0,0.035714,0.063291,0.130435
4,0.0,0.0,0.025316,0.086957
5,0.125,0.142857,0.139241,0.086957
6,1.0,0.178571,0.43038,0.0
7,0.0,0.178571,0.43038,0.0
8,0.0,0.25,0.21519,0.086957
9,0.25,1.0,1.0,0.173913


Unnamed: 0,feature_name,lasso,ridge,svc,rf
0,SeniorCitizen,0.02,0.07,0.15,0.02
1,tenure,0.0,0.0,0.03,0.23
2,MonthlyCharges,0.0,0.0,0.01,0.22
3,gender_Male,0.0,0.01,0.05,0.03
4,Partner_Yes,0.0,0.0,0.02,0.02
5,Dependents_Yes,0.01,0.04,0.11,0.02
6,PhoneService_Yes,0.08,0.05,0.34,0.0
7,MultipleLines_No phone service,0.0,0.05,0.34,0.0
8,MultipleLines_Yes,0.0,0.07,0.17,0.02
9,InternetService_Fiber optic,0.02,0.28,0.79,0.04


In [96]:
scaler.fit(np.array(features_from_model.lasso.values))

ValueError: Expected 2D array, got 1D array instead:
array=[ 0.02  0.    0.    0.    0.    0.01  0.08  0.    0.    0.02  0.    0.
  0.07  0.    0.03  0.    0.01  0.    0.07  0.    0.    0.    0.    0.07
  0.04  0.03  0.    0.07  0.  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [86]:


features_from_model_scaled = pd.DataFrame(feature_names,columns=['feature_name'])

scaler = MinMaxScaler(copy=True, feature_range=(0, 1))


        

ValueError: Expected 2D array, got 1D array instead:
array=[ 0.02  0.    0.    0.    0.    0.01  0.08  0.    0.    0.02  0.    0.
  0.07  0.    0.03  0.    0.01  0.    0.07  0.    0.    0.    0.    0.07
  0.04  0.03  0.    0.07  0.  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
import matplotlib.pyplot as plt
print("Optimal number of features : {}".format(rfecv.n_features_))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

In [62]:
rfecv = RFECV(estimator=svc,step=1, cv=2,scoring='recall')
rfecv.fit(features_train, labels_train)

RFECV(cv=2,
   estimator=SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
   n_jobs=1, scoring='recall', step=1, verbose=0)

In [None]:
print("Optimal number of features : {}".format(rfecv.n_features_))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

In [None]:
rfecv.support_
rfecv.ranking_

In [None]:
def print_feature_ranks(names,scores,method=''):
    """ 
    print the ranked feature names and their scores from feature selection
    
    returns a data frame with feature names and its corresponding rank and scores
    """
    ranked_cols = pd.DataFrame(np.column_stack([names,scores]),columns=['feature_name','scores'])
    
    
    ranked_col[]
    
    ranked_cols = ranked_cols.sort_values(by='scores',ascending=False)
    ranked_cols['rank_order'] = ranked_cols.scores
        #if
    else:
        ranked_cols = ranked_cols.sort_values(by='scores',ascending=False)   
        ranked_cols['rank_order'] = list(range(1,ranked_cols.shape[0]+1))    
        
    return(ranked_cols)

In [None]:
## Let's select feature based on forward-backward

from sklearn.feature_selection import SelectFromModel


In [None]:
features_ranked =print_feature_ranks(feature_names,rf_fit.feature_importances_)
#features_ranked.sort_values(by='scores',ascending=False)
#features_ranked['rank']=np.arange(1,features_ranked.shape[0]+1)
#features_ranked

In [None]:
features_score = pd.DataFrame()
features_score['feature_name']=feature_names

In [None]:

features_score['rf'] = features_ranked['rank_order']

In [None]:
from sklearn.linear_model import Lasso,RandomizedLasso,LassoCV

lasso = Lasso(alpha=100)
lasso_fit = lasso.fit(features_train,labels_train)
features_ranked =print_feature_ranks(feature_names,abs(lasso_fit.coef_))
#print(features_ranked)
features_score['lasso']=features_ranked.rank_order
#features_score

0.25