# Feature selection: dropping constant features

In this step we remove constant features. A column that holds the same value in every row carries no information, so it cannot affect model training.

In [1]:
import pandas as pd

In [4]:
# Toy frame for the demo: A and B vary, while C and D each repeat one value.
data = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": [4, 5, 6, 7, 8, 9],
        "C": [0] * 6,
        "D": [1] * 6,
    }
)

In [5]:
# Show the toy frame; columns C and D each hold one repeated value.
data

Unnamed: 0,A,B,C,D
0,1,4,0,1
1,2,5,0,1
2,4,6,0,1
3,1,7,0,1
4,2,8,0,1
5,4,9,0,1


# Variance threshold

Feature selector that removes all low-variance features.
This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [7]:
from sklearn.feature_selection import VarianceThreshold

# threshold=0 keeps only the columns whose variance is strictly above zero,
# i.e. it rejects every column that holds the same value in all rows.
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(data)

In [8]:
# Boolean mask aligned with data's columns: True means the selector keeps that feature.
var_thres.get_support()

array([ True,  True, False, False])

In [10]:
# Index the column labels with the support mask to get the names of the retained features.
data.columns[var_thres.get_support()]

Index(['A', 'B'], dtype='object')

In [12]:
# The constant columns are exactly the ones the selector rejected, so invert the
# support mask once instead of re-querying the selector (and rebuilding the
# kept-column Index) for every single column.
const_col = data.columns[~var_thres.get_support()].tolist()
print(len(const_col))

2


In [13]:
# Print the name of every column flagged as constant, one per line.
for feature in const_col:
    print(feature)

C
D


# Let's practise on a bigger dataset

In [3]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [76]:
# Load only the first 1000 rows of the CSV (nrows caps how many rows are read).
df = pd.read_csv("santanderTrain.csv", nrows=1000)

In [77]:
# (rows, columns) of the loaded sample.
df.shape

(1000, 371)

In [78]:
# Preview the first five rows of the sample.
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,117310.979016,0


In [79]:
from sklearn.model_selection import train_test_split

# Features are every column except the label column TARGET.
# NOTE(review): the ID column stays in X here — confirm that is intended.
X = df.drop(columns=["TARGET"])
y = df["TARGET"]

# Hold out 30% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [80]:
# Row count of the training features.
len(X_train)

700

In [81]:
# Row count of the held-out features.
len(X_test)

300

In [82]:
# Label count for the training split.
len(y_train)

700

In [83]:
# Label count for the held-out split.
len(y_test)

300

In [84]:
# Shapes of the two feature splits, shown side by side.
X_train.shape, X_test.shape

((700, 370), (300, 370))

Let's apply the variance threshold to the training split.

In [95]:
# Fit a fresh zero-variance selector, this time on the training split.
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_train)

In [96]:
# Boolean mask over the fitted columns: True means the selector keeps that feature.
var_thres.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [97]:
#finding non constant features
# Count the retained (non-constant) features: every True in the mask adds one.
var_thres.get_support().sum()

102

In [98]:
#Lets find non-constant features
# Number of non-constant columns, obtained by masking the column index.
X_train.columns[var_thres.get_support()].size

102

In [99]:
# The constant columns are exactly the ones the selector rejected, so invert the
# support mask once instead of re-querying the selector (and rebuilding the
# kept-column Index) for each of the training columns.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()
print(len(constant_columns))

0


In [100]:
# Print the name of every training column flagged as constant, one per line.
for column in constant_columns:
    print(column)

In [101]:
# Remove the constant columns from the training features.
# NOTE(review): X_test is not filtered in this cell — confirm it gets the same
# columns removed before it is passed to a model.
# NOTE(review): this rebinds X_train, so re-running the selector cells after
# this one fits on the already-filtered frame.
X_train = X_train.drop(columns=constant_columns)

In [102]:
# Display the training features after the constant columns were dropped.
X_train

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,saldo_medio_var17_hace2,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
105,195,2,28,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93861.78
68,144,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,165258.42
479,965,2,24,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61653.75
399,798,2,34,0.0,1200.45,1698.42,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119707.20
434,864,2,28,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109607.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,1660,2,38,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45189.33
192,378,2,27,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145599.06
629,1254,2,31,0.0,36.90,36.90,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99613.92
559,1117,2,23,0.0,0.00,0.00,0.0,0.0,0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48869.88
