In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from matplotlib import pyplot as plt
from itertools import compress

## Quasi-constant features

Quasi-constant features are those that show the same value for the great majority of the observations of the dataset. In general, these features provide little, if any, information that allows a machine learning model to discriminate or predict a target. But there can be exceptions. So you should be careful when removing these type of features.

Identifying and removing quasi-constant features, is an easy first step towards feature selection and more interpretable machine learning models.

To identify quasi-constant features, we can use the VarianceThreshold from Scikit-learn, or we can code it ourselves. If we use the VarianceThreshold, all our variables need to be numerical. If we code it manually however, we can apply the code to both numerical and categorical variables.


In [2]:
class FilterMethodQuasiConstantFeatures:
    
    def __init__(self,data_path):
        self.df=pd.read_csv(data_path)
        
    def dataframe_info(self):
        self.df.info()
        
    def dataframe_stats(self):
        print(self.df.describe().T)
        
    def __train_test_split_fmb(self):
        X_train, X_test, y_train, y_test = train_test_split(self.df.drop(['target'],axis='columns'),\
                                                            self.df['target'],\
                                                            test_size=0.3, \
                                                            random_state = 0
                                                           )
        return  X_train, X_test, y_train, y_test
    
    def __fit_summary(self,num_not_constant_features,num_constant_features,initial_shape_xtrain,initial_shape_xtest,X_train,X_test,var_threshold=0):        
            print('----Fit Summary------')
            print('---------------------')
            if var_threshold == 0:
                print('A total of '+str(num_not_constant_features)+' features are not constant')
                print('A total of '+str(num_constant_features)+' features are constant')
            else:
                print('A variance threshold  of '+str(var_threshold)+' is used')
                print('A total of '+str(num_not_constant_features)+' features are not constant')
                print('A total of '+str(num_constant_features)+' features are quasi constant')            
            print('The train shape before fit is '+str(initial_shape_xtrain))
            print('The train shape after fit is '+str(np.shape(X_test)))
            print('The test shape before fit is '+str(initial_shape_xtest))
            print('The test shape after fit is '+str(np.shape(X_train)))
            
    def __quasi_variance_summary(self,df_X_train,col_names):
        
        
        print('--------Summary of Value Percenteges--------------')
        
        for col in col_names:
            percent=round(df_X_train[col].value_counts()[0] / np.float(len(df_X_train))*100,4)
            value=df_X_train['var_1'].value_counts().index[0]
            print("For column "+col+" the value "+str(value)+" has "+str(percent)+" percent of values")
        
        
        
            
            
            
    def pandas_nunique_constant_features(self,print_summ=True):
        X_train, X_test, y_train, y_test = self.__train_test_split_fmb()   
        consant_features =[features for features in X_train.columns if X_train[features].nunique() == 1]
        
        initial_shape_xtrain = np.shape(X_train)
        initial_shape_xtest = np.shape(X_test)
        initial_num_features = len(X_train.columns)
        
        
        X_train.drop(consant_features,inplace=True,axis=1)
        X_test.drop(consant_features,inplace=True,axis=1)
        after_num_features = len(X_train.columns) 
        
        if print_summ:
            self.__fit_summary(after_num_features,
                               initial_num_features-after_num_features,
                               initial_shape_xtrain,
                               initial_shape_xtest,
                               X_train,
                               X_test)        
        if print_summ==False:
            return X_train, X_test, y_train, y_test
        
       
      
    
    
    
    def varinance_threshold(self,var_threshold):
        
        X_train, X_test, y_train, y_test = self.pandas_nunique_constant_features(print_summ=False)  
        col_names=X_train.columns
        # Make deep instead of shallow copy to create an entirly new datafeame
        df_X_train=X_train.copy(deep=True)
        sel = VarianceThreshold(threshold=var_threshold)
        sel.fit(X_train)  # fit finds the features with zero variance
        
        initial_shape_xtrain = np.shape(X_train)
        initial_shape_xtest = np.shape(X_test)
        
        X_train = sel.transform(X_train)
        X_test = sel.transform(X_test)
        
        # get_support is a boolean vector that indicates which features are retained
        # if we sum over get_support, we get the number of features that are not constant 
        # True os constant False is not constant
        
        self.__fit_summary(np.sum(sel.get_support()),
                           np.sum(~sel.get_support()),
                           initial_shape_xtrain,
                           initial_shape_xtest,
                           X_train,
                           X_test,
                           var_threshold)        
        col_names = list(compress(col_names,~sel.get_support()))
        
        self.__quasi_variance_summary(df_X_train,col_names)
        
        
    def quasi_constant_manual(self,threshold=0.998):
            
        X_train, X_test, y_train, y_test = self.pandas_nunique_constant_features(print_summ=False)    
        col_names=X_train.columns
        # Make deep instead of shallow copy to create an entirly new datafeame
        df_X_train=X_train.copy(deep=True)
        quasi_constant_feat=[]    
        for col in col_names:
            # find the predominant value, that is the value that is shared
            # by most observations
            predominant = (X_train[col].value_counts() / np.float(
            len(X_train))).sort_values(ascending=False).values[0]

            # evaluate the predominant feature: do more than 99% of the observations
            # show 1 value?
            if predominant > threshold:
                # if yes, add the variable to the list
                quasi_constant_feat.append(col)
        
        initial_shape_xtrain = np.shape(X_train)
        initial_shape_xtest = np.shape(X_test)
        
        
        X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
        X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)
            
       
          
        
        self.__fit_summary(len(X_train.columns)-len(quasi_constant_feat),
                           len(quasi_constant_feat),
                           initial_shape_xtrain,
                           initial_shape_xtest,
                           X_train,
                           X_test,
                           threshold)        
        
        self.__quasi_variance_summary(df_X_train,quasi_constant_feat)

In [3]:
obj1 = FilterMethodQuasiConstantFeatures('../data/dataset_1.csv')

In [4]:
obj1.dataframe_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 301 entries, var_1 to target
dtypes: float64(127), int64(174)
memory usage: 114.8 MB


In [5]:
obj1.dataframe_stats()

           count         mean           std  min  25%   50%  75%           max
var_1    50000.0     0.002220      0.108145  0.0  0.0  0.00  0.0  9.000000e+00
var_2    50000.0     0.000060      0.007746  0.0  0.0  0.00  0.0  1.000000e+00
var_3    50000.0    15.593002   1280.571855  0.0  0.0  0.00  0.0  2.079013e+05
var_4    50000.0     3.149633      2.740114  0.0  0.0  2.85  3.0  3.528000e+01
var_5    50000.0   608.681764  10951.361737  0.0  0.0  0.00  0.0  4.455000e+05
...          ...          ...           ...  ...  ...   ...  ...           ...
var_297  50000.0     0.000000      0.000000  0.0  0.0  0.00  0.0  0.000000e+00
var_298  50000.0     0.003060      0.078808  0.0  0.0  0.00  0.0  3.000000e+00
var_299  50000.0    12.462960    832.417622  0.0  0.0  0.00  0.0  1.346667e+05
var_300  50000.0  5683.960293  47364.820421  0.0  0.0  0.00  0.0  2.857673e+06
target   50000.0     0.039820      0.195538  0.0  0.0  0.00  0.0  1.000000e+00

[301 rows x 8 columns]


In [6]:
obj1.pandas_nunique_constant_features()

----Fit Summary------
---------------------
A total of 266 features are not constant
A total of 34 features are constant
The train shape before fit is (35000, 300)
The train shape after fit is (15000, 266)
The test shape before fit is (15000, 300)
The test shape after fit is (35000, 266)


In [7]:
obj1.varinance_threshold(var_threshold=0.01)

----Fit Summary------
---------------------
A variance threshold  of 0.01 is used
A total of 215 features are not constant
A total of 51 features are quasi constant
The train shape before fit is (35000, 266)
The train shape after fit is (15000, 215)
The test shape before fit is (15000, 266)
The test shape after fit is (35000, 215)
--------Summary of Value Percenteges--------------
For column var_1 the value 0 has 99.9629 percent of values
For column var_2 the value 0 has 99.9971 percent of values
For column var_7 the value 0 has 99.9886 percent of values
For column var_9 the value 0 has 99.9886 percent of values
For column var_10 the value 0 has 99.9943 percent of values
For column var_19 the value 0 has 99.0114 percent of values
For column var_28 the value 0 has 99.9943 percent of values
For column var_36 the value 0 has 99.9971 percent of values
For column var_43 the value 0 has 99.9057 percent of values
For column var_45 the value 0 has 99.92 percent of values
For column var_53 the 

In [8]:
obj2 = FilterMethodQuasiConstantFeatures('../data/dataset_1.csv')
obj2.quasi_constant_manual(threshold=0.998)

----Fit Summary------
---------------------
A variance threshold  of 0.998 is used
A total of 50 features are not constant
A total of 108 features are quasi constant
The train shape before fit is (35000, 266)
The train shape after fit is (15000, 158)
The test shape before fit is (15000, 266)
The test shape after fit is (35000, 158)
--------Summary of Value Percenteges--------------
For column var_1 the value 0 has 99.9629 percent of values
For column var_2 the value 0 has 99.9971 percent of values
For column var_3 the value 0 has 99.9629 percent of values
For column var_6 the value 0 has 99.9943 percent of values
For column var_7 the value 0 has 99.9886 percent of values
For column var_9 the value 0 has 99.9886 percent of values
For column var_10 the value 0 has 99.9943 percent of values
For column var_11 the value 0 has 99.9943 percent of values
For column var_12 the value 0 has 99.9971 percent of values
For column var_14 the value 0 has 99.9971 percent of values
For column var_16 the