In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from matplotlib import pyplot as plt
from itertools import compress
from sklearn.metrics.pairwise import cosine_similarity

In [184]:
class FilterMethodDuplicateFeatures:
    
    def __init__(self,data_path):
        self.df=pd.read_csv(data_path)
        
    def dataframe_info(self):
        self.df.info()
        
    def dataframe_stats(self):
        print(self.df.describe().T)
        
    def __train_test_split_fmb(self):
        X_train, X_test, y_train, y_test = train_test_split(self.df.drop(['target'],axis='columns'),\
                                                            self.df['target'],\
                                                            test_size=0.3, \
                                                            random_state = 0
                                                           )
        return  X_train, X_test, y_train, y_test
    
    
    def __quasi_constant_manual(self,threshold=0.998):
        X_train, X_test, y_train, y_test = self.__train_test_split_fmb()     
        col_names=X_train.columns
        # Make deep instead of shallow copy to create an entirly new datafeame
        quasi_constant_feat=[]    
        for col in col_names:
            # find the predominant value, that is the value that is shared
            # by most observations
            predominant = (X_train[col].value_counts() / np.float(
            len(X_train))).sort_values(ascending=False).values[0]

            # evaluate the predominant feature: do more than 99% of the observations
            # show 1 value?
            if predominant > threshold:
                # if yes, add the variable to the list
                quasi_constant_feat.append(col)
        
        X_train.drop(labels=quasi_constant_feat, axis=1, inplace=True)
        X_test.drop(labels=quasi_constant_feat, axis=1, inplace=True)        
            
        return X_train,X_test
    
    
    def __fit_summary(self,dup_list,before_test_shape,after_test_shape,before_train_shape,after_train_shape):
        
        print('--------summary of duplicate features------------------')
        print('There are a total of '+str(len(dup_list))+" duplicate features")
        print('--------------------------------------------------------------')
        for l in dup_list:
            print(l[1]+" is a duplicate of "+l[0]+" and will be dropped")
        print('--------------------------------------------------------------')
        print('Train Shape before drop '+str(before_test_shape))
        print('Train Shape after drop '+str(after_test_shape))
        print('Test Shape before drop '+str(before_train_shape))
        print('Test Shape after drop '+str(after_train_shape))
    
    
    def duplicate_features(self):
        X_train,X_test = self.__quasi_constant_manual()
        
        dup_dict={}
        dup_list=[]
        col_list = X_train.columns
        drop_list = []
        
        for i in  range(len(X_train.columns)):
            for j in range(i,len(X_train.columns)):
                if (col_list[i] != col_list[j]) and (X_train.iloc[:,i].equals(X_train.iloc[:,j])):
                    dup_list.append([col_list[i],col_list[j]])
                    drop_list.append(col_list[j])
        
        before_train_shape = np.shape(X_train)
        before_test_shape  = np.shape(X_test)
        
        X_train.drop(drop_list,axis=1,inplace=True)  
        X_test.drop(drop_list,axis=1,inplace=True)
        
        after_train_shape = np.shape(X_train)
        after_test_shape = np.shape(X_test)
          
        self.__fit_summary(dup_list,before_test_shape,after_test_shape,before_train_shape,after_train_shape)    

In [185]:
obj1 = FilterMethodDuplicateFeatures('../data/dataset_1.csv')

In [186]:
obj1.dataframe_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 301 entries, var_1 to target
dtypes: float64(127), int64(174)
memory usage: 114.8 MB


In [187]:
obj1.dataframe_stats()

           count         mean           std  min  25%   50%  75%           max
var_1    50000.0     0.002220      0.108145  0.0  0.0  0.00  0.0  9.000000e+00
var_2    50000.0     0.000060      0.007746  0.0  0.0  0.00  0.0  1.000000e+00
var_3    50000.0    15.593002   1280.571855  0.0  0.0  0.00  0.0  2.079013e+05
var_4    50000.0     3.149633      2.740114  0.0  0.0  2.85  3.0  3.528000e+01
var_5    50000.0   608.681764  10951.361737  0.0  0.0  0.00  0.0  4.455000e+05
...          ...          ...           ...  ...  ...   ...  ...           ...
var_297  50000.0     0.000000      0.000000  0.0  0.0  0.00  0.0  0.000000e+00
var_298  50000.0     0.003060      0.078808  0.0  0.0  0.00  0.0  3.000000e+00
var_299  50000.0    12.462960    832.417622  0.0  0.0  0.00  0.0  1.346667e+05
var_300  50000.0  5683.960293  47364.820421  0.0  0.0  0.00  0.0  2.857673e+06
target   50000.0     0.039820      0.195538  0.0  0.0  0.00  0.0  1.000000e+00

[301 rows x 8 columns]


In [188]:
obj1.duplicate_features()

--------summary of duplicate features------------------
There are a total of 6 duplicate features
--------------------------------------------------------------
var_148 is a duplicate of var_37 and will be dropped
var_199 is a duplicate of var_84 and will be dropped
var_296 is a duplicate of var_143 and will be dropped
var_250 is a duplicate of var_177 and will be dropped
var_232 is a duplicate of var_226 and will be dropped
var_269 is a duplicate of var_229 and will be dropped
--------------------------------------------------------------
Train Shape before drop (15000, 158)
Train Shape after drop (15000, 152)
Test Shape before drop (35000, 158)
Test Shape after drop (35000, 152)
