In this notebook I use filter methods to identify and remove uninformative features.

First, load the data

In [73]:
import pandas as pd
import numpy as np
# train.csv supplies the `critical_temp` target column that later cells rely on.
df1 = pd.read_csv('train.csv')

# unique_m.csv is joined column-wise; its own 'critical_temp' and 'material'
# columns are removed first so the target is not duplicated in the result.
other = pd.read_csv('unique_m.csv')
other = other.drop(columns=['critical_temp', 'material'])

df = pd.concat([df1, other], axis=1)

# Column count before any filtering, used for the summary at the end.
original_columns = df.shape[1]
print(df.shape)

(21263, 168)


Now remove all low-variance columns - those whose variance falls below 0.05 (for a binary feature this roughly corresponds to taking the same value 95% of the time).

In [74]:

# Drop low-variance columns.
# NOTE: `threshold` is a cutoff on each column's raw variance, not on how often
# a value repeats. Only for a 0/1 column does 0.05 roughly match "same value
# 95% of the time" (0.95 * 0.05 = 0.0475).
threshold = 0.05
from sklearn.feature_selection import VarianceThreshold

# The selector is fitted on every column currently in `df`.
selector = VarianceThreshold(threshold=threshold)
selector.fit(df)

# Rebuild the frame from the selector's output, labelled with the surviving columns.
kept_columns = df.columns[selector.get_support()]
df = pd.DataFrame(selector.transform(df), columns=kept_columns)
df.shape

(21263, 132)

Now drop any duplicated features, if they exist.

In [75]:
# Remove columns whose values exactly repeat an earlier column.
# drop_duplicates compares rows, so flip the frame, de-duplicate, then flip back.
columns_as_rows = df.T
unique_columns = columns_as_rows.drop_duplicates(keep='first')
df = unique_columns.T
print(df.shape)

(21263, 132)


Now drop any mutually correlated features.

In [76]:
# Drop features that are strongly correlated with an earlier feature.
#
# A column is discarded when its absolute correlation with ANY column that
# precedes it in `df` exceeds 0.8 (whether or not that earlier column is itself
# discarded), so the first member of each correlated group always survives.
#
# The target is excluded from the pairwise comparison altogether. Previously a
# feature located after `critical_temp` in column order was dropped when it
# correlated strongly with the target, i.e. the best predictors could be thrown
# away as "redundant".
correlation_matrix = df.corr()
feature_correlations = correlation_matrix.drop(
    index='critical_temp', columns='critical_temp', errors='ignore'
)

# Strict lower triangle: entry (i, j) with j < i compares column i against an
# earlier column j. NaN correlations compare as False and never trigger a drop.
exceeds_cutoff = np.tril(feature_correlations.abs().values > 0.8, k=-1)
mutually_correlated_features = set(
    feature_correlations.columns[exceeds_cutoff.any(axis=1)]
)
print(len(mutually_correlated_features))

# Rebind rather than mutate in place, and hand `drop` a deterministic list
# instead of an unordered set.
df = df.drop(columns=sorted(mutually_correlated_features))
df.shape

55


(21263, 77)

Now get rid of any features that have a very low correlation (absolute value of at most 0.1) with the target variable, `critical_temp`.

In [79]:
# Remove features whose absolute correlation with the target is at most the threshold.

correlation_threshhold = 0.1

# One-column frame holding each remaining column's correlation with the target.
corr = df.corr()[['critical_temp']]

# Rank by magnitude so the target's own entry (correlation 1 with itself) comes
# first; columns with an undefined (NaN) correlation are left out of the table.
corr = corr.assign(abs=corr['critical_temp'].abs())
corr = corr.sort_values(by='abs', ascending=False)
corr = corr.drop(columns='abs').dropna().reset_index()
corr = corr.rename(columns={'index': 'feature', 'critical_temp': 'corr'})
# Skip the first row of the ranking (the target itself).
corr = corr.loc[1:]

is_weak = corr['corr'].abs() <= correlation_threshhold
low_correlated_features = corr.loc[is_weak, 'feature'].tolist()
df.drop(columns=low_correlated_features, inplace=True)
df.shape

(21263, 36)

In [78]:
# Number of columns removed across all filtering steps.
removed_feature_count = original_columns - df.shape[1]
print(str(removed_feature_count) + ' features were found to be irrelevant')

132 features were found to be irrelevant


Now save the data set for modelling.

In [80]:
# Persist the filtered frame. The row index is written as the first column
# (to_csv's default, spelled out here), so readers of this file must account for it.
df.to_csv('data_filtered.csv', index=True)

# Sources:

https://stackabuse.com/applying-filter-methods-in-python-for-feature-selection/

https://stackabuse.com/applying-wrapper-methods-in-python-for-feature-selection/