In [131]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [62]:
df = pd.read_csv("data/feature_selection.csv")

In [63]:
df.head()

Unnamed: 0,ID,age,class,A.c1.1,A.c1.2,A.c1.3,A.c1.4,A.c1.5,A.c2.1,A.c2.2,...,J.c8.1,J.c8.2,J.c8.3,J.c8.4,J.c8.5,J.c9.1,J.c9.2,J.c9.3,J.c9.4,J.c9.5
0,5560,3,typ,2.238984,3.238984,4.238984,5.238984,6.238984,2.539386,3.539386,...,2.33063,3.33063,4.33063,5.33063,6.33063,0.105146,1.105146,2.105146,3.105146,4.105146
1,4694,3,typ,1.490947,2.490947,3.490947,4.490947,5.490947,0.692924,1.692924,...,0.033946,1.033946,2.033946,3.033946,4.033946,-0.921489,0.078511,1.078511,2.078511,3.078511
2,6449,3,typ,1.828413,2.828413,3.828413,4.828413,5.828413,2.995978,3.995978,...,-0.309544,0.690456,1.690456,2.690456,3.690456,1.838188,2.838188,3.838188,4.838188,5.838188
3,3008,3,asd,1.930039,2.930039,3.930039,4.930039,5.930039,2.698195,3.698195,...,0.727438,1.727438,2.727438,3.727438,4.727438,2.793029,5.793029,10.793029,17.793029,26.793029
4,3863,3,typ,2.272464,3.272464,4.272464,5.272464,6.272464,1.539144,2.539144,...,2.168858,3.168858,4.168858,5.168858,6.168858,-0.938,0.062,1.062,2.062,3.062


## Initial look
There are 453 columns, where 3 are categorical. Since age only has 1 value, this column will be dropped. ID will also be dropped since it is just an identifier of the data. Class will be label encoded, to 0 and 1; one hot encoding is unecessary since it only has 2 values.
* Categorical
    * ID: 818 unique values
    * age: 1 unique
    * class: 2 unique values
* Continuous
    * 450 columns of 10metrics x 9channels x 5 scales
Dropping ID and age results in 451 columns

In [117]:
le = preprocessing.LabelEncoder()
le.fit(df['class'])
target = le.transform(df['class']) 

LabelEncoder()

To run PCA, the continuous columns need to go through normalization

In [118]:
columns = df.drop(columns=["ID", "age", "class"]).columns
x = df[columns].values
min_max_scaler = preprocessing.MinMaxScaler()

x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=columns, index = df.index)
df[columns] = df_temp

Before proceeding further, we drop any columns that correlate more than 0.80. This drops 360 columns out of the 450 columns

In [119]:
# Create correlation matrix
df2 = df.drop(columns=["ID", "age", "class"])
corr_matrix = df2.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
print(len(to_drop))
df3 = df2.drop(columns=to_drop)
df3["class"] = df["class"]

360


In [120]:
df3.head()

Unnamed: 0,A.c1.1,A.c2.1,A.c3.1,A.c4.1,A.c5.1,A.c6.1,A.c7.1,A.c8.1,A.c9.1,B.c1.1,...,J.c1.1,J.c2.1,J.c3.1,J.c4.1,J.c5.1,J.c6.1,J.c7.1,J.c8.1,J.c9.1,class
0,0.811133,0.88544,0.88687,0.265634,0.473976,0.195284,0.838154,0.732288,0.686819,0.961677,...,0.583561,0.728455,0.823503,0.306806,0.661808,0.286801,0.522014,0.832693,0.276217,1
1,0.623756,0.42216,0.298063,0.107139,0.23497,0.211714,0.825197,0.630345,0.272556,0.611247,...,0.799576,0.077491,0.663626,0.866635,0.563726,0.369473,0.107392,0.257865,0.019267,1
2,0.708289,1.0,0.78576,0.171995,0.063152,0.255645,0.97892,0.1709,0.506626,0.297477,...,0.362484,0.189789,0.211993,0.975401,0.311083,0.234945,0.447001,0.171894,0.709969,1
3,0.733745,0.925286,0.724177,0.438271,0.731878,0.87126,0.747149,0.89045,0.884787,0.179786,...,0.18596,0.741472,0.294284,0.529571,0.176256,0.573982,0.3587,0.431436,0.94895,0
4,0.81952,0.634478,0.927901,0.656281,0.310154,0.38352,0.579619,0.585991,0.412768,0.35276,...,0.058116,0.640197,0.359781,0.192486,0.105798,0.829789,0.816374,0.792204,0.015135,1


In [172]:
df3.columns

Index(['A.c1.1', 'A.c2.1', 'A.c3.1', 'A.c4.1', 'A.c5.1', 'A.c6.1', 'A.c7.1',
       'A.c8.1', 'A.c9.1', 'B.c1.1', 'B.c2.1', 'B.c3.1', 'B.c4.1', 'B.c5.1',
       'B.c6.1', 'B.c7.1', 'B.c8.1', 'B.c9.1', 'C.c1.1', 'C.c2.1', 'C.c3.1',
       'C.c4.1', 'C.c5.1', 'C.c6.1', 'C.c7.1', 'C.c8.1', 'C.c9.1', 'D.c1.1',
       'D.c2.1', 'D.c3.1', 'D.c4.1', 'D.c5.1', 'D.c6.1', 'D.c7.1', 'D.c8.1',
       'D.c9.1', 'E.c1.1', 'E.c2.1', 'E.c3.1', 'E.c4.1', 'E.c5.1', 'E.c6.1',
       'E.c7.1', 'E.c8.1', 'E.c9.1', 'F.c1.1', 'F.c2.1', 'F.c3.1', 'F.c4.1',
       'F.c5.1', 'F.c6.1', 'F.c7.1', 'F.c8.1', 'F.c9.1', 'G.c1.1', 'G.c2.1',
       'G.c3.1', 'G.c4.1', 'G.c5.1', 'G.c6.1', 'G.c7.1', 'G.c8.1', 'G.c9.1',
       'H.c1.1', 'H.c2.1', 'H.c3.1', 'H.c4.1', 'H.c5.1', 'H.c6.1', 'H.c7.1',
       'H.c8.1', 'H.c9.1', 'I.c1.1', 'I.c2.1', 'I.c3.1', 'I.c4.1', 'I.c5.1',
       'I.c6.1', 'I.c7.1', 'I.c8.1', 'I.c9.1', 'J.c1.1', 'J.c2.1', 'J.c3.1',
       'J.c4.1', 'J.c5.1', 'J.c6.1', 'J.c7.1', 'J.c8.1', 'J.c9.1', 'class'],

We can see that columns with scale value other than 1 are all dropped.

## Feature selection through Random Forest and Feature Selection
Using the feature importances on a randomforest model, we can further downsize the number of features that are important to make a prediction

In [126]:
X = df3.drop(columns=["class"]) 
y = df3["class"]

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [140]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((572, 90), (572,), (246, 90), (246,))

In [152]:
clf = RandomForestRegressor(n_estimators=1000, random_state=42)
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

NameError: name 'feat_labels' is not defined

In [153]:
# Print the name and gini importance of each feature
for feature in zip(X.columns, clf.feature_importances_):
    print(feature)

('A.c1.1', 0.013229666827143634)
('A.c2.1', 0.008117586225198498)
('A.c3.1', 0.007332670270880795)
('A.c4.1', 0.015099052576595265)
('A.c5.1', 0.010746426382779839)
('A.c6.1', 0.014386425491933514)
('A.c7.1', 0.014830024516218851)
('A.c8.1', 0.01122942681813424)
('A.c9.1', 0.015865972996893105)
('B.c1.1', 0.011684286879637953)
('B.c2.1', 0.009433618485980523)
('B.c3.1', 0.025069284532330486)
('B.c4.1', 0.014536585286922539)
('B.c5.1', 0.016197892129008958)
('B.c6.1', 0.007881765701559408)
('B.c7.1', 0.006909048090532318)
('B.c8.1', 0.014382235048276305)
('B.c9.1', 0.01732521648869488)
('C.c1.1', 0.01450102578805327)
('C.c2.1', 0.019313599760212376)
('C.c3.1', 0.010162856304565794)
('C.c4.1', 0.0069521276558200725)
('C.c5.1', 0.006482140231444947)
('C.c6.1', 0.012871875234721185)
('C.c7.1', 0.005312070087340883)
('C.c8.1', 0.006073919719040211)
('C.c9.1', 0.010730808995679006)
('D.c1.1', 0.007800711801237042)
('D.c2.1', 0.008353386097702722)
('D.c3.1', 0.012004506197366462)
('D.c4.1', 0

Looking at the feature importance values, most features dont exceed 0.01. We choose this as our threshold to select the features

In [158]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.01
sfm = SelectFromModel(clf, threshold=0.01)

# Train the selector
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.01)

In [161]:
# Print the names of the most important features
features = []
for feature_list_index in sfm.get_support(indices=True):
    features.append(X.columns[feature_list_index])

In [170]:
len(features)

51

In [171]:
for f in features:
    #print(f, end=', ', flush=True)
    print(f)

A.c1.1
A.c4.1
A.c5.1
A.c6.1
A.c7.1
A.c8.1
A.c9.1
B.c1.1
B.c3.1
B.c4.1
B.c5.1
B.c8.1
B.c9.1
C.c1.1
C.c2.1
C.c3.1
C.c6.1
C.c9.1
D.c3.1
D.c4.1
D.c5.1
D.c8.1
E.c1.1
E.c2.1
E.c3.1
E.c4.1
E.c5.1
F.c2.1
F.c4.1
F.c6.1
F.c7.1
G.c2.1
G.c7.1
G.c8.1
G.c9.1
H.c6.1
H.c7.1
H.c8.1
I.c1.1
I.c2.1
I.c3.1
I.c4.1
I.c6.1
I.c8.1
I.c9.1
J.c2.1
J.c3.1
J.c4.1
J.c5.1
J.c7.1
J.c8.1


After going through feature selection, we get 51 features shown above.
In all
    * we reduced 452 features to 450 by dropping ID and age
    * Reduced from 450 to 90 by removing columns that correlated more than 0.8
    * Reduced from 90 to 51 through feature selection