# Notebook 3 - Feature Selection

In [25]:
!pip install tqdm



In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE, chi2, f_regression

import numpy as np
import pandas as pd
import tqdm

import pickle
from sklearn.externals import joblib

# Feature Selection

## on UCI

In [27]:
uci_madelon = joblib.load('./pickles/uci_madelon1new.pkl')

In [28]:
uci_madelon_no_target = uci_madelon.drop(['target'], axis = 1)

In [29]:
uci_madelon.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
76,481,487,474,488,561,482,354,478,468,471,...,482,493,697,486,567,476,508,554,536,-1
355,476,502,529,493,490,468,446,476,490,464,...,471,514,420,454,557,482,457,510,500,-1
916,469,454,489,481,500,478,461,475,477,481,...,479,498,589,517,556,484,476,539,473,-1
1694,470,513,496,478,467,481,454,476,488,486,...,469,527,296,478,556,477,509,527,520,-1
133,483,480,513,502,520,481,502,477,519,475,...,486,531,550,502,497,480,486,496,462,1


### Correlation Matrix

In [30]:
corr_matrix = uci_madelon_no_target.corr()

In [31]:
n = corr_matrix.shape[0]

for i in range(n):
    corr_matrix.iloc[i,i] = 0

corr_matrix = corr_matrix.abs()

max_corr = corr_matrix.max()

In [35]:
feats = list(corr_matrix.loc[:, max_corr > .6].columns)
print(len(feats))
feats

20


[28,
 48,
 64,
 105,
 128,
 153,
 241,
 281,
 318,
 336,
 338,
 378,
 433,
 442,
 451,
 453,
 455,
 472,
 475,
 493]

In [None]:
joblib.dump(feats, 'uci_feats.pkl')

### SelectKBest

In [46]:
skb = SelectKBest()

In [47]:
 def find_KBest(data):
    '''
    Find the 20 best features from a set of data. Data input should come without a target column.
    '''
    for col in data.columns:
        
        X = data.drop([col], axis = 1)
        y = data[col]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)
        
        fit = SelectKBest(score_func= f_regression, k=20).fit(X_train,y_train)
    
        skb_feats = list(np.where(fit.get_support())[0])
        
    return skb_feats

In [48]:
skb_feats = find_KBest(uci_madelon_no_target)
skb_feats

[14,
 20,
 83,
 86,
 103,
 110,
 160,
 163,
 211,
 235,
 238,
 256,
 280,
 289,
 299,
 344,
 432,
 438,
 460,
 497]

### SelectFromModel

In [None]:
X = uci_madelon_no_target
y = uci_madelon['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [None]:
sfm_logr = SelectFromModel(LogisticRegression(), threshold = '2.85*mean')

In [None]:
sfm_logr.fit(X_train, y_train)

In [None]:
sfm_logr_feats = np.where(sfm_logr.get_support())[0]

sfm_logr_feats

### RFE

In [None]:
rfe = RFE(LogisticRegression(), n_features_to_select=20, step = 5, verbose=1)

In [None]:
rfe.fit(X_train, y_train)

In [None]:
rfe_feats = np.where(rfe.get_support())[0]
rfe_feats

### Mean R2

In [44]:
def calculate_r_2_for_feature(data, feature, regression_method):
    '''
    Given a dataset, a feature, and a regression method, return the R2 for that feature.
    '''
    new_data = data.drop(feature, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(new_data, data[feature], test_size=0.33)

    regressor = regression_method
    regressor.fit(X_train, y_train)

    score = regressor.score(X_test, y_test)
    return score


def mean_r2_for_feature(data, feature, regression_method):
    '''
    Take 3 iterations of caclulating the R2 for a feature, then take the average.
    '''
    scores = []
    
    for _ in range(3):
        scores.append(calculate_r_2_for_feature(data, feature, regression_method))   
    
    scores = np.array(scores)
    mean_scores = scores.mean()
    
    return mean_scores 

def return_feature_scores(data, regression_method):
    '''
    Returns the scores of all features. 
    '''
    feature_scores = []
    columns = data.columns
    
    for col in columns:
        mean_r2 = mean_r2_for_feature(data, col, regression_method)
        
        if mean_r2 > 0:
            feature_scores.append((col, mean_r2))
            
        else:
            pass
        
    return feature_scores

In [None]:
return_feature_scores(uci_madelon, KNeighborsRegressor(n_jobs = -1))

## on Josh's

`j_madelon_test.pkl` is a sample set with shape `(2000, 1002)`

In [36]:
j_madelon = joblib.load('./pickles/j_madelon_test.pkl')
j_madelon = j_madelon.drop(['_id'], axis = 1)

In [37]:
Xj = j_madelon.drop(['target'], axis = 1)
yj = j_madelon['target']

### Correlation Matrix

In [38]:
corr_matrix_j = j_madelon.corr()

In [39]:
np.fill_diagonal(corr_matrix_j.values, 0)

In [40]:
corr_matrix_j = corr_matrix_j.abs()

max_corr_j = corr_matrix_j.max()

In [42]:
feats_j = list(corr_matrix_j.loc[:, max_corr_j > .6].columns)
print(len(feats_j))
feats_j

20


['feat_257',
 'feat_269',
 'feat_308',
 'feat_315',
 'feat_336',
 'feat_341',
 'feat_395',
 'feat_504',
 'feat_526',
 'feat_639',
 'feat_681',
 'feat_701',
 'feat_724',
 'feat_736',
 'feat_769',
 'feat_808',
 'feat_829',
 'feat_867',
 'feat_920',
 'feat_956']

In [None]:
[257, 269, 308, 315, 336, 341, 395, 504, 526, 639, 681, 701, 724, 736, 769, 808, 829, 867, 920, 956]

In [None]:
joblib.dump(feats_j, 'josh_feats.pkl')

In [45]:
return_feature_scores(j_madelon, Log(n_jobs = -1))

NameError: name 'Log' is not defined

## Dimension Reduction

### UCI

Decided to go with the 20 features found from the correlation matrix technique.

`28, 48, 64, 105, 128, 153, 241, 281, 318, 336, 338, 378, 433, 442, 451, 453, 455, 472, 475, 493`


In [1]:
from sklearn.decomposition import PCA

In [None]:
pca_uci = PCA(n_components=5)

In [None]:
pca_uci.fit(uci_madelon[feats])

In [None]:
pd.DataFrame(pca_uci.components_)

In [None]:
pca_uci.explained_variance_

In [None]:
uci_madelon_pca = pca_uci.transform(uci_madelon[feats])

In [None]:
uci_madelon_pca_df = pd.DataFrame(uci_madelon_pca)

In [None]:
uci_madelon_pca_df.head()

### Josh's

In [5]:
pca_josh = PCA(n_components=5)

In [10]:
pca_josh.fit(j_madelon[feats_j])

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [11]:
pd.DataFrame(pca_josh.components_)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.13682,-0.025303,0.011481,0.085652,-0.352789,-0.436513,-0.22745,-0.260787,0.216467,-0.355875,0.06159,-0.170021,0.15918,-0.164475,0.178942,0.06201,-0.33865,0.190219,0.070552,-0.296849
1,-0.197858,0.47746,-0.001346,0.328745,-0.052728,0.240963,0.247158,-0.000237,0.038109,-0.157868,0.109902,-0.408108,-0.370962,0.150923,-0.14519,0.079236,0.030978,0.195115,-0.20219,-0.173821
2,-0.257227,-0.22211,0.302716,-0.059602,-0.313101,-0.016421,0.135864,0.313477,-0.28203,0.049,-0.141322,0.23105,-0.376963,-0.240142,-0.026569,0.406151,-0.199558,0.113449,-0.023734,0.017154
3,-0.184331,0.043123,-0.125733,-0.048039,0.156635,0.018595,0.243039,-0.305444,-0.18724,-0.089942,0.210504,0.251765,-0.094657,0.19815,-0.318295,-0.043596,-0.471412,-0.255484,0.414047,-0.08709
4,-0.155032,-0.178836,-0.511852,0.213431,0.066594,-0.135757,0.225375,0.287196,0.141837,-0.012524,-0.491737,-0.001917,-0.055853,0.298546,0.28294,0.049971,-0.049219,-0.079542,0.124987,-0.137989


In [12]:
pca_josh.explained_variance_

array([ 29.20786426,  24.88210071,  11.17649782,   7.95997795,   5.89482313])

In [15]:
josh_madelon_pca = pca_josh.transform(j_madelon[feats_j])

In [16]:
josh_madelon_pca_df = pd.DataFrame(josh_madelon_pca)

In [18]:
josh_madelon_pca_df.head()

Unnamed: 0,0,1,2,3,4
0,5.988974,-0.251109,2.248139,0.518833,0.283657
1,1.702542,4.178978,-1.353208,1.148923,-5.052881
2,4.043677,2.912308,0.004676,0.987972,-2.882875
3,4.675206,1.756221,-2.332146,1.507797,0.840058
4,-3.230838,-1.687809,0.769312,-2.583358,-1.609537


small change