# Imports

In [3]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import KFold, cross_validate, LeaveOneOut
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
import time
import numpy as np

# Read data in

In [4]:
# read from csv
data_df = pd.read_csv('./data/eig_centrality.csv', header=None)
# drop columns with zeros
data_df = data_df.loc[:, (data_df != 0).any(axis=0)]
data_df.head(20)

Unnamed: 0,0,1,2,6,7,10,12,13,14,15,...,285,286,287,288,289,290,291,292,293,294
0,1,-0.067894,-0.062832,-0.03124,-0.071004,-0.030888,0.009534,0.04466,0.043163,0.02544,...,-0.11276,-0.10607,0.0,0.0,-0.028195,-0.024616,-0.042535,-0.087451,0.003145,-0.047196
1,1,0.031793,0.025996,0.12203,0.10922,-0.005733,0.017223,-0.072822,-0.077072,-0.12665,...,0.078364,0.049836,0.0,6.938900000000001e-18,-0.076947,-0.078418,-0.064815,0.095082,0.054585,0.076347
2,1,-0.005251,-0.015806,-0.12404,-0.11702,-0.017355,-0.035984,0.10695,0.011851,0.077032,...,0.021125,0.01447,0.0,-1.7347000000000001e-18,0.068705,0.084321,0.047107,0.004649,-0.14276,-0.12945
3,1,0.12315,0.096576,-0.045236,-0.005143,-0.017959,-0.060064,0.008232,-0.07691,-0.042061,...,-0.043127,-0.024063,5.5511000000000006e-17,-1.1102e-16,-0.06451,-0.10038,-0.11295,-0.099919,-0.006036,-0.004228
4,1,-0.059846,-0.0245,-0.022105,0.022153,0.079495,-0.013115,0.019867,0.032033,0.012232,...,-0.026404,-0.004397,-4.3367999999999994e-19,0.0,-0.068379,0.006091,-0.050172,0.056009,0.077894,0.0165
5,1,-0.047201,-0.023249,0.10719,0.11774,-0.010293,-0.040863,-0.098316,-0.047187,-0.10188,...,-0.047042,-0.084535,0.0,0.0,-0.095798,-0.073266,-0.094619,0.084386,0.081942,0.10532
6,1,-0.089435,-0.080275,0.00114,-0.014447,0.027198,-0.026102,0.11672,0.049616,0.06091,...,-0.005671,-0.003008,0.0,0.0,-0.009444,0.0369,-0.022935,0.029724,-0.020147,-0.023445
7,1,-0.092537,0.003668,-0.12943,-0.10773,0.08869,0.042226,0.059388,0.03793,0.075362,...,-0.015637,0.001023,2.1683999999999997e-19,0.0,0.038453,0.11186,0.055049,-0.015476,-0.12596,-0.093968
8,1,0.022265,0.048711,0.048122,0.032252,0.037123,-0.059852,0.011789,-0.07129,-0.1098,...,-0.021779,-0.004315,2.7105e-20,0.0,-0.099726,-0.076015,-0.10614,-0.054908,0.038627,0.040091
9,1,-0.11108,-0.02374,-0.02701,-0.011543,0.089719,0.020014,0.013929,0.05809,0.067178,...,0.011984,0.053828,-6.938900000000001e-18,-2.7756e-17,0.059277,0.059997,0.10167,0.027837,0.02967,0.012809


## Pull X and y from dataframe

In [5]:
X = abs(data_df.iloc[:, 1:]) # take the absolute value
y = data_df.iloc[:,0]

print(X.shape)
print(y.shape)

(60, 273)
(60,)


# Forward Selection

In [72]:
# Init model
clf = svm.SVC(C=1000, gamma='auto')

# Get num features desired
num_features = data_df.shape[1] - 1 # get total number of features
num_features_del = 5 # number of features to take off
final_features = num_features - num_features_del # final number of features in the model
print(f'final features: {final_features}')

# Call sequential features selector
sfs = SequentialFeatureSelector(clf, n_features_to_select=final_features, cv=10, direction='forward')

# Fit the model
sfs.fit(X,y)

final features: 268


SequentialFeatureSelector(cv=10, estimator=SVC(C=1000, gamma='auto'),
                          n_features_to_select=268)

In [73]:
sfs.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [74]:
# Get index of features that were dropped
dropped_features = np.where(sfs.get_support() == False)[0]
print(dropped_features)

[185 194 234 236 240]


## View dropped features

In [75]:
## Create empty dict for storing results
dropped_dict = {}
for index in dropped_features:
    sample_points = X[index]
    dropped_dict[index] = sample_points

## Verify this works correctly
print(dropped_dict.keys())

dict_keys([185, 194, 234, 236, 240])


In [56]:
## Create an empty data frame for easy usage
pd.DataFrame(dropped_dict).head(60)

Unnamed: 0,144,155,194,197,243
0,0.009374,0.004708,0.045112,0.025727,0.083662
1,0.002818,0.005332,0.028392,0.009401,0.00455
2,0.034402,0.028343,0.039744,0.021268,0.082136
3,0.038833,0.11205,0.014235,0.023948,0.040527
4,0.02939,0.12425,0.073083,0.064858,0.001888
5,0.002788,0.012017,0.038674,0.035372,0.058978
6,0.01098,0.064797,0.024266,0.039258,0.030792
7,0.000276,0.002665,0.045647,0.040691,0.037514
8,0.081128,0.061879,0.037151,0.090069,0.067636
9,0.031749,0.037143,0.13801,0.000997,0.012898


## Transform data to drop these features

In [76]:
## Transform data from sfs
X_new = sfs.transform(X)
### Verify transformation
print(X.shape[1] - len(dropped_features)) # num features that should be left
print(X_new.shape[1])

268
268


## Train new model

In [77]:
clf = svm.SVC(C=1000, gamma='auto')
loo = LeaveOneOut()
results_cv = cross_validate(clf, X, y, cv=loo, return_train_score=True)

In [78]:
print(np.mean(results_cv['train_score']))
print(np.mean(results_cv['test_score']))

0.9833333333333333
0.5333333333333333


# Function for forward and backwards

In [16]:
def feature_selection(X, y, model, direction='forward',
                     features_dropped = 10, cv_sfs=10, cv_train=10, notes=None):
    print('####################')
    print('Begin Feature Selection')
    print('####################')

    # Create a model for sfs and training
    model_sfs = model
    model_train = model

    # Get length of data for final num features
    num_features = X.shape[1] # get total number of features
    final_features = num_features - features_dropped # final number of features in the model
    print(f'final features: {final_features}')

    # Call sequential features selector
    sfs = SequentialFeatureSelector(model_sfs, n_features_to_select=final_features, cv=cv_sfs, direction=direction)
    sfs.fit(X,y)
    
    # Get the dropped features
    dropped_features = np.where(sfs.get_support() == False)[0]
    print(f'Dropped Features: {dropped_features}')

    # Transform data and train a model
    X_new = sfs.transform(X)

    # Train SVM model
    print('-----------------')
    print(f'{model_train} being used')
    results = cross_validate(model_train, X_new, y, cv=cv_train, return_train_score=True)
    train_score = results['train_score']
    test_score = results['test_score']
    test_min = min(test_score)
    test_avg = np.mean(test_score)
    test_max = max(test_score)
    print(f'train scores: {train_score}')
    print(f'test scores: {test_score}')
    print('-----------------')

    ## Create a dict to store results (makes dataframes easier)
    results_dict = {
                    'model': str(model),
                    'direction': direction,
                    'features_dropped': [dropped_features],
                    'final_num_features': final_features,
                    'train_scores': [train_score.round(3)],
                    'test_scores': [test_score.round(3)],
                    'min_test': test_min,
                    'avg_test': test_avg,
                    'max_test': test_max,
                    'notes': notes,
                    }
                    
    # results_df = pd.DataFrame(results_dict)

    return(results_dict)

In [14]:
one_run = feature_selection(X, y, model=svm.SVC(C=1000, gamma='auto'), features_dropped=90)

####################
Begin Feature Selection
####################
final features: 243
Dropped Features: [ 15 104 113 149 150 152 154 159 168 178 185 188 194 200 202 203 208 211
 213 215 221 234 236 237 240 249 258 262 263 270]
-----------------
SVC(C=1000, gamma='auto') being used
train scores: [0.96296296 0.98148148 0.96296296 0.96296296 0.96296296 0.96296296
 0.96296296 0.96296296 0.98148148 0.96296296]
test scores: [0.66666667 0.5        0.83333333 0.33333333 0.83333333 0.66666667
 0.5        0.5        0.66666667 0.66666667]
-----------------


ValueError: All arrays must be of the same length

In [18]:
all_results = []
for x in [30,60,90]:
    results = feature_selection(X, y, model=svm.SVC(C=1000, gamma='auto'), features_dropped=x)
    print(results)
    all_results.append(results)
    

####################
Begin Feature Selection
####################
final features: 243
Dropped Features: [ 15 104 113 149 150 152 154 159 168 178 185 188 194 200 202 203 208 211
 213 215 221 234 236 237 240 249 258 262 263 270]
-----------------
SVC(C=1000, gamma='auto') being used
train scores: [0.96296296 0.98148148 0.96296296 0.96296296 0.96296296 0.96296296
 0.96296296 0.96296296 0.98148148 0.96296296]
test scores: [0.66666667 0.5        0.83333333 0.33333333 0.83333333 0.66666667
 0.5        0.5        0.66666667 0.66666667]
-----------------
{'model': "SVC(C=1000, gamma='auto')", 'direction': 'forward', 'features_dropped': [array([ 15, 104, 113, 149, 150, 152, 154, 159, 168, 178, 185, 188, 194,
       200, 202, 203, 208, 211, 213, 215, 221, 234, 236, 237, 240, 249,
       258, 262, 263, 270])], 'final_num_features': 243, 'train_scores': [array([0.963, 0.981, 0.963, 0.963, 0.963, 0.963, 0.963, 0.963, 0.981,
       0.963])], 'test_scores': [array([0.667, 0.5  , 0.833, 0.333, 0.833, 

# Experiment with methods needed to do exhaustive training

In [15]:
# Iterrate over the data frame for many X and y
num_features = data_df.shape[1] - 1 # get total number of features
num_features_del = 5 # number of features to take off
final_features = num_features - num_features_del # final number of features in the model
print(final_features)

289
