# SFFS(Sequential Forward Floating Selection)

- 첨부의 csv 파일 로딩 및 조회

- SFFS(Sequential Forward Floating Selection) 알고리즘을 적용하여 최적의 feature subset 을 도출하고, 해당 feature set 의 정확도 조회

- 분류 알고리즘은 Random Forest 분류기를 사용

- 학습용 데이터와 테스트용 데이터는 75% : 25% 비율로 분할


##### 추가 사항 

- k_features= 5 로 고정된 부분을, 성능이 가장 높은 feature 개수를 자동으로 찾도록 소스 변경

In [1]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

### 첨부의 csv 파일 로딩 및 조회

In [2]:
# Load the diabetes dataset. The path is a raw string so the Windows
# backslashes are taken literally instead of being parsed as escape
# sequences ("\T" and "\d" are invalid escapes and trigger a warning).
df = pd.read_csv(r"C:\Temp\diabetes.csv")

In [3]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Collect the DataFrame's column labels as a plain list of strings.
col = [str(label) for label in df.columns]

In [5]:
# Bare expression: show the list of column names as this cell's output.
col

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [6]:
# Feature matrix: every column except the last one listed in `col`.
X = df.loc[:, col[:-1]]

In [7]:
# Bare expression: show the feature matrix as this cell's output.
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [8]:
# Target vector: the last column listed in `col`.
y = df.loc[:, col[-1]]

In [9]:
# Bare expression: show the target Series as this cell's output.
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

### 학습용 데이터와 테스트용 데이터는 75% : 25% 비율로 분할

In [10]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### SFFS(Sequential Forward Floating Selection) 알고리즘
- RandomForestClassifier 적용
- mlxtend library 적용

##### 추가 사항 
- k_features=(min, max) 로 지정하여 성능이 가장 높은 feature 개수를 자동으로 선택
- 학습용 데이터로 적용

In [11]:
# Baseline random forest (100 trees, fixed seed) trained on the training
# split using all features.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf = rf.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Predict once on the held-out split and reuse the result for every metric
# below instead of re-running the forest for each call.
predict_y = rf.predict(X_test)

print(classification_report(y_test, predict_y))
# Accuracy of the stored predictions; this is the same quantity that
# rf.score(X_test, y_test) reports, without predicting again.
print(f"test score : {accuracy_score(y_test, predict_y)}")

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       130
           1       0.70      0.52      0.59        62

    accuracy                           0.77       192
   macro avg       0.75      0.70      0.72       192
weighted avg       0.76      0.77      0.76       192

test score : 0.7708333333333334


In [13]:
# Upper bound for the search: the total number of candidate features.
max_col = X.shape[1]

# Sequential forward floating selection wrapped around the random forest `rf`.
# k_features is a (min, max) range so that a suitable subset size is found
# by the selector instead of being fixed in advance.
sffs = SFS(
    rf,
    k_features=(1, max_col),
    forward=True,
    floating=True,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)  # fitted on the training split only

In [14]:
# Bare expression: show the names of the features in the selected subset.
sffs.k_feature_names_

('Glucose', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age')

In [15]:
# Bare expression: show the score recorded for the selected subset.
sffs.k_score_

0.7621889055472264

In [16]:
# Tabulate the metrics the selector recorded for each subset size.
display(pd.DataFrame.from_dict(sffs.get_metric_dict()))

Unnamed: 0,1,2,3,4,5,6,7,8
feature_idx,"(1,)","(1, 5)","(1, 3, 5)","(1, 3, 4, 5)","(1, 4, 5, 6, 7)","(1, 2, 4, 5, 6, 7)","(0, 1, 2, 3, 4, 6, 7)","(0, 1, 2, 3, 4, 5, 6, 7)"
cv_scores,"[0.6724137931034483, 0.7043478260869566, 0.660...","[0.7241379310344828, 0.7217391304347827, 0.747...","[0.75, 0.7565217391304347, 0.7391304347826086,...","[0.7327586206896551, 0.7391304347826086, 0.721...","[0.7413793103448276, 0.7913043478260869, 0.791...","[0.7327586206896551, 0.8, 0.782608695652174, 0...","[0.75, 0.7652173913043478, 0.7652173913043478,...","[0.7327586206896551, 0.7391304347826086, 0.765..."
avg_score,0.684048,0.722219,0.732609,0.7309,0.762189,0.756987,0.746522,0.741334
feature_names,"(Glucose,)","(Glucose, BMI)","(Glucose, SkinThickness, BMI)","(Glucose, SkinThickness, Insulin, BMI)","(Glucose, Insulin, BMI, DiabetesPedigreeFuncti...","(Glucose, BloodPressure, Insulin, BMI, Diabete...","(Pregnancies, Glucose, BloodPressure, SkinThic...","(Pregnancies, Glucose, BloodPressure, SkinThic..."
ci_bound,0.0201409,0.0187424,0.0315128,0.015851,0.0551869,0.0485767,0.0228776,0.0620702
std_dev,0.0156703,0.0145822,0.024518,0.0123326,0.0429373,0.0377943,0.0177996,0.0482927
std_err,0.00783516,0.00729111,0.012259,0.00616629,0.0214686,0.0188972,0.00889978,0.0241464


### 주요 참고 사이트

https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#example-2-toggling-between-sfs-sbs-sffs-and-sbfs
