In [2]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [3]:
X = pd.read_csv('../glass_formula_a.csv')

df = pd.read_csv('../glass.csv', sep=",")
Y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)
X

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,cos(RI+Na),...,cos(Si+K),cos(Si+Ca),cos(Si+Ba),cos(Si+Fe),cos(K+Ca),cos(K+Ba),cos(K+Fe),cos(Ca+Ba),cos(Ca+Fe),cos(Ba+Fe)
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,-0.854113,...,-0.914457,0.407201,-0.888546,-0.888546,-0.816902,0.998201,0.998201,-0.780846,-0.780846,1.000000
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,-0.955232,...,-0.578939,0.434414,-0.890037,-0.890037,-0.440377,0.886995,0.886995,0.023979,0.023979,1.000000
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,-0.788898,...,-0.432648,0.612633,-0.742924,-0.742924,-0.310785,0.924909,0.924909,0.073914,0.073914,1.000000
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,-0.556771,...,-0.603136,0.658924,-0.938212,-0.938212,-0.805203,0.841901,0.841901,-0.357900,-0.357900,1.000000
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,-0.605388,...,-0.196147,0.862094,-0.679754,-0.679754,-0.693271,0.852525,0.852525,-0.214342,-0.214342,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,-0.998662,...,-0.907556,0.994110,-0.156778,-0.938212,-0.986455,0.417595,0.996802,-0.685707,-0.970191,0.488872
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,-0.745916,...,-0.694286,0.975589,0.732899,-0.694286,-0.519289,-0.019202,1.000000,-0.844470,-0.519289,-0.019202
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,-0.985127,...,-0.396249,0.984095,0.943346,-0.396249,-0.553048,-0.069148,1.000000,-0.792913,-0.553048,-0.069148
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,-0.982278,...,-0.215718,0.917682,0.976284,-0.215718,-0.585923,0.000796,1.000000,-0.810833,-0.585923,0.000796


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, stratify=Y)

## KNN

In [6]:
knn = KNeighborsClassifier()

plain_sbs = SFS(knn, 
          k_features=(1, 20), 
          forward=False,
          floating=False, 
          scoring='accuracy',
          cv=10,
          n_jobs=-1)

plain_sbs.fit(X_train, y_train, custom_feature_names=X.columns)

selected_features = X.columns[list(plain_sbs.k_feature_idx_)]

x_t = X_train[selected_features]
clf = KNeighborsClassifier()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.65      0.93      0.76        14
           2       0.69      0.73      0.71        15
           3       1.00      0.33      0.50         3
           5       0.50      0.33      0.40         3
           6       0.00      0.00      0.00         2
           7       1.00      0.67      0.80         6

    accuracy                           0.70        43
   macro avg       0.64      0.50      0.53        43
weighted avg       0.70      0.70      0.67        43

0.6571895424836601


## Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB

In [8]:
# Sequential Backward Selection
gnb = GaussianNB()

nb_sbs = SFS(gnb, 
          k_features=(1, 20), 
          forward=False, 
          floating=False, 
          scoring='accuracy',
          cv=10)

nb_sbs.fit(X_train, y_train, custom_feature_names=X.columns)

SequentialFeatureSelector(cv=10, estimator=GaussianNB(), forward=False,
                          k_features=(1, 20), scoring='accuracy')

In [9]:
selected_features = X.columns[list(nb_sbs.k_feature_idx_)]
x_nb = X_train[selected_features]

gnb_ = GaussianNB()
gnb_.fit(x_nb, y_train)

GaussianNB()

In [10]:
x_test_filtered = X_test[selected_features]
y_test_pred = gnb_.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(gnb_, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.50      0.36      0.42        14
           2       0.73      0.53      0.62        15
           3       0.09      0.33      0.14         3
           5       0.67      0.67      0.67         3
           6       1.00      1.00      1.00         2
           7       0.67      0.67      0.67         6

    accuracy                           0.51        43
   macro avg       0.61      0.59      0.58        43
weighted avg       0.61      0.51      0.55        43

0.5320261437908498


## SVM

In [11]:
from sklearn import svm
from sklearn.svm import LinearSVC

In [12]:
# Sequential Backward Selection
svm_sbs = SFS(svm.SVC(), 
          k_features=(1, 20), 
          forward=False, 
          floating=False, 
          scoring='accuracy',
          cv=10) # -1 means ALL CPU

svm_sbs.fit(X, Y, custom_feature_names=X.columns)

SequentialFeatureSelector(cv=10, estimator=SVC(), forward=False,
                          k_features=(1, 20), scoring='accuracy')

In [13]:
selected_features = X.columns[list(svm_sbs.k_feature_idx_)]
x_lr = X_train[selected_features]
svm_clf = svm.SVC()
svm_clf.fit(x_lr, y_train)

x_test_filtered = X_train[selected_features]
y_test_pred = svm_clf.predict(x_test_filtered)
print(classification_report(y_train, y_test_pred))
print(cross_val_score(svm_clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.65      0.89      0.75        56
           2       0.74      0.75      0.75        61
           3       0.00      0.00      0.00        14
           5       0.80      0.80      0.80        10
           6       0.00      0.00      0.00         7
           7       1.00      0.96      0.98        23

    accuracy                           0.74       171
   macro avg       0.53      0.57      0.55       171
weighted avg       0.66      0.74      0.69       171

0.35653594771241837


## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Sequential Backward Selection
lr_sbs = SFS(LogisticRegression(), 
          k_features=(1, 20), 
          forward=False, 
          floating=False, 
          scoring='accuracy',
          cv=10) # -1 means ALL CPU

lr_sbs.fit(X, Y, custom_feature_names=X.columns)

selected_features = X.columns[list(lr_sbs.k_feature_idx_)]
x_lr = X_train[selected_features]
lr = LogisticRegression()
lr.fit(x_lr, y_train)

x_test_filtered = X_train[selected_features]
y_test_pred = lr.predict(x_test_filtered)
print(classification_report(y_train, y_test_pred))
print(cross_val_score(lr, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.71      0.88      0.78        56
           2       0.78      0.80      0.79        61
           3       0.00      0.00      0.00        14
           5       0.91      1.00      0.95        10
           6       0.80      0.57      0.67         7
           7       1.00      1.00      1.00        23

    accuracy                           0.79       171
   macro avg       0.70      0.71      0.70       171
weighted avg       0.73      0.79      0.76       171

0.70359477124183
