In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
X = pd.read_csv('../glass_formula_b.csv')

df = pd.read_csv('../glass.csv', sep=",")
Y = df['Type']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)

In [3]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# KNN

In [4]:
%%time
plain_sfs = SFS(KNeighborsClassifier(), 
          k_features=(1, 20), 
          forward=True, 
          floating=False,
          scoring='accuracy', 
          cv=10)

plain_sfs.fit(X, Y)

CPU times: user 25.3 s, sys: 114 ms, total: 25.4 s
Wall time: 12.4 s


SequentialFeatureSelector(cv=10, estimator=KNeighborsClassifier(),
                          k_features=(1, 20), scoring='accuracy')

In [5]:
plain_sfs.k_score_

0.7298701298701298

In [6]:
selected_features = X.columns[list(plain_sfs.k_feature_idx_)]
x_t = X_train[selected_features]
clf = KNeighborsClassifier()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.71      0.92      0.80        13
           2       0.90      0.64      0.75        14
           3       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         0
           7       1.00      0.75      0.86         4

    accuracy                           0.76        33
   macro avg       0.60      0.55      0.57        33
weighted avg       0.81      0.76      0.77        33

0.6950292397660818


# Naive Bayes

In [7]:
naive_bayes = GaussianNB()
plain_sfs_nb = SFS(naive_bayes, 
          k_features=(1, 20), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_nb.fit(X, Y) 

SequentialFeatureSelector(cv=10, estimator=GaussianNB(), k_features=(1, 20))

In [8]:
plain_sfs_nb.k_score_

0.6545454545454545

In [9]:
selected_features = X.columns[list(plain_sfs_nb.k_feature_idx_)]
x_t = X_train[selected_features]
clf = GaussianNB()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.71      0.77      0.74        13
           2       0.77      0.71      0.74        14
           3       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.80      1.00      0.89         4

    accuracy                           0.73        33
   macro avg       0.46      0.50      0.47        33
weighted avg       0.70      0.73      0.71        33

0.5014619883040936


# SVM

In [10]:

plain_sfs_svm = SFS(svm.SVC(), 
          k_features=(1, 20), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_svm.fit(X, Y)

SequentialFeatureSelector(cv=10, estimator=SVC(), k_features=(1, 20))

In [11]:
plain_sfs_svm.k_score_

0.6681818181818182

In [12]:
selected_features = X.columns[list(plain_sfs_svm.k_feature_idx_)]
x_t = X_train[selected_features]
clf = svm.SVC()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.71      0.77      0.74        13
           2       0.69      0.79      0.73        14
           3       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       1.00      0.75      0.86         4

    accuracy                           0.73        33
   macro avg       0.48      0.46      0.47        33
weighted avg       0.69      0.73      0.71        33

0.3423976608187135


# Logistic Regression

In [13]:
plain_sfs_lr = SFS(LogisticRegression(), 
          k_features=(1, 20), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_lr.fit(X, Y)
selected_features = X.columns[list(plain_sfs_lr.k_feature_idx_)]
x_t = X_train[selected_features]
clf = LogisticRegression()
clf.fit(x_t, y_train)
x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.79      0.85      0.81        13
           2       0.75      0.86      0.80        14
           3       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       1.00      0.75      0.86         4

    accuracy                           0.79        33
   macro avg       0.51      0.49      0.49        33
weighted avg       0.75      0.79      0.76        33

0.6567251461988304
