In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
glass_ds = pd.read_csv('../new_glass.csv')
X = glass_ds.drop(columns='Type')
Y = glass_ds['Type']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.15)

In [3]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# KNN

In [28]:
%%time
plain_sfs = SFS(KNeighborsClassifier(), 
          k_features=(1, 10), 
          forward=True, 
          floating=False,
          scoring='accuracy', 
          cv=10)

plain_sfs.fit(X, Y)

CPU times: user 19.7 s, sys: 108 ms, total: 19.8 s
Wall time: 19.9 s


SequentialFeatureSelector(cv=10, estimator=KNeighborsClassifier(),
                          k_features=(1, 10), scoring='accuracy')

In [29]:
plain_sfs.k_score_

0.6259740259740261

In [30]:
selected_features = X.columns[list(plain_sfs.k_feature_idx_)]
x_t = X_train[selected_features]
clf = KNeighborsClassifier()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.50      0.67      0.57         9
           2       0.79      0.85      0.81        13
           3       0.00      0.00      0.00         2
           5       1.00      0.20      0.33         5
           6       0.00      0.00      0.00         1
           7       0.60      1.00      0.75         3

    accuracy                           0.64        33
   macro avg       0.48      0.45      0.41        33
weighted avg       0.65      0.64      0.60        33

0.6406432748538011


# Logistic Regression

In [32]:
plain_sfs_lr = SFS(LogisticRegression(), 
          k_features=(1, 50), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_lr.fit(X, Y)


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

SequentialFeatureSelector(cv=10, estimator=LogisticRegression(),
                          k_features=(1, 50))

In [33]:
selected_features = X.columns[list(plain_sfs_lr.k_feature_idx_)]
x_t = X_train[selected_features]
clf = LogisticRegression()
clf.fit(x_t, y_train)

LogisticRegression()

In [34]:
x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.86      0.67      0.75         9
           2       0.71      0.92      0.80        13
           3       0.00      0.00      0.00         2
           5       1.00      0.60      0.75         5
           6       1.00      1.00      1.00         1
           7       0.60      1.00      0.75         3

    accuracy                           0.76        33
   macro avg       0.69      0.70      0.67        33
weighted avg       0.75      0.76      0.73        33

0.6187134502923977


# Naive Bayes

In [25]:
naive_bayes = GaussianNB()
plain_sfs_nb = SFS(naive_bayes, 
          k_features=(1, 50), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_nb.fit(X, Y) 

SequentialFeatureSelector(cv=10, estimator=GaussianNB(), k_features=(1, 50))

In [26]:
plain_sfs_nb.k_score_

0.7015151515151515

In [27]:
selected_features = X.columns[list(plain_sfs_nb.k_feature_idx_)]
x_t = X_train[selected_features]
clf = GaussianNB()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.62      0.56      0.59         9
           2       0.71      0.77      0.74        13
           3       1.00      0.50      0.67         2
           5       0.83      1.00      0.91         5
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         3

    accuracy                           0.76        33
   macro avg       0.86      0.80      0.82        33
weighted avg       0.76      0.76      0.75        33

0.4976608187134503


# SVM

In [22]:

plain_sfs_svm = SFS(svm.SVC(), 
          k_features=(1, 50), 
          forward=True, 
          floating=False, 
          cv=10)

plain_sfs_svm.fit(X, Y)

SequentialFeatureSelector(cv=10, estimator=SVC(), k_features=(1, 50))

In [23]:
plain_sfs_svm.k_score_

0.753896103896104

In [24]:
selected_features = X.columns[list(plain_sfs_svm.k_feature_idx_)]
x_t = X_train[selected_features]
clf = svm.SVC()
clf.fit(x_t, y_train)

x_test_filtered = X_test[selected_features]
y_test_pred = clf.predict(x_test_filtered)
print(classification_report(y_test, y_test_pred))
print(cross_val_score(clf, X_train, y_train, cv=10).mean())

              precision    recall  f1-score   support

           1       0.71      0.56      0.63         9
           2       0.71      0.92      0.80        13
           3       0.00      0.00      0.00         2
           5       1.00      0.60      0.75         5
           6       1.00      1.00      1.00         1
           7       0.60      1.00      0.75         3

    accuracy                           0.73        33
   macro avg       0.67      0.68      0.65        33
weighted avg       0.71      0.73      0.70        33

0.45847953216374265
