In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import random
from sklearn.model_selection import KFold

In [27]:
# Load the glass dataset. The frame holds the predictor columns together with
# the target column, so `X` here is the full table, not a features-only matrix.
X = pd.read_csv('../datasets/glass_formula_c.csv')

In [28]:
# Split the loaded table into the label series and the predictor frame, and
# keep the predictor column names for column selection in later cells.
target_column = 'Type'
X_ = X.drop(columns=[target_column])
y = X[target_column]
features = X_.columns.tolist()
features

['RI',
 'Na',
 'Mg',
 'Al',
 'Si',
 'K',
 'Ca',
 'Ba',
 'Fe',
 'RI*Na',
 'RI*Mg',
 'RI*Al',
 'RI*Si',
 'RI*K',
 'RI*Ca',
 'RI*Ba',
 'RI*Fe',
 'Na*Mg',
 'Na*Al',
 'Na*Si',
 'Na*K',
 'Na*Ca',
 'Na*Ba',
 'Na*Fe',
 'Mg*Al',
 'Mg*Si',
 'Mg*K',
 'Mg*Ca',
 'Mg*Ba',
 'Mg*Fe',
 'Al*Si',
 'Al*K',
 'Al*Ca',
 'Al*Ba',
 'Al*Fe',
 'Si*K',
 'Si*Ca',
 'Si*Ba',
 'Si*Fe',
 'K*Ca',
 'K*Ba',
 'K*Fe',
 'Ca*Ba',
 'Ca*Fe',
 'Ba*Fe']

### KNN

In [29]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=random.randint(100, 2000))
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    plain_sfs = SFS(KNeighborsClassifier(), 
          k_features=(5, 10), 
          forward=True, 
          floating=False,
          scoring='accuracy',
          n_jobs=-1)
    
    plain_sfs.fit(X_train, y_train)
    selected_features = X.columns[list(plain_sfs.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = KNeighborsClassifier()
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'KNN {np.array(results).mean()}')

Features => ['Al', 'K', 'Ba', 'Mg*Fe', 'Al*Fe']
Accuracy for the fold no. 1 on the test set: 0.5581395348837209

Features => ['RI', 'Na', 'Al', 'K', 'Ba', 'Mg*Ba', 'Al*K', 'Al*Ba', 'K*Ca']
Accuracy for the fold no. 2 on the test set: 0.4186046511627907

Features => ['Al', 'K', 'Ca', 'RI*Na', 'RI*Ca', 'Mg*Ca', 'Al*K', 'Si*Ba']
Accuracy for the fold no. 3 on the test set: 0.6511627906976745

Features => ['RI', 'Na', 'Ca', 'RI*Ca', 'Na*Al', 'Na*Ba', 'Mg*Ca', 'Si*K', 'Si*Ba']
Accuracy for the fold no. 4 on the test set: 0.7441860465116279

Features => ['Mg', 'RI*Al', 'Mg*K', 'Mg*Ca', 'Ca*Ba']
Accuracy for the fold no. 5 on the test set: 0.6904761904761905

KNN 0.6125138427464009
CPU times: user 3.27 s, sys: 414 ms, total: 3.68 s
Wall time: 12.9 s


### Logistic Regression

In [30]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=321)
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    plain_sfs_ls = SFS(LogisticRegression(), 
          k_features=(5, 10), 
          forward=True, 
          floating=False,
          n_jobs=-1)
    
    plain_sfs_ls.fit(X_train, y_train)
    selected_features = X.columns[list(plain_sfs_ls.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = LogisticRegression()
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'LogisticRegression {np.array(results).mean()}')

Features => ['RI*Si', 'Na*Al', 'Na*K', 'Na*Ba', 'Na*Fe', 'Al*Ca', 'K*Ca', 'K*Ba']
Accuracy for the fold no. 1 on the test set: 0.4883720930232558

Features => ['RI', 'Si', 'Ba', 'RI*K', 'Na*Al', 'Al*Ba', 'K*Ca', 'Ba*Fe']
Accuracy for the fold no. 2 on the test set: 0.6744186046511628

Features => ['RI', 'Al', 'Ba', 'RI*Al', 'RI*Si', 'RI*Fe', 'Na*Al', 'Al*Ca', 'Si*K', 'Ca*Ba']
Accuracy for the fold no. 3 on the test set: 0.627906976744186

Features => ['Al', 'Na*Al', 'Al*K', 'Al*Ba', 'K*Ca', 'Ca*Fe']
Accuracy for the fold no. 4 on the test set: 0.6511627906976745

Features => ['RI', 'Si', 'Ba', 'Fe', 'RI*Al', 'Na*Al', 'Na*K', 'K*Fe', 'Ca*Fe', 'Ba*Fe']
Accuracy for the fold no. 5 on the test set: 0.5714285714285714

LogisticRegression 0.60265780730897
CPU times: user 3.49 s, sys: 299 ms, total: 3.79 s
Wall time: 1min 21s


### Naive Bayes

In [31]:
%%time
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    naive_bayes = GaussianNB()
    plain_sfs_nb = SFS(naive_bayes, 
          k_features=(5, 10), 
          forward=True, 
          floating=False)
    
    plain_sfs_nb.fit(X_train, y_train)
    selected_features = X.columns[list(plain_sfs_nb.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = GaussianNB()
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'Naive Bayes {np.array(results).mean()}')

Features => ['RI', 'Al', 'RI*Al', 'RI*Si', 'Na*Al', 'Na*Si', 'Mg*Al', 'Al*Si', 'Al*Ca']
Accuracy for the fold no. 1 on the test set: 0.6046511627906976

Features => ['Na', 'Al', 'Na*Al', 'Al*Si', 'Al*Ca']
Accuracy for the fold no. 2 on the test set: 0.6744186046511628

Features => ['RI', 'Al', 'Na*Al', 'Na*Si', 'Al*Ca']
Accuracy for the fold no. 3 on the test set: 0.5813953488372093

Features => ['Na', 'Al', 'Na*Al', 'Mg*Ca', 'Al*Ca']
Accuracy for the fold no. 4 on the test set: 0.5813953488372093

Features => ['Na', 'Al', 'RI*Na', 'RI*Al', 'RI*Si', 'Na*Al', 'Mg*Al', 'Al*Si', 'Al*K', 'Al*Ca']
Accuracy for the fold no. 5 on the test set: 0.5714285714285714

Naive Bayes 0.6026578073089702
CPU times: user 17.5 s, sys: 145 ms, total: 17.7 s
Wall time: 17.8 s


### SVM

In [32]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=random.randint(1000,9999))
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    plain_sfs_svm = SFS(svm.SVC(kernel='linear'), 
          k_features=(2, 10), 
          forward=False, 
          floating=False,
          n_jobs=-1)
    
    plain_sfs_svm.fit(X_train, y_train)
    selected_features = X.columns[list(plain_sfs_svm.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'SVM {np.array(results).mean()}')


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

KeyError: None

### Random Forest

In [33]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=random.randint(1000,9999))
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    rfc_sbs = SFS(RandomForestClassifier(random_state=0), 
          k_features=(3, 10), 
          forward=False, 
          floating=False, n_jobs=-1)

    
    rfc_sbs.fit(X_train, y_train)
    selected_features = X.columns[list(rfc_sbs.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = RandomForestClassifier(random_state=0)
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'RandomForest {np.array(results).mean()}')

Features => ['RI', 'Fe', 'RI*Al', 'RI*Ca', 'Mg*Ca', 'K*Ca']
Accuracy for the fold no. 1 on the test set: 0.7209302325581395

Features => ['RI', 'Ba', 'RI*Mg', 'RI*Si', 'Mg*Ca', 'Al*K', 'Al*Ca', 'Si*Fe', 'K*Fe']
Accuracy for the fold no. 2 on the test set: 0.813953488372093

Features => ['RI', 'Al', 'RI*Ca', 'RI*Ba', 'Na*Fe', 'Mg*Si', 'Mg*Ca', 'K*Ca']
Accuracy for the fold no. 3 on the test set: 0.7674418604651163

Features => ['RI', 'Si', 'RI*Ba', 'RI*Fe', 'Na*Fe', 'Mg*Ca', 'Al*K', 'Al*Ca', 'K*Ba', 'K*Fe']
Accuracy for the fold no. 4 on the test set: 0.7209302325581395

Features => ['RI', 'Si', 'RI*Ca', 'RI*Ba', 'Mg*K', 'Mg*Ca', 'Al*Si', 'Al*Ba', 'Si*K', 'Ba*Fe']
Accuracy for the fold no. 5 on the test set: 0.7142857142857143

RandomForest 0.7475083056478405
CPU times: user 4.97 s, sys: 542 ms, total: 5.52 s
Wall time: 6min 30s


### Multilayer Perceptron

In [34]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=random.randint(1000,9999))
i = 1
results = []
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index].loc[:, features]
    X_test = X.iloc[test_index][features]
    y_train = X.iloc[train_index].loc[:, target_column]
    y_test = X.loc[test_index][target_column]
    
    mlp_sbs = SFS(MLPClassifier(random_state=1), 
          k_features=(1, 10), 
          forward=False, 
          floating=False)
    
    mlp_sbs.fit(X_train, y_train)
    selected_features = X.columns[list(mlp_sbs.k_feature_idx_)]
    print(f'Features => {list(selected_features)}')
    x_t = X_test[selected_features]
    
    clf = MLPClassifier(random_state=1)
    clf.fit(X_train[selected_features], y_train)
    score = accuracy_score(y_test, clf.predict(x_t))
    results.append(score)
    print(f"Accuracy for the fold no. {i} on the test set: {score}\n")
    i += 1
    
print(f'MLP {np.array(results).mean()}')

Features => ['Na*Mg', 'Na*Ba', 'Na*Fe', 'Mg*Al', 'Mg*Ca', 'Mg*Fe', 'Si*K']
Accuracy for the fold no. 1 on the test set: 0.46511627906976744

Features => ['Mg', 'RI*K', 'RI*Ca', 'Na*Al', 'Mg*Al', 'Si*K', 'Si*Ba']
Accuracy for the fold no. 2 on the test set: 0.6976744186046512

Features => ['RI', 'Na', 'Mg', 'Na*Al', 'Na*K', 'Na*Ba', 'Al*Ca', 'Si*Fe', 'K*Ca', 'K*Fe']
Accuracy for the fold no. 3 on the test set: 0.4883720930232558

Features => ['Si', 'Fe', 'RI*K', 'RI*Ca', 'Na*Al', 'Na*K', 'Al*K', 'Ca*Ba']
Accuracy for the fold no. 4 on the test set: 0.6976744186046512

Features => ['Al', 'Ba', 'RI*Mg', 'RI*Al', 'RI*Ba', 'Na*Mg', 'Mg*Ca', 'Al*Ba', 'Si*K', 'Ba*Fe']
Accuracy for the fold no. 5 on the test set: 0.5714285714285714

MLP 0.5840531561461795
CPU times: user 2h 23min 29s, sys: 4min 27s, total: 2h 27min 57s
Wall time: 26min 12s
