In [23]:
from pandas import read_csv
import warnings
warnings.filterwarnings('ignore')

In [24]:
header = ['feature1','feature2','feature3','feature4','class']
data = read_csv('data/banknote_authentication.txt', header=None, names=header)
data

Unnamed: 0,feature1,feature2,feature3,feature4,class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [25]:
header_cat = ['edibility', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 
           'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring','stalk-surface-below-ring', 
            'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 
           'spore-print-colors', 'population', 'habitat']
data_cat = read_csv('data/agaricus-lepiota.data', header=None, names=header_cat)
data_cat

Unnamed: 0,edibility,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-colors,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


## Testing framework

The framework is to train a 3 simple classifier models on a single feature and observe the metrics for each one when used to predict its training data. Repeat this for each feature to determine if the statistical tests can accurately determine which input features are good to create a model.

KNN does not work for categotical test because of one hot encoded data. Use LinearSVC instead.

In [26]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from pandas import get_dummies
from sklearn.svm import LinearSVC

In [27]:
def test_feature(x, y):
    x = x.values.reshape(-1, 1)
    y = y.values.reshape(-1)
    models = [('NB model', GaussianNB()), ('KNN model',KNeighborsClassifier(n_neighbors=5)), ('Decision Tree model', DecisionTreeClassifier())]
    for mx in models:
        clf = mx[1]
        clf.fit(x, y)
        y_ = clf.predict(x)
        print(mx[0])
        print(classification_report(y, y_), '\n')
    
def test_feature_cat(x, y):
    x = get_dummies(x).replace({False:0, True:1}).values
    y = y.values.reshape(-1)
    models = [('NB model', GaussianNB()), ('Linear SVM model',LinearSVC()), ('Decision Tree model', DecisionTreeClassifier())]
    for mx in models:
        clf = mx[1]
        clf.fit(x, y)
        y_ = clf.predict(x)
        print(mx[0])
        print(classification_report(y, y_), '\n')

## One-way Anova test
Each class label is a group, the ANOVA test is performed to determine if there is a difference between the means of the groups for each feature based on the means between groups and variance within groups. 
### H0: there is no difference in the mean between groups for a particular feature
### H1: the means are different in the mean between groups for a particular feature
Low P-values are strong evidence to reject the null hypothesis, and is an indication that the feature is valuable for model.

In [28]:
from scipy.stats import f_oneway

In [29]:
# Split samples into two groups. groups from class 0 and class 1
group_a = data.loc[data['class']==0, ['feature1','feature2','feature3','feature4']]
group_b = data.loc[data['class']==1, ['feature1','feature2','feature3','feature4']]

In [30]:
F_stat, p_val = f_oneway(group_a, group_b, axis=0)
print(F_stat)
print(p_val)

[1.51662667e+03 3.37691145e+02 3.41195192e+01 7.52088816e-01]
[5.74096537e-224 1.37205114e-067 6.46552630e-009 3.85967572e-001]


In [31]:
### Mutual information
from sklearn.feature_selection import mutual_info_classif

mutual_info_classif(data[header].values, data['class'])

array([0.37818234, 0.22660886, 0.12229088, 0.01901871, 0.6880911 ])

In [32]:
# Best feature
test_feature(data['feature1'], data['class'])

NB model
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       762
           1       0.83      0.82      0.82       610

    accuracy                           0.84      1372
   macro avg       0.84      0.84      0.84      1372
weighted avg       0.84      0.84      0.84      1372
 

KNN model
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       762
           1       0.86      0.87      0.86       610

    accuracy                           0.88      1372
   macro avg       0.88      0.88      0.88      1372
weighted avg       0.88      0.88      0.88      1372
 

Decision Tree model
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       762
           1       1.00      1.00      1.00       610

    accuracy                           1.00      1372
   macro avg       1.00      1.00      1.00      1372
weighted avg       1.00      1.

In [33]:
# Worst feature
test_feature(data['feature4'], data['class'])

NB model
              precision    recall  f1-score   support

           0       0.56      1.00      0.71       762
           1       0.00      0.00      0.00       610

    accuracy                           0.56      1372
   macro avg       0.28      0.50      0.36      1372
weighted avg       0.31      0.56      0.40      1372
 

KNN model
              precision    recall  f1-score   support

           0       0.72      0.78      0.75       762
           1       0.70      0.62      0.66       610

    accuracy                           0.71      1372
   macro avg       0.71      0.70      0.70      1372
weighted avg       0.71      0.71      0.71      1372
 

Decision Tree model
              precision    recall  f1-score   support

           0       0.90      0.99      0.94       762
           1       0.98      0.86      0.92       610

    accuracy                           0.93      1372
   macro avg       0.94      0.92      0.93      1372
weighted avg       0.94      0.

Results:

The results are as expected, the feature with the highest p-valuehas consistently higher accuracy on all models than the feature with the worse p-value

## Chi2 test

This is an entirely categorical dataset. The P-values of the chi2 test are calculated for each categorical feature and the class label to determine if there is a significant association between categorical variable and the class label.

#### H0: there is no significant association between the categorical variables
#### H1: there is a significant association between the categorical variables 

Low P-values are strong evidence to reject the null hypothesis, and is an indication that the feature is valuable for a model.

In [34]:
from pandas import crosstab
from scipy.stats import chi2_contingency

In [35]:
p_values = []
features = data_cat.columns[1:]

for f in features:
    contingency_table = crosstab(data_cat['edibility'], data_cat[f])
    chi2, p, dof, expected = chi2_contingency(contingency_table, correction=False)
    p_values.append((f, p))

In [36]:
p_values.sort(key=lambda x:x[1])
p_values

[('bruises', 0.0),
 ('odor', 0.0),
 ('gill-size', 0.0),
 ('gill-color', 0.0),
 ('stalk-surface-above-ring', 0.0),
 ('stalk-surface-below-ring', 0.0),
 ('stalk-color-above-ring', 0.0),
 ('stalk-color-below-ring', 0.0),
 ('ring-type', 0.0),
 ('spore-print-colors', 0.0),
 ('population', 0.0),
 ('habitat', 0.0),
 ('stalk-root', 7.702047904943513e-290),
 ('gill-spacing', 1.946774580056623e-216),
 ('cap-shape', 1.1964565685935438e-103),
 ('ring-number', 4.23575764172306e-82),
 ('cap-color', 6.055814598336576e-78),
 ('cap-surface', 5.518427038649143e-68),
 ('veil-color', 3.32097274916963e-41),
 ('gill-attachment', 2.4274771680713518e-31),
 ('stalk-shape', 3.739357224138512e-20),
 ('veil-type', 1.0)]

Based on the P-values, the best features include bruises, odor, gill-size, and gill-attchment (p-values all below 0.01). While the worst feature is veil-type with a p-value of 1. Let's take a look at how well these features perform in a model.

In [37]:
# Best feature 1
test_feature_cat(data_cat['bruises'], data_cat['edibility'])

NB model
              precision    recall  f1-score   support

           e       0.82      0.65      0.73      4208
           p       0.69      0.84      0.76      3916

    accuracy                           0.74      8124
   macro avg       0.75      0.75      0.74      8124
weighted avg       0.76      0.74      0.74      8124
 

Linear SVM model
              precision    recall  f1-score   support

           e       0.82      0.65      0.73      4208
           p       0.69      0.84      0.76      3916

    accuracy                           0.74      8124
   macro avg       0.75      0.75      0.74      8124
weighted avg       0.76      0.74      0.74      8124
 

Decision Tree model
              precision    recall  f1-score   support

           e       0.82      0.65      0.73      4208
           p       0.69      0.84      0.76      3916

    accuracy                           0.74      8124
   macro avg       0.75      0.75      0.74      8124
weighted avg       0.76 

In [38]:
# Best feature 2
test_feature_cat(data_cat['odor'], data_cat['edibility'])

NB model
              precision    recall  f1-score   support

           e       0.97      1.00      0.99      4208
           p       1.00      0.97      0.98      3916

    accuracy                           0.99      8124
   macro avg       0.99      0.98      0.99      8124
weighted avg       0.99      0.99      0.99      8124
 

Linear SVM model
              precision    recall  f1-score   support

           e       0.97      1.00      0.99      4208
           p       1.00      0.97      0.98      3916

    accuracy                           0.99      8124
   macro avg       0.99      0.98      0.99      8124
weighted avg       0.99      0.99      0.99      8124
 

Decision Tree model
              precision    recall  f1-score   support

           e       0.97      1.00      0.99      4208
           p       1.00      0.97      0.98      3916

    accuracy                           0.99      8124
   macro avg       0.99      0.98      0.99      8124
weighted avg       0.99 

In [39]:
# Best feature 3
test_feature_cat(data_cat['gill-size'], data_cat['edibility'])

NB model
              precision    recall  f1-score   support

           e       0.70      0.93      0.80      4208
           p       0.89      0.57      0.69      3916

    accuracy                           0.76      8124
   macro avg       0.79      0.75      0.75      8124
weighted avg       0.79      0.76      0.75      8124
 

Linear SVM model
              precision    recall  f1-score   support

           e       0.70      0.93      0.80      4208
           p       0.89      0.57      0.69      3916

    accuracy                           0.76      8124
   macro avg       0.79      0.75      0.75      8124
weighted avg       0.79      0.76      0.75      8124
 

Decision Tree model
              precision    recall  f1-score   support

           e       0.70      0.93      0.80      4208
           p       0.89      0.57      0.69      3916

    accuracy                           0.76      8124
   macro avg       0.79      0.75      0.75      8124
weighted avg       0.79 

In [40]:
print(data_cat['bruises'].unique())
print(data_cat['odor'].unique())
print(data_cat['gill-size'].unique())

['t' 'f']
['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
['n' 'b']


## Results:

Even though these 3 lowest p-value categorical features have a near 0 p-value, there is an unexpected leap in performance when modelling with the 'odor' variable, this is due to the high number of unique values in this category. It seems that a combination of chi2 p-value score and number of unique values are strong contributing factors in model performance.

In [17]:
# Medium feature 1
test_feature_cat(data_cat['gill-attachment'], data_cat['edibility'])

NB model
              precision    recall  f1-score   support

           e       0.91      0.05      0.09      4208
           p       0.49      1.00      0.66      3916

    accuracy                           0.50      8124
   macro avg       0.70      0.52      0.37      8124
weighted avg       0.71      0.50      0.36      8124
 

Linear SVM model
              precision    recall  f1-score   support

           e       0.52      1.00      0.68      4208
           p       0.00      0.00      0.00      3916

    accuracy                           0.52      8124
   macro avg       0.26      0.50      0.34      8124
weighted avg       0.27      0.52      0.35      8124
 

Decision Tree model
              precision    recall  f1-score   support

           e       0.52      1.00      0.68      4208
           p       0.00      0.00      0.00      3916

    accuracy                           0.52      8124
   macro avg       0.26      0.50      0.34      8124
weighted avg       0.27 

In [18]:
# Worst feature
test_feature_cat(data_cat['veil-type'], data_cat['edibility'])

NB model
              precision    recall  f1-score   support

           e       0.52      1.00      0.68      4208
           p       0.00      0.00      0.00      3916

    accuracy                           0.52      8124
   macro avg       0.26      0.50      0.34      8124
weighted avg       0.27      0.52      0.35      8124
 

Linear SVM model
              precision    recall  f1-score   support

           e       0.52      1.00      0.68      4208
           p       0.00      0.00      0.00      3916

    accuracy                           0.52      8124
   macro avg       0.26      0.50      0.34      8124
weighted avg       0.27      0.52      0.35      8124
 

Decision Tree model
              precision    recall  f1-score   support

           e       0.52      1.00      0.68      4208
           p       0.00      0.00      0.00      3916

    accuracy                           0.52      8124
   macro avg       0.26      0.50      0.34      8124
weighted avg       0.27 

## Results:
Results are as expected for the medium and low p-value score features.