In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Exercise 6

For this exercise you can use either Python with sklearn or Weka.

* Using the UCI mushroom dataset from the last exercise, perform a feature
selection using a classifier evaluator. Which features are most discriminitave?

* Use principal components analysis to construct a reduced space.
Which combination of features explain the most variance in the dataset?

* Do you see any overlap between the PCA features
and those obtained from feature selection?

In [143]:
df = pd.read_csv('agaricus-lepiota.csv')
df

df_dummies = pd.get_dummies(df).astype(float)

# Want to determine edibility, so it cannot be in the X dataset
X, y = df_dummies.drop(['edibility_e', 'edibility_p'], axis=1), df_dummies[['edibility_e', 'edibility_p']]
print("X-shape: ", X.shape)

skb = SelectKBest(chi2, k=5)
skb.fit(X, y)
X_new = skb.transform(X)
print("X_new-shape: ",X_new.shape)

# Encodes True to the selected features out of the total 118
mask = skb.get_support(indices=True)
print("Selected column indices: ", mask)

selected_features = df_dummies.columns[mask]
#print("Selected columns: ", selected_features)
print("Selected features: ", ", ".join(selected_features.values)) # Joins on ", "

X-shape:  (8124, 117)
X_new-shape:  (8124, 5)
Selected column indices:  [24 27 37 57 61]
Selected features:  odor_a, odor_l, gill-size_b, stalk-root_r, stalk-surface-above-ring_y


In [157]:
from sklearn import decomposition
from sklearn.decomposition import PCA

# normalize data
from sklearn import preprocessing
data_scaled = pd.DataFrame(preprocessing.scale(X),columns = X.columns) 

# PCA
# Common to reduce to 2D, probably because it is easy both to understand and to plot
pca = PCA(n_components=2)
x_pca = pca.fit_transform(data_scaled)

best_features = [pca.components_[i].argmax() for i in range(x_pca.shape[1])]
feature_names = [X.columns[best_features[i]] for i in range(x_pca.shape[1])]
print("Highest variance: ", feature_names)

# Dump components relations with features:
print(pd.DataFrame(pca.components_,columns=data_scaled.columns,index = ['PC-1','PC-2']))

Highest variance:  ['ring-type_p', 'stalk-root_?']
      cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  cap-shape_s  \
PC-1     0.079834     0.001893    -0.013437    -0.085837     0.012201   
PC-2     0.016743     0.008192    -0.041887     0.131896     0.001769   

      cap-shape_x  cap-surface_f  cap-surface_g  cap-surface_s  cap-surface_y  \
PC-1     0.026957       0.032678       0.005794       0.017089      -0.046603   
PC-2    -0.047485      -0.118133       0.005171       0.144960      -0.028729   

      ...  population_s  population_v  population_y  habitat_d  habitat_g  \
PC-1  ...      0.114010     -0.150387      0.003753   0.020477   0.075409   
PC-2  ...     -0.009443      0.083800     -0.149493  -0.049328  -0.064072   

      habitat_l  habitat_m  habitat_p  habitat_u  habitat_w  
PC-1  -0.080517   0.080901  -0.135526   0.044891   0.025837  
PC-2   0.176586  -0.015291  -0.013922  -0.006366   0.051098  

[2 rows x 117 columns]


* Do you see any overlap between the PCA features
and those obtained from feature selection?

In [155]:
set(selected_features).intersection(set(feature_names))

set()