# Oving 6

* 1 Using the UCI mushroom dataset from the last exercise, perform a feature selection using a classifier evaluator. Which features are most discriminative?

* 2 Use principal components analysis to construct a reduced space. Which combination of features explain the most variance in the dataset?

* 3 Do you see any overlap between the PCA features and those obtained from feature selection?

In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
# Task 1
# Create dataframe from mushroom csv
data = pd.read_csv('agaricus-lepiota.csv')
# Gets the first n rows
data.head(10)

Unnamed: 0,edibility,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [3]:
# Convert categorical variable into dummy/indicator variables.
# In other words, it sets entries with a value to 1 and columns without a value to 0.
# Booleans are set to 1 for true and 0 for false
dummyData = pd.get_dummies(data)
X, Y = pd.get_dummies(data), pd.get_dummies(data['edibility'])
print("X shape:", X.shape)
print("Y shape:", Y.shape)
X.head(5)


X shape: (8124, 119)
Y shape: (8124, 2)


Unnamed: 0,edibility_e,edibility_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [4]:
Y.head(5)

Unnamed: 0,e,p
0,0,1
1,1,0
2,1,0
3,0,1
4,1,0


X shape: 8124 rows, 119 columns
Y shape: 8124 rows, 2 columns

In [5]:
# Feature selection using SelectKBest. Selects the 10 best features
select_k_best = SelectKBest(chi2, k=10) # chi2 Computes chi-squared stats between each non-negative feature and class.

select_k_best.fit(X,Y)

selected = [dummyData.columns[i] for i in select_k_best.get_support(indices=True)]

# Deleting the features edibility_p and edibility_e since these are the targets.
del selected[:2]
print("The eight most discriminative features:\n")
for i in range (len(selected)):
    print("number",i+1 ,": ", selected[i])

The eight most discriminative features:

number 1 :  odor_f
number 2 :  odor_n
number 3 :  gill-size_n
number 4 :  gill-color_b
number 5 :  stalk-surface-above-ring_k
number 6 :  stalk-surface-below-ring_k
number 7 :  ring-type_l
number 8 :  spore-print-color_h


In [6]:
# Task 2
from sklearn.decomposition import PCA
import pprint
principal_component_analysis = PCA(n_components=8)
principal_component_analysis_x = principal_component_analysis.fit_transform(X)

# 8124 rows, 6 columns
print(principal_component_analysis_x.shape)


# The singular values corresponding to each of the selected components
pca_singular = np.array (principal_component_analysis.singular_values_,dtype=np.float)

# Finds the indexes of the highest values
best_features = [principal_component_analysis.components_[i].argmax()
                 for i in range(principal_component_analysis_x.shape[1])]

# List of the best features
feature_names = [X.columns[best_features[i]]
                 for i in range(principal_component_analysis_x.shape[1])]


# Creates a dictionary of the names and variances
res1 = dict(zip(feature_names, pca_singular))

# Prints the names and variances as key value pairs. 
print("features which give the highest variance, along with their respective variances\n")
pprint.pprint(res1, sort_dicts=False)


(8124, 8)
features which give the highest variance, along with their respective variances

{'edibility_p': 135.09426392135452,
 'stalk-root_b': 113.85840536555101,
 'habitat_g': 95.06399036032089,
 'stalk-shape_t': 77.52742291773879,
 'odor_n': 64.64022201823175,
 'cap-shape_f': 58.48305255247532,
 'cap-surface_s': 54.754074721467845,
 'cap-color_n': 50.72919937328889}


In [7]:
# Task 3 
# With K value = 10 and n_components = 8 we get the following features which overlap with each other:
print("without edibility_p")
set(selected).intersection(res1)

without edibility_p


{'odor_n'}

In [8]:
# Since i removed edibility_p from the list in task 2, it doesnt show up here.
#To show that it should, ill initiate it again
# Feature selection using SelectKBest. Selects the 10 best features
select_k_best2 = SelectKBest(chi2, k=10) # chi2 Computes chi-squared stats between each non-negative feature and class.

select_k_best2.fit(X,Y)

selected2 = [dummyData.columns[i] for i in select_k_best.get_support(indices=True)]
print("with edibility_p")
set(selected2).intersection(res1)

with edibility_p


{'edibility_p', 'odor_n'}