# Classification using Scikit Learn

### Load and prepare the data

In [2]:
# Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
mush_df = pd.read_csv('mushrooms.csv')

In [4]:
mush_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
mush_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

 ### Recode categorical variables to numeric
 
 • The data in the mushrooms dataset is **currently encoded with strings**.

 • These values will **need to be encoded to numeric to work with sklearn**.

 • We'll use pd.get_dummies to **convert the categorical variables into indicator variables**.

In [6]:
mush_df2 = pd.get_dummies(mush_df)
display(mush_df2)

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [7]:
mush_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Columns: 119 entries, class_e to habitat_w
dtypes: uint8(119)
memory usage: 944.2 KB


In [11]:
# all rows, columns from column at index 2 onwards
# features
X_mush = mush_df2.iloc[:,2:]

# all rows, column at index 1 only
# label
y_mush = mush_df2.iloc[:,1]
display(X_mush)
display(y_mush)

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class_p, Length: 8124, dtype: uint8

In [9]:
# use the variables X_train2, y_train2 for Question 1
X_train2, X_test2, y_train2, y_test2 =train_test_split(X_mush, y_mush, random_state=0)

In [10]:
# For performance reasons in Questions 6 and 7, we will create a smaller version of the
# entire mushroom dataset for use in those questions. For simplicity we'll just re-use
# the 25% test split created above as the representative subset.

# Use the variables X_subset, y_subset for Questions 2 and 3.
X_subset = X_test2
y_subset = y_test2

### Q1. Using `X_train2` and `y_train2` from the preceding cell, train a DecisionTreeClassifier with default parameters and random_state=0. What are the 5 most important features found by the decision tree?
 
As a reminder, the feature names are available in the `X_train2.columns` property, and the order of the features in `X_train2.columns` matches the order of the feature importance values in the classifier's `feature_importances_` property. 
 
 This function should **return a list of length 5 containing the feature names in descending order of importance.**


In [11]:
def answer_one():
    from sklearn.tree import DecisionTreeClassifier
    
    tree_clf = DecisionTreeClassifier().fit(X_train2, y_train2)
    
    feature_names = []
    
    # Get index of importance leves since their's order is the same with feature columns
    for index, importance in enumerate(tree_clf.feature_importances_):
        # Add importance so we can further order this list, and add feature name with index
        feature_names.append([importance, X_train2.columns[index]])
    
    # Descending sort
    feature_names.sort(reverse=True)
    # Turn in to a numpy array
    feature_names = np.array(feature_names)
    # Select only feature names
    feature_names = feature_names[:5,1]
    # Turn back to python list
    feature_names = feature_names.tolist()
    
    return feature_names # Your answer here

In [12]:
answer_one()

['odor_n',
 'stalk-root_c',
 'stalk-surface-below-ring_y',
 'spore-print-color_r',
 'spore-print-color_u']

### Q3. Use the `validation_curve` function in `sklearn.model_selection` to determine training and test scores for a Support Vector Classifier (`SVC`) with varying parameter values.  

#### Recall that the validation_curve function, in addition to taking an initialized unfitted classifier object, takes a dataset as input and does its own internal train-test splits to compute results.

 **Because creating a validation curve requires fitting multiple models, for performance reasons this question will use just a subset of the original mushroom dataset: please use the variables X_subset and y_subset as input to the validation curve function (instead of X_mush and y_mush) to reduce computation time.**

The initialized unfitted classifier object we'll be using is a Support Vector Classifier with radial basis kernel.  So your first step is to create an `SVC` object with default parameters (i.e. `kernel='rbf', C=1`) and `random_state=0`. Recall that the kernel width of the RBF kernel is controlled using the `gamma` parameter.  

 With this classifier, and the dataset in X_subset, y_subset, explore the effect of `gamma` on classifier accuracy by using the `validation_curve` function to find the training and test scores for 6 values of `gamma` from `0.0001` to `10` (i.e. `np.logspace(-4,1,6)`). Recall that you can specify what scoring metric you want validation_curve to use by setting the "scoring" parameter.  In this case, we want to use "accuracy" as the scoring metric.
 
#### For each level of `gamma`, `validation_curve` will fit 3 models on different subsets of the data, returning two 6x3 (6 levels of gamma x 3 fits per level) arrays of the scores for the training and test sets.


#### Find the mean score across the three models for each level of `gamma` for both arrays, creating two arrays of length 6, and return a tuple with the two arrays.


 e.g.

 if one of your array of scores is

 array([[ 0.5, 0.4, 0.6],
[ 0.7, 0.8, 0.7],
[ 0.9, 0.8, 0.8],
[ 0.8, 0.7, 0.8],
[ 0.7, 0.6, 0.6],
[ 0.4, 0.6, 0.5]])

 it should then become

 array([ 0.5, 0.73333333, 0.83333333,
0.76666667, 0.63333333, 0.5])

 *This function should return one tuple of numpy
arrays `(training_scores, test_scores)` where each
array in the tuple has shape `(6,)`.*

In [13]:
def answer_two():
    from sklearn.svm import SVC
    from sklearn.model_selection import validation_curve

    svc = SVC(kernel='rbf', C=1, random_state=0)
    gamma = np.logspace(-4,1,6)
    train_scores, test_scores = validation_curve(svc, X_subset, y_subset,
                            param_name='gamma',
                            param_range=gamma,
                            scoring='accuracy')

    scores = (train_scores.mean(axis=1), test_scores.mean(axis=1))
        
    return scores # Your answer here

In [14]:
answer_two()

(array([0.83370474, 0.93943926, 0.99101417, 1.        , 1.        ,
        1.        ]),
 array([0.83160092, 0.93747958, 0.9901502 , 1.        , 0.99852459,
        0.52240229]))

In [15]:
for index, num in enumerate(np.logspace(-4,1,6)):
    print(num)

0.0001
0.001
0.01
0.1
1.0
10.0


### Q3. Based on the scores from Q2, what gamma value corresponds to a model that is underfitting (and has the worst test set accuracy)? What gamma value corresponds to a model that is overfitting (and has the worst test set accuracy)? What choice of gamma would be the best choice for a model with good generalization performance on this dataset (high accuracy on both training and test set)? 
Note: there may be multiple correct solutions to this question.

(Hint: Try plotting the scores from question 6 to visualize the relationship between gamma and accuracy.)
 
 *This function should return one tuple with the degree values in this order: `(Underfitting, Overfitting, Good_Generalization)`*

In [16]:
def answer_three():
    
    param_range = np.logspace(-4, 1, 6)
    
    # Read in the results of answer_six
    training_scores, test_scores = answer_two()
    
    # Sort the scores
    train_scores_sorted = np.sort(training_scores)
    test_scores_sorted = np.sort(test_scores)
    
    # Initialize the values
    Underfitting = 0
    Overfitting = 0
    Good_Generalization = 0
    min_train_scores = np.min(training_scores)
    max_train_scores = np.max(training_scores)
    min_test_scores = np.max(test_scores)
    max_test_scores = np.max(test_scores)    
    
    for gam, data in zip(param_range, zip(training_scores, test_scores)):
        if data[0] <= train_scores_sorted[1] and data[1] <= test_scores_sorted[1]:
            Underfitting = gam
        if data[0] > train_scores_sorted[1] and data[1] <= test_scores_sorted[1]:
            Overfitting = gam
        if data[0] == max_train_scores and data[1] == max_test_scores:
            Good_Generalization = gam
    
    return Underfitting, Overfitting, Good_Generalization

In [17]:
answer_three()

(0.0001, 10.0, 0.1)

(Underfitting, Overfitting, Good_Generalization)