# Week 6 Discussion

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from ucimlrepo import fetch_ucirepo 

In [2]:
# Fetch the mushroom dataset (UCI ML repository id 73)
mushroom = fetch_ucirepo(id=73)

# Save data as X and y variables.
# X is an explicit copy: the encoding step further down writes into X in place,
# and without the copy that would modify the frame held by `mushroom` (and can
# raise a SettingWithCopyWarning when X is derived from another frame).
X = mushroom.data.features.copy()
y = np.ravel(mushroom.data.targets)  # flatten the targets to a 1-D array

# Show every column when a dataframe is displayed, then preview the first rows
pd.set_option('display.max_columns', None)
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


## Encoding data

In [None]:
# Integer-encode every column. pd.factorize returns (codes, uniques); only the
# codes are kept. sort=True numbers the categories in sorted order.
# NOTE: by default factorize gives missing values the code -1, so any NaN in
# the raw data would become -1 here rather than staying missing.
for col in X.columns:
    codes, _uniques = pd.factorize(X[col], sort=True)
    # Same effect as `X[col] = codes`; the original note mentions a
    # SettingWithCopyWarning for this assignment (TODO confirm which form raises it).
    X.loc[:, col] = codes

# Preview the encoded data: first five rows of the first five columns
X.iloc[:5, :5]

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor
0,5,2,4,1,6
1,5,2,9,1,0
2,0,2,8,1,3
3,5,3,8,1,6
4,5,2,3,0,5


In [4]:
# Count missing values per column.
# NOTE: this runs after pd.factorize, which codes missing values as -1, so any
# NaN that was present in the raw data would not be counted here.
X.isna().sum(axis=0)

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [5]:
# Separate deep copy to receive the artificial missing values; X stays complete
# so it can serve as the ground truth when scoring the imputers.
X_na = X.copy(deep=True)

In [None]:
# Blank out a random 10% of the rows in each column; rows are drawn
# independently for every column.
# One seeded RandomState is shared across iterations: its state advances on
# each draw, so the masked rows differ between columns, yet the whole mask is
# identical on every Restart & Run All.
na_rng = np.random.RandomState(42)
for col in X_na.columns:
    # .sample() returns rows; only their index labels are needed to pick the
    # cells of `col` that get overwritten with NaN.
    na_rows = X_na.sample(frac=0.1, random_state=na_rng).index
    X_na.loc[na_rows, col] = np.nan

In [None]:
# Verify the injection: every column should now report missing entries
X_na.isna().sum(axis=0)

cap-shape                   812
cap-surface                 812
cap-color                   812
bruises                     812
odor                        812
gill-attachment             812
gill-spacing                812
gill-size                   812
gill-color                  812
stalk-shape                 812
stalk-root                  812
stalk-surface-above-ring    812
stalk-surface-below-ring    812
stalk-color-above-ring      812
stalk-color-below-ring      812
veil-type                   812
veil-color                  812
ring-number                 812
ring-type                   812
spore-print-color           812
population                  812
habitat                     812
dtype: int64

### Imputation method 1: Filling NA values with the mode

In [9]:
# Mode imputation.
# DataFrame.mode() returns a frame (ties produce extra rows), so take row 0 to
# get one most-frequent value per column as a Series; fillna then applies that
# Series column by column.
column_modes = X_na.mode().iloc[0]
X_mode_impute = X_na.fillna(column_modes)

# Confirm nothing is missing any more
X_mode_impute.isna().sum(axis=0)

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Imputation method 2: Filling NA values with the median using `SimpleImputer`

In [11]:
# Median imputation with scikit-learn's SimpleImputer
median_impute = SimpleImputer(strategy='median')  # configure the imputer
# Learn each column's median and fill the gaps in one step
# (the result is an array by default, so the column labels are lost).
median_filled = median_impute.fit_transform(X_na)

# Put the column labels back
X_median_impute = pd.DataFrame(median_filled, columns=X.columns)

# Confirm nothing is missing any more
X_median_impute.isna().sum(axis=0)

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Imputation method 3: Filling NA values with KNN imputer

In [14]:
# KNN imputation: each missing entry is replaced by the average of that feature
# over the 5 nearest rows (KNNImputer averages neighbours; it does not take a
# majority vote). Because the columns hold integer category codes, the filled
# values can therefore be non-integer.
knn_impute = KNNImputer(n_neighbors=5)
knn_filled = knn_impute.fit_transform(X_na)

# Put the column labels back
X_knn_impute = pd.DataFrame(knn_filled, columns=X.columns)

# Confirm nothing is missing any more
X_knn_impute.isna().sum(axis=0)


cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

### Which method is most accurate?

In [17]:
# Score each imputed frame against the complete data X with mean squared error
# (lower is better).
# Caveats: the error is averaged over every cell, including the ones that were
# never blanked out, and it is a distance between integer category codes, so it
# is only a rough proxy for how often the right category was restored.
mse_mode, mse_median, mse_knn = (
    mean_squared_error(X, imputed)
    for imputed in (X_mode_impute, X_median_impute, X_knn_impute)
)

# Report the three scores
print(f"Mode Imputation Performance: {mse_mode:.4f}")
print(f"Median Imputation Performance: {mse_median:.4f}")
print(f"KNN Imputation Performance: {mse_knn:.4f}")

Mode Imputation Performance: 0.4479
Median Imputation Performance: 0.2534
KNN Imputation Performance: 0.1279


## Random forest classifier with original data

In [18]:
# Hold out 30% of the complete (non-imputed) encoded data for testing;
# the fixed seed keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Tune max_features on the original data: try 1, 4, ..., 22 while keeping the
# forest size, tree depth and seed fixed.
num_features = list(range(1, 23, 3))
accuracy = []
rf_params = dict(n_estimators=50, max_depth=3, random_state=42)

for feature in num_features:
    # Train a forest with this max_features setting (fit returns the estimator)
    rf_classifier = RandomForestClassifier(max_features=feature, **rf_params).fit(X_train, y_train)

    # Score on the held-out split and record the result
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)

    print(f"Number of features: {feature}; Random Forest Accuracy: {rf_accuracy:.4f}")

Number of features: 1; Random Forest Accuracy: 0.9167
Number of features: 4; Random Forest Accuracy: 0.9848
Number of features: 7; Random Forest Accuracy: 0.9869
Number of features: 10; Random Forest Accuracy: 0.9836
Number of features: 13; Random Forest Accuracy: 0.9824
Number of features: 16; Random Forest Accuracy: 0.9861
Number of features: 19; Random Forest Accuracy: 0.9820
Number of features: 22; Random Forest Accuracy: 0.9578


## Random forest classifier with imputed data

In [23]:
# Split the KNN-imputed data with the same test size and seed as the split of
# the original data. This rebinds X_train / X_test / y_train / y_test, so the
# cells below now train and evaluate on the imputed features.
X_train, X_test, y_train, y_test = train_test_split(
    X_knn_impute,
    y,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Tune max_features on the KNN-imputed split: try 1, 4, ..., 22 while keeping
# the forest size, tree depth and seed fixed.
num_features = list(range(1, 23, 3))
accuracy = []
rf_params = dict(n_estimators=50, max_depth=3, random_state=42)

for feature in num_features:
    # Train a forest with this max_features setting (fit returns the estimator)
    rf_classifier = RandomForestClassifier(max_features=feature, **rf_params).fit(X_train, y_train)

    # Score on the held-out split and record the result
    y_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, y_pred)
    accuracy.append(rf_accuracy)

    print(f"Number of features: {feature}; Random Forest Accuracy: {rf_accuracy:.4f}")

Number of features: 1; Random Forest Accuracy: 0.9188
Number of features: 4; Random Forest Accuracy: 0.9836
Number of features: 7; Random Forest Accuracy: 0.9844
Number of features: 10; Random Forest Accuracy: 0.9832
Number of features: 13; Random Forest Accuracy: 0.9865
Number of features: 16; Random Forest Accuracy: 0.9754
Number of features: 19; Random Forest Accuracy: 0.9762
Number of features: 22; Random Forest Accuracy: 0.9573
