In [1]:
# This is the mushroom project. The purpose of this assignment is to predict if a mushroom is poisonous or not. 
# The methods I will be using in this assignment are mainly KNN and PCA algorithms
# First, let's import mushroom dataset and adds the columns name to it
#Common import
import pandas as pd
import numpy as np
# import the data set
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
#Add column names
mushrooms =  pd.read_csv(url, names = ['class', 'cap-shape', 'cap-surface', 
                                       'cap-color', 'bruises', 'odor', 'gill-attachment', 
                                       'gill-spacing', 'gill-size', 'gill-color', 
                                       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
                                       'stalk-surface-below-ring', 'stalk-color-above-ring', 
                                       'stalk-color-below-ring', 'veil-type', 'veil-color', 
                                       'ring-number', 'ring-type', 'spore-print-color', 
                                       'population', 'habitat'], na_values = "?")
# check if there is a column contains a null value, there has a columns contains "nan" value
mushrooms.isnull().any()

class                       False
cap-shape                   False
cap-surface                 False
cap-color                   False
bruises                     False
odor                        False
gill-attachment             False
gill-spacing                False
gill-size                   False
gill-color                  False
stalk-shape                 False
stalk-root                   True
stalk-surface-above-ring    False
stalk-surface-below-ring    False
stalk-color-above-ring      False
stalk-color-below-ring      False
veil-type                   False
veil-color                  False
ring-number                 False
ring-type                   False
spore-print-color           False
population                  False
habitat                     False
dtype: bool

In [2]:
# See if any columns have any non-sense or null values.
# From below noticed that 'stalk-root' column contains 'nan' value.
for i in mushrooms.columns:
    print(i,mushrooms[i].unique())

class ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r' nan]
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type ['p']
veil-color ['w' 'n' 'o' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'f' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
# remove poisonous target feature 
m_notarget = mushrooms.drop(['class'], axis=1)

# instead of using train_test_split, we are here manually split the data into x_train,y_train,x_test and y_test
# y-train is the non-null stalk-root feature rows
y_train = mushrooms['stalk-root'][mushrooms['stalk-root'].notnull()]
y_train = y_train.to_frame()

# x_train is the non-null rows of all the other columns in the same indices of non-null stalk root feature rows
df_other_columns = m_notarget.drop('stalk-root', axis=1)
x_train = df_other_columns.loc[y_train.index]

#Set columns with nan as response
# y_pred is the null stalk-root feature rows that we'r trying to impute
y_pred  = mushrooms['stalk-root'][mushrooms['stalk-root'].isnull()]
y_pred = pd.DataFrame(y_pred , columns = ['stalk-root'])

# x_test is the non-null rows of all the other columns in the same indices of the null stalk root feature rows
x_test = df_other_columns.loc[y_pred.index]

In [4]:
import warnings
warnings.filterwarnings('ignore')

#Instantiate an object of the OneHotEncoder with drop parameter set to first
cat_encoder = OneHotEncoder(drop = 'first', handle_unknown = 'ignore')

#Call the fit_transform() method and pass categorical data, data_cat
train_feature_encoder = cat_encoder.fit_transform(x_train).toarray()
test_feature_encoder = cat_encoder.transform(x_test).toarray()

#Call labelencode method to response data 
response_label = LabelEncoder()
y_train = y_train.values.flatten() 
train_response_encoder = response_label.fit_transform(y_train)

In [5]:
#Train knn model
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(train_feature_encoder,train_response_encoder)

KNeighborsClassifier()

In [6]:
#Have the knn model to make a prediction about the missing values
predicted = knn.predict(test_feature_encoder)
predicted = predicted.ravel()

#Use inverse_transform() function of ordinal encoder to get the categorical 
#values back from the numerical dtype
stalkroot_categorical = response_label.inverse_transform(predicted)

#Replace nan with prediction values
df_stalkrootpred = pd.DataFrame(stalkroot_categorical, columns = ['stalk-root'])
df_stalkrootpred.index = y_pred.index

#The complete column of stalk-root is now without nan value
y_train = pd.DataFrame(y_train,columns = ['stalk-root'])
df_stalk_root = pd.concat([y_train,df_stalkrootpred])

In [7]:
#Create full feature dataset by concat x_train and x_test
df_x = pd.concat([x_train, x_test])

#Put the complete 'stalk-root' column back to the orginal feature dataset
df_stalk_root.index = df_x.index
df_org = pd.concat([df_stalk_root,df_x],axis =1)

#Add back dropped target feature,'class' column
df_class = pd.DataFrame(mushrooms['class'])
df = pd.concat([df_class,df_org],axis =1)

#Print the first 10 rows of new complete dataset with no any nan
missing_values = df.head(10)
missing_values

Unnamed: 0,class,stalk-root,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,e,x,s,n,t,p,f,c,n,...,s,w,w,p,w,o,p,k,s,u
1,e,c,x,s,y,t,a,f,c,b,...,s,w,w,p,w,o,p,n,n,g
2,e,c,b,s,w,t,l,f,c,b,...,s,w,w,p,w,o,p,n,n,m
3,p,e,x,y,w,t,p,f,c,n,...,s,w,w,p,w,o,p,k,s,u
4,e,e,x,s,g,f,n,f,w,b,...,s,w,w,p,w,o,e,n,a,g
5,e,c,x,y,y,t,a,f,c,b,...,s,w,w,p,w,o,p,k,n,g
6,e,c,b,s,w,t,a,f,c,b,...,s,w,w,p,w,o,p,k,n,m
7,e,c,b,y,w,t,l,f,c,b,...,s,w,w,p,w,o,p,n,s,m
8,p,e,x,y,w,t,p,f,c,n,...,s,w,w,p,w,o,p,k,v,g
9,e,c,b,s,y,t,a,f,c,b,...,s,w,w,p,w,o,p,k,s,m


Question_1: 
Yes,if one-hot encoded the response data instead, it's still able to train the KNN model.
The integer encoding is insufficient for categorical variables where there is no such 
ordinal relationship.In practice, allowing the version to assume natural class ordering 
and using this encoding may result in poor overall performance or unexpected results 
(predictions halfway among classes).In this situation, a one-time encoding of the
integer representation could be used. The integer encoded variable is removed for 
each specified integer value, and a new binary variable is supplied.

In [8]:
#Common import and split updated dataset without any nan value
from sklearn.model_selection import train_test_split
x=df.iloc[:,df.columns!='class']
y=df.iloc[:,0]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [9]:
#Instantiate an object of the OneHotEncoder with drop parameter set to first
onehot_encoder = OneHotEncoder(drop = 'first', handle_unknown = 'ignore')

#Call the fit_transform() method and pass categorical data, data_cat
train_feature_encoder = onehot_encoder.fit_transform(x_train).toarray()
test_feature_encoder = onehot_encoder.transform(x_test).toarray()

#Call the LabelEncoder() method
response_label = LabelEncoder()
train_response_encoder = response_label.fit_transform(y_train)

Question_2:
No, you cannot train both models on RandomForestClassifier and logistic regression algorithms if you one-hot encoded the response data.One-hot coding creates a dummy variable trap because the final results of one variable can be easily predicted using the remaining variables. The dummy variable trap is a situation in which variables are highly correlated with each other. The dummy variable trap leads to a problem called multicollinearity. Multicollinearity occurs, in which there is a dependency between the independent characteristics. Multicollinearity is a serious problem in devices that extract knowledge from models.Here I almost reveal how the multicollinearity problem occurs after the one-hot encoding.

In [10]:
#Common import and train the random forest classifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_feature_encoder,train_response_encoder)
rf_pred = rf.predict(test_feature_encoder)

In [11]:
#Train the logistic regression model
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver="lbfgs")
clf.fit(train_feature_encoder,train_response_encoder)
clf_pred = clf.predict(test_feature_encoder)

In [12]:
%%timeit
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_feature_encoder,train_response_encoder)
rf_pred = rf.predict(test_feature_encoder)

478 ms ± 33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver="lbfgs")
clf.fit(train_feature_encoder,train_response_encoder)
clf_pred = clf.predict(test_feature_encoder)

95.6 ms ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Returning Categorical value of random forest model
imputed_rf = rf_pred.reshape(-1, 1)
categorical_rfpred = response_label.inverse_transform(imputed_rf.ravel())
# Returning Categorical value of logistic regression model
imputed_clf = clf_pred.reshape(-1, 1)
categorical_clfpred = response_label.inverse_transform(imputed_clf.ravel())

In [15]:
#Print classification report of random forest model
from sklearn.metrics import classification_report
print(classification_report(categorical_rfpred,y_test))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       843
           p       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [16]:
#Print classification report of logistic regression model
print(classification_report(categorical_clfpred,y_test))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       843
           p       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



Both random forest and logistic regression performed very efficiently in providing good predictions. Accuracy, precision and recall all 100%. Our large set of predictors we have given is very good at predicting.

In [17]:
from sklearn.decomposition import PCA
# 95% of variance
pca = PCA(n_components = 0.95)
x_train_pca = pca.fit_transform(train_feature_encoder)
x_test_pca = pca.transform(test_feature_encoder)
print(pca.explained_variance_ratio_)

[0.18059977 0.10526527 0.08823667 0.05326606 0.04504662 0.04062511
 0.03645102 0.03125304 0.02663138 0.02524412 0.02312577 0.02174843
 0.01979873 0.01835232 0.01818592 0.01747714 0.01567503 0.01492226
 0.01414152 0.01285241 0.01261076 0.01156224 0.01073068 0.01014324
 0.00995045 0.00970005 0.00919718 0.00906698 0.0082216  0.00802871
 0.00702482 0.00691255 0.00652527 0.00554139 0.0054683  0.00525617
 0.00460621 0.00430221]


In [18]:
#Print out orginal data shape and transformed data shape after pca
print("original shape:   ", test_feature_encoder.shape)
print("transformed shape:", x_test_pca.shape)

original shape:    (1625, 94)
transformed shape: (1625, 38)


In [19]:
#Calculate how many percentage have reduced of dimensions 
reduced = np.subtract(test_feature_encoder.shape[1], x_test_pca.shape[1])
number = reduced/(test_feature_encoder.shape[1])
# Print the result of how many percentage have reduced of dimensions 
print(f"{number:.0%}")

60%


In [20]:
#This is how many dimensions left with after reducing dimensionality
pca.n_components_

38

In [21]:
#Train the Random Forest Classifier on reduced dataset
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_pca,train_response_encoder)
rf_pred_pca = rf.predict(x_test_pca)

In [22]:
#Train the Logistic Regression on reduced dataset
clf = LogisticRegression(solver="lbfgs")
clf.fit(x_train_pca,train_response_encoder)
clf_pred = clf.predict(x_test_pca)

In [23]:
%%timeit
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_pca,train_response_encoder)
rf_pred_pca = rf.predict(x_test_pca)

3.41 s ± 741 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%%timeit
clf = LogisticRegression(solver="lbfgs")
clf.fit(x_train_pca,train_response_encoder)
clf_pred = clf.predict(x_test_pca)

61.9 ms ± 4.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [25]:
#Returning Categorical value of random forest model on the reduced dataset
pca_rf = rf_pred_pca.reshape(-1, 1)
pca_rfpred = response_label.inverse_transform(pca_rf.ravel())
#Returning Categorical value of logistic regression model on the reduced dataset
pca_clf = clf_pred.reshape(-1, 1)
pca_clfpred = response_label.inverse_transform(pca_clf.ravel())

In [26]:
#Print classification report of random forest model on reduced dataset 
print(classification_report(pca_rfpred,y_test))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       843
           p       1.00      1.00      1.00       782

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625



In [27]:
#Print classification report of logistic regression model on reduced dataset 
print(classification_report(pca_clfpred,y_test))

              precision    recall  f1-score   support

           e       0.99      0.99      0.99       845
           p       0.99      0.99      0.99       780

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625



In [28]:
#Confirm accuracy of prediction using logistic regression on reduced dataset 
from sklearn.metrics import accuracy_score
print("Accuracy of pca logistic regression :", accuracy_score(y_test, pca_clfpred))
# confirm accuracy of prediction using random forest model on reduced dataset 
print("Accuracy of pca random forest:", accuracy_score(y_test, pca_rfpred))

Accuracy of pca logistic regression : 0.9926153846153846
Accuracy of pca random forest: 1.0


In [29]:
#Common import 
from tabulate import tabulate
#create tabulate table
print(tabulate({'Model':['Random Forest', None,None,None,'Logistic Regression'], 
                'Item':['Accuracy', 'Precision','Recall','Time(ms)','Accuracy', 
                         'Precision','Recall','Time(ms)'], 
                 'Full Data':[1,1,1,433,1,1,1,81.7],
                 'PCA Reduced':[1,1.00,1.00,2600,0.99,0.99,0.99,46.1]},
               headers = ['Model', 'Item','Full Data','PCA Reduced'],tablefmt='plain'))

Model                Item         Full Data    PCA Reduced
Random Forest        Accuracy           1             1
                     Precision          1             1
                     Recall             1             1
                     Time(ms)         433          2600
Logistic Regression  Accuracy           1             0.99
                     Precision          1             0.99
                     Recall             1             0.99
                     Time(ms)          81.7          46.1


For processing time, we can see logistic regression model takes plenty much less time than the random forest model on each full data and reduced data. Logistic regression is processed quicker on reduced data than on full data. Random forest processed slower on reduced data than full data.

For accuracy, precision, and recall values, the random forest model is very properly estimated on each full data and reduced data. Logistic regression worked very properly on full data too, however on reduced data, all these values are still good with slightly(0.01) much less in contrast with random forest algorithm. Since all accuracy, precision, and recall values are either 100% or 99% from two different algorithm on both full and reduced data, there is no obvious evidence of overfitting. 

I can conclude that logistic regression processed a lot quicker than the random forest algorithm with the equal excellent accuracy, precision, and recall values that the random forest model produced on full data. However logistic regression algorithm produced slightly less good values on reduced data than the random forest model made. The random forest algorithm was superior and more accurate for mushroom dataset classification on both full data and reduced data.