## Mushroom Classification
The goal of this activity is to build a model that predicts whether a mushroom is edible or poisonous.

* 1.1 Exploration of Data

In [1]:
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import seaborn as sns;

In [2]:
# Dataset location, relative to the notebook's working directory.
DATA_PATH = 'mushrooms.csv'
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'mushrooms.csv'

In [None]:
# Preview the first rows (head() returns 5 rows by default).
df.head()

In [None]:
# Number of rows per value of the target column 'class'.
df['class'].value_counts()

#### 2. Data Pre-processing

* 2.1. Data Cleaning

* 2.1.1. Drop Duplicates

In [None]:
# (rows, columns) of the raw frame, for comparison with the shape shown
# after duplicates are dropped.
df.shape

In [None]:
# Remove fully duplicated rows; the original row labels are kept (no index reset).
df = df.drop_duplicates()

In [None]:
# (rows, columns) after de-duplication.
df.shape

* 2.1.2. Fill in missing values

In [None]:
# Count of missing (NaN) entries per column.
df.isna().sum()

We can see that the dataset has no missing values in any of its features.

* 2.1.3. Removing extreme values

In [None]:
# Column dtypes, used to check whether any feature is numerical.
df.dtypes

Since we do not have any numerical features, we will skip the "removing of extreme values" step.

* 2.2. Converting categorical features into numerical features

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode into a copy so the raw frame `df` is left untouched.
df_new = df.copy()
encoder = LabelEncoder()

In [None]:
# Integer-encode every feature column; the encoder is re-fitted per column.
# The target column 'class' is skipped and encoded separately.
for col in df_new.columns:
    if col == 'class':
        continue
    df_new[col] = encoder.fit_transform(df_new[col])

In [None]:
# Binary target: 1 where the class is 'p', 0 otherwise.
df_new['class'] = (df_new['class'] == 'p').astype(int)

In [None]:
# Confirm the dtypes of the columns after encoding.
df_new.dtypes

* 2.3 Data Normalization / Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale into a copy so the encoded (unscaled) frame `df_new` stays available.
scaler = MinMaxScaler()
df_scaled = df_new.copy()

In [None]:
# Min-max scale each feature column on its own; the target 'class' is left as is.
for col in df_scaled.columns:
    if col == 'class':
        continue
    # The scaler expects a 2-D input, hence the single-column reshape.
    column_2d = df_scaled[col].values.reshape(-1, 1)
    df_scaled[col] = scaler.fit_transform(column_2d)

In [None]:
# Inspect the scaled frame.
df_scaled

* 2.4 Feature Selection / Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between all columns of df_scaled
# (features and the target).
sns.heatmap(df_scaled.corr())

In [None]:
# Remove the 'veil-type' column.
# NOTE(review): presumably dropped because the heatmap above shows it carries
# no usable correlation — confirm against the rendered plot.
df_scaled = df_scaled.drop(columns=['veil-type'])

In [None]:
# Correlation heatmap again, now without 'veil-type'.
sns.heatmap(df_scaled.corr())

In [None]:
# List column pairs whose absolute correlation exceeds 0.75.
corr = df_scaled.corr()
# Hide the diagonal and the upper triangle so each pair appears only once.
upper_and_diag = np.triu(np.ones_like(corr, dtype=bool))
corr_mask = corr.mask(upper_and_diag)
corr_unstacked = corr_mask.unstack().sort_values(ascending=False)
is_strong = corr_unstacked.abs() > .75
corr_df = pd.DataFrame(corr_unstacked[is_strong]).sort_index()

In [None]:
# Column pairs with |correlation| > 0.75.
corr_df

In [None]:
# Drop 'gill-attachment' to remove a strongly correlated feature
# (presumably one of the pairs listed in corr_df — confirm against its output).
df_scaled = df_scaled.drop(columns=['gill-attachment'])

In [None]:
# Separate the target from the features.
y = df_scaled['class']
X = df_scaled.drop(columns=['class'])

In [None]:
from sklearn.metrics import confusion_matrix;
from sklearn.metrics import RocCurveDisplay; #plot_roc_curve
from sklearn.metrics import classification_report;

def evaluate_model(model,X,y):
    """Fit ``model`` on a stratified 80/20 split of ``X``/``y`` and report on it.

    Prints the classification report, the confusion matrix and the
    training/test scores, then draws the ROC curve for the held-out part.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
        Fitted in place on the training part of the split.
    X : feature matrix, one row per sample.
    y : target labels aligned with ``X``; label 0 is reported as 'edible'
        and label 1 as 'poisonous'.

    Returns
    -------
    None
    """
    # Imported locally because the notebook only imports train_test_split in
    # a later cell; without this the function raises NameError when the
    # notebook is run top to bottom on a fresh kernel.
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, train_size=.80, stratify=y)
    model.fit(X_train, y_train)

    # Predict once and reuse the result for both reports.
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred,
                                target_names=['edible','poisonous']))
    print(confusion_matrix(y_test, y_pred, labels=[0,1]))
    print('Training score:', model.score(X_train, y_train))
    print('Test score:', model.score(X_test, y_test))
    RocCurveDisplay.from_estimator(model, X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier;

# Baseline random forest on all remaining features. The seed is fixed because
# this same `rf` is reused to compare the feature-selection variants below;
# without it, score differences could come from the forest's own randomness.
rf = RandomForestClassifier(random_state=42)
evaluate_model(rf,X,y)

In [None]:
#information gain
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each feature and the target.
importances = mutual_info_classif(X, y)
# Label the scores with X's own columns: they are exactly the features that
# were scored, in the same order. Slicing df_scaled.columns is only correct
# when 'class' happens to be the last column.
feat_importances = pd.Series(importances, index=X.columns)
feat_importances.plot(kind='barh')

In [None]:
# Feature set without the columns judged to have low information gain.
low_gain_features = ['veil-color','stalk-color-below-ring','gill-color','cap-surface','cap-shape']
X_ig = X.drop(columns=low_gain_features)

In [None]:
# Re-evaluate the random forest on the information-gain feature subset.
evaluate_model(rf,X_ig,y)

In [None]:
#CHI SQUARE TEST
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the k features with the highest chi-squared score against the target.
chi2_features = SelectKBest(score_func=chi2, k=6)  # k = number of features to retain
X_chi2 = chi2_features.fit(X, y).transform(X)

evaluate_model(rf, X_chi2, y)


In [None]:
# CORRELATION
# Absolute correlation of every column with the target 'class'.
corr = df_scaled.corr()
corr['class'] = corr['class'].abs()
corr['class']

In [None]:
# Feature set without the columns excluded by the correlation analysis.
excluded_by_corr = ['cap-shape','cap-color','odor','cap-surface','stalk-shape','stalk-color-above-ring',
                    'stalk-color-below-ring','veil-color','ring-number','spore-print-color','habitat']
X_corr = X.drop(columns=excluded_by_corr)

In [None]:
# (rows, remaining features) after the correlation-based selection.
X_corr.shape

In [None]:
# Re-evaluate the random forest on the correlation-based feature subset.
evaluate_model(rf,X_corr,y)

In [None]:
#VARIANCE THRESHOLD
from sklearn.feature_selection import VarianceThreshold

# Fit a variance filter with a 0.10 cut-off and fetch the boolean
# keep/drop mask, one entry per column of X.
v_threshold = VarianceThreshold(threshold=0.10).fit(X)
sp = v_threshold.get_support()

In [None]:
# Feature names, aligned position by position with the support mask `sp`.
col = X.columns

In [None]:
# Keep only the columns the variance filter marked as supported.
X_thres = X.loc[:, sp].copy()

In [None]:
# Columns that passed the variance threshold.
X_thres.columns

In [None]:
# Re-evaluate the random forest on the variance-threshold feature subset.
evaluate_model(rf,X_thres,y)

In [None]:
# MAD: mean absolute difference of each feature from its own column mean.
deviation = X - X.mean(axis=0)
mean_abs_diff = deviation.abs().sum(axis=0) / X.shape[0]

# One bar per feature, in column order.
plt.bar(np.arange(X.shape[1]), mean_abs_diff);

In [None]:
# Feature names, in the same order as the bars of the MAD plot.
X.columns

In [None]:
from sklearn.model_selection import cross_val_score;

In [None]:
from sklearn.decomposition import PCA

# Project the features onto 3 principal components and wrap them in a
# DataFrame with named columns.
pca = PCA(n_components = 3);
X_pca = pca.fit_transform(X);
pca_df = pd.DataFrame(data = X_pca, columns = ['PCA1','PCA2','PCA3']);

# Hold-out evaluation followed by the mean 10-fold cross-validation score.
# Note the projection was fitted on all rows before any splitting.
evaluate_model(rf,pca_df,y)
print(cross_val_score(rf,pca_df,y,cv=10).mean())

In [None]:
# 3-D scatter of the three principal components, coloured by the target.
fig, ax = plt.subplots(subplot_kw={'projection': '3d'})

ax.scatter(pca_df['PCA1'], pca_df['PCA2'], pca_df['PCA3'],
           c=y, marker='o')

In [None]:
from sklearn.decomposition import FastICA

# Reduce the features to 3 independent components with FastICA.
ica =  FastICA(n_components = 3);
X_ica = ica.fit_transform(X);

# Hold-out evaluation, then the mean 10-fold cross-validation score
# (shown as the cell output).
evaluate_model(rf,X_ica,y)
cross_val_score(rf,X_ica,y,cv=10).mean()

In [None]:
# Wrap the ICA output in a DataFrame with named columns for plotting.
ica_df =  pd.DataFrame(data=X_ica,columns=['ICA1','ICA2','ICA3'])

In [None]:
# 3-D scatter of the three independent components, coloured by the target.
fig, ax = plt.subplots(subplot_kw={'projection': '3d'})

ax.scatter(ica_df['ICA1'], ica_df['ICA2'], ica_df['ICA3'],
           c=y, marker='o')

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Supervised projection of the features onto a single discriminant axis.
lda = LinearDiscriminantAnalysis(n_components=1)
X_lda = lda.fit_transform(X, y)

# Hold-out evaluation, then the mean 5-fold cross-validation score
# (shown as the cell output).
evaluate_model(rf, X_lda, y)
cross_val_score(rf, X_lda, y, cv=5).mean()

In [None]:
# Wrap the LDA projection in a DataFrame with a named column.
lda_df = pd.DataFrame(data=X_lda, columns=['LDA'])
# Place the target next to the projection, one row per sample. axis=1 joins
# column-wise (the default axis=0 would stack the rows and fill with NaN).
# y's index is reset so rows pair up by position: lda_df has a fresh 0..n-1
# index, while y still carries the row labels of the de-duplicated frame.
dff = pd.concat([lda_df, y.reset_index(drop=True)], axis=1)

In [None]:
from sklearn.manifold import LocallyLinearEmbedding;

# Embed the features into 3 dimensions with locally linear embedding.
lle = LocallyLinearEmbedding(n_components=3);
X_lle = lle.fit_transform(X);

# Hold-out evaluation, then the mean 5-fold cross-validation score
# (shown as the cell output).
evaluate_model(rf,X_lle,y);
cross_val_score(rf,X_lle,y,cv=5).mean()

In [None]:
from sklearn.manifold import TSNE;

# Embed the features into 3 dimensions with t-SNE (verbose progress output).
# NOTE(review): recent scikit-learn releases renamed `n_iter` to `max_iter`
# for TSNE — confirm against the installed version before re-running.
tsne = TSNE(n_components=3,verbose=1,perplexity=40,n_iter=300);

X_tnse = tsne.fit_transform(X);

# Hold-out evaluation, then the mean 5-fold cross-validation score
# (shown as the cell output).
evaluate_model(rf,X_tnse,y);
cross_val_score(rf,X_tnse,y,cv=5).mean()

In [None]:
# Correlation heatmap of the final df_scaled (after 'veil-type' and
# 'gill-attachment' were dropped).
sns.heatmap(df_scaled.corr())

* 3. Modelling

* 3.1. Split target variable to feature variables

In [None]:
# Target vector and feature matrix used by every model in this section.
y = df_scaled['class']
X = df_scaled.drop(columns=['class'])

* 3.2. Split our dataset to training and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 split, stratified on the target so both parts keep the class
# proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=.80,
    stratify=y,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

* 3.3. Choose the best model

In [None]:
from sklearn.metrics import confusion_matrix;
from sklearn.metrics import RocCurveDisplay;
from sklearn.metrics import classification_report;

def evaluate_model(model):
    """Report how an already-fitted ``model`` performs on the notebook's split.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables and does not fit the model itself. Prints the
    classification report, the confusion matrix and the training/test
    scores, then draws the ROC curve for the test part.

    Note: this definition replaces the earlier three-argument
    ``evaluate_model(model, X, y)`` helper, so that call form stops working
    once this cell has run.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.

    Returns
    -------
    None
    """
    # Predict once and reuse the result for both reports.
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred,
                                target_names=['edible','poisonous']))
    print(confusion_matrix(y_test, y_pred, labels=[0,1]))
    print('Training score:', model.score(X_train, y_train))
    print('Test score:', model.score(X_test, y_test))
    RocCurveDisplay.from_estimator(model, X_test, y_test)



* 3.3.1. Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: train on the training split, report on the test split.
gnb = GaussianNB().fit(X_train, y_train)
evaluate_model(gnb)

* 3.3.2. BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes: train on the training split, report on the test split.
bnb = BernoulliNB().fit(X_train, y_train)
evaluate_model(bnb)

* 3.3.3. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings: train, then report on the test split.
dt = DecisionTreeClassifier().fit(X_train, y_train)
evaluate_model(dt)

* 3.3.4. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings: train, then report on the test split.
rf = RandomForestClassifier().fit(X_train, y_train)
evaluate_model(rf)

* 3.3.5. KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings: train, then report on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
evaluate_model(knn)

* 3.3.6. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: train, then report on the test split.
lr = LogisticRegression().fit(X_train, y_train)
evaluate_model(lr)

###### Hyperparameter Tuning / Cross Validation
Here we tune the model's hyperparameters and use cross-validation to check how well the tuned model generalizes.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

* 4.1.1. Decision Tree Classifier Hyperparameter Tuning

In [None]:
# Search space: 2 criteria x 2 splitters x 4 depths = 16 combinations.
params = {
    'criterion': ['gini','entropy'],
    'splitter': ['best','random'],
    'max_depth': [5,10,15,20]
}

# Sample 15 of the combinations, scoring each with 10-fold cross-validation
# on the training split. Both the search and the tree are seeded so that
# best_params_ is the same on every run.
rsearch = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                             n_iter=15, cv=10, random_state=42)
rsearch.fit(X_train,y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search.
rsearch.best_params_

In [None]:
# Build the tuned tree from the search result rather than hard-coded values,
# so this cell always matches what the search above actually selected.
dt_tuned = DecisionTreeClassifier(**rsearch.best_params_)

In [None]:
# Train the tuned tree on the training split.
dt_tuned.fit(X_train,y_train)

In [None]:
# Report the tuned tree's performance on the test split.
evaluate_model(dt_tuned)

In [None]:
from sklearn.model_selection import cross_val_score;

# 20-fold cross-validation of the tuned tree on the training split;
# the cell output is the mean of the fold scores.
scores = cross_val_score(dt_tuned, X_train,y_train, cv=20);
scores.mean()

5. Choosing a model
<br>Since the cross-validation score of the models that reached 100% accuracy on both the training and test datasets is still 100%, we can conclude that the best model for this data is the Decision Tree, Random Forest, or KNN classifier.

In [None]:
# Hyper-parameters of the KNN classifier trained above.
knn.get_params()

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each modelling feature and the target.
importances = mutual_info_classif(X, y)
# Label the scores with X's own columns. The raw frame `df` still contains
# 'veil-type' and 'gill-attachment', so an index built from df.columns has a
# different length than `importances` and pd.Series raises a ValueError.
feat_importances = pd.Series(importances, index=X.columns)
feat_importances.plot(kind='barh')