## Mushroom Classification
The goal of this activity is to create a model that predicts whether the mushroom is edible or poisonous.

* 1.1 Exploration of Data

In [1]:
import numpy as np;
import pandas as pd;
import seaborn as sns;
import matplotlib.pyplot as plt;

In [2]:
# Load the raw mushroom dataset; the CSV is expected next to this notebook.
df = pd.read_csv('mushrooms.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'mushrooms.csv'

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

In [None]:
# Check whether the target classes are imbalanced.

df['class'].value_counts()

In [None]:
# Plot the number of samples per class.
# The column is passed by keyword because recent seaborn releases interpret the
# first positional argument as `data`, not `x`.
ax = sns.countplot(x='class', data=df)
ax.set(title='Mushroom class distribution',
       xlabel='class (e = edible, p = poisonous)',
       ylabel='count');

### 2. Data Pre-Processing
* 2.1 Data Cleaning 
* 2.1.1 Drop Duplicates

In [None]:
# Check the shape (rows, columns) of the dataframe before dropping duplicates.
df.shape

In [None]:
# Drop duplicate rows. Rebinding (instead of `inplace=True`) keeps the step
# explicit and avoids mutating a frame that earlier cells already displayed.
df = df.drop_duplicates()

In [None]:
# Confirm that no duplicate rows remain (the count should be 0).
df.duplicated().sum()

* 2.1.2 Fill-in Missing Values

In [None]:
# Count missing values in each column.
df.isna().sum()

### As we can see, the dataset does not have any missing values in any of its features

In [None]:
# Terminology: the columns are the features;
# the `class` column is the target variable.

* 2.1.3 Removing Extreme Values

In [None]:
# Check the dtype of every column to see whether any feature is numerical.
df.dtypes

### Since we do not have any numerical features, we will skip "removing extreme values" step

* 2.2 Converting Categorical Features into numerical features

In [None]:
### We will use label encoding: we loop over the feature columns and replace each category with an integer code (0 to n_categories - 1)

In [None]:
#import
from sklearn.preprocessing import LabelEncoder

In [None]:
# List the column names that will be encoded.
df.columns

In [None]:

# Work on a copy so that the original (string-valued) data is preserved.
df_new = df.copy()

# One encoder instance, re-fitted for every feature column in the next cell.
encoder = LabelEncoder()


In [None]:
# Label-encode every feature column; the target `class` is handled separately.
feature_names = [name for name in df_new.columns if name != 'class']
for name in feature_names:
    df_new[name] = encoder.fit_transform(df_new[name])

In [None]:
# Preview the label-encoded features.
df_new.head(5)

In [None]:
# Encode the target: 'p' (poisonous) -> 1, anything else (edible) -> 0.
# The labels are read from the untouched `df` (which `df_new` was copied from)
# so that re-running this cell is idempotent; comparing the already-encoded
# `df_new['class']` against 'p' a second time would turn every label into 0.
df_new['class'] = np.where(df['class'] == 'p', 1, 0)

In [None]:
# Preview again after converting the target column.
df_new.head(5)

In [None]:
# Check the column dtypes after encoding.
df_new.dtypes

* 2.3 Data Normalization / Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale a copy so the label-encoded frame stays available unchanged.
scaler = MinMaxScaler()
df_scaled = df_new.copy()

In [None]:
# Min-max scale every feature column in a single call. MinMaxScaler works
# column-wise, so the values match the earlier per-column loop, without
# re-fitting the scaler once per column or reshaping each column to 2-D.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test rows influence the scaling.
feature_cols = [col for col in df_scaled.columns if col != 'class']
df_scaled[feature_cols] = scaler.fit_transform(df_scaled[feature_cols])

In [None]:
# Preview the scaled features.
df_scaled.head(5)

* FEATURE SELECTION / Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between all columns.
sns.heatmap(df_scaled.corr());

In [None]:
# Remove the `veil-type` column before re-plotting the correlations.
# NOTE(review): presumably dropped because it carries no information — confirm.
df_scaled = df_scaled.drop(columns=['veil-type'])

In [None]:
# Correlation heatmap again, now without `veil-type`.
sns.heatmap(df_scaled.corr());

In [None]:
# Pairwise correlations between all remaining columns.
corr = df_scaled.corr()
# Mask the upper triangle and the diagonal so each pair appears only once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
corr_mask = corr.mask(np.triu(np.ones_like(corr, dtype=bool)))
corr_unstacked = corr_mask.unstack().sort_values(ascending=False)
# Keep only pairs whose correlation exceeds 0.75 (masked NaNs fail the comparison).
# NOTE(review): strongly negative correlations are not captured by this filter.
corr_df = pd.DataFrame(corr_unstacked[corr_unstacked > .75]).sort_index()

In [None]:
# Show the highly correlated column pairs.
corr_df

In [None]:
# Drop `gill-attachment` to remove the highly correlated feature pair.
df_scaled = df_scaled.drop(columns=['gill-attachment'])

In [None]:
# Final correlation heatmap after dropping `gill-attachment`.
sns.heatmap(df_scaled.corr());

### 3. DATA MODELLING
* 3.1 Split the target variable from the feature variables

In [None]:
# `class` is the target variable; every other column is a feature.
y = df_scaled['class']
X = df_scaled.drop(columns=['class'])

* 3.2 Split the dataset into training and test sets
* (this step applies when doing supervised learning)

In [None]:
#importing model selection for training and test
from sklearn.model_selection import train_test_split

In [None]:
# 80% of the rows go to training. `stratify=y` keeps the class proportions
# the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.80,
    stratify=y,
    random_state=42,
)

In [None]:
# Show the shape of the full feature matrix, then of the train and test splits.
for frame in (X, X_train, X_test):
    print(frame.shape)

* 3.3 Choose The Best Model 

* 3.3.1 Naive Bayes

In [None]:
#gaussian naive bayes
from sklearn.naive_bayes import GaussianNB

In [None]:
#for evaluation

from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report # so that we will not compute the accuracy
from sklearn.metrics import confusion_matrix

try:
    # plot_roc_curve was removed in scikit-learn 1.2; keep the name importable
    # on older installs without breaking this cell on newer ones.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
def evaluate_model(model):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the classification report and confusion matrix for the test split,
    the training and test accuracy scores, and then draws the ROC curve on the
    test split.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``score``.
    """
    # Predict once and reuse the result for both the report and the matrix.
    y_pred = model.predict(X_test)
    print('Classification Report\n',classification_report(y_test, y_pred, target_names=['edible','poisonous']))
    print('Confusion Matrix \n',confusion_matrix(y_test, y_pred, labels=[0,1]))
    print('\nTraining Accuracy Score: ', model.score(X_train, y_train))
    print('Test Accuracy Score: ', model.score(X_test, y_test))
    print('\n ROC Curve')
    # plot_roc_curve was removed in scikit-learn 1.2; this is its replacement.
    RocCurveDisplay.from_estimator(model, X_test, y_test)

In [None]:
# Fit Gaussian Naive Bayes on the training split, then show the test accuracy.
gnb = GaussianNB().fit(X_train, y_train)
gnb.score(X_test, y_test)

In [None]:
evaluate_model(gnb)
# For classification, always look at the F1 score when judging accuracy.
# Observation: not smooth (presumably refers to the ROC curve; confirm).

* 3.3.2 Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Fit Bernoulli Naive Bayes on the training split, then show the test accuracy.
bnb = BernoulliNB().fit(X_train, y_train)
bnb.score(X_test, y_test)

In [None]:
evaluate_model(bnb)
# Observation: not smooth (presumably refers to the ROC curve; confirm).

* 3.3.3 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the tree so the fitted model and its score are reproducible on re-run
# (same seed as the train/test split).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

In [None]:
evaluate_model(dt)
# Observation: not good, because the prediction will not be accurate.

* 3.3.4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so the fitted model and its score are reproducible on re-run
# (same seed as the train/test split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
evaluate_model(rf)
# Observation: smooth, but not recommended, same as the decision tree.

* 3.3.5 KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit k-nearest neighbours on the training split, then show the test accuracy.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
# Print the test metrics and draw the ROC curve for the KNN model.
evaluate_model(knn)

* 3.3.6 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit logistic regression on the training split, then show the test accuracy.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
evaluate_model(lr)
# Observation: not so smooth (presumably refers to the ROC curve; confirm).

### 4. Hyperparameter Tuning / Cross Validation
To tune our model into a better model

* 4.1.1 Decision Tree Classifier Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space for the decision tree.
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best','random'],
    'max_depth': [5,10,15,20]
}

# Randomized search: 15 sampled configurations, 10-fold cross-validation each.
# Both the search and the estimator are seeded so `best_params_` is
# reproducible across re-runs.
rsearch = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter = 15,
    cv = 10,
    random_state = 42,
)
rsearch.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the decision-tree search.
rsearch.best_params_

In [None]:
# Build the tuned tree directly from the search result instead of copying the
# values by hand, so it cannot drift from what `rsearch` actually found.
# Seeded for reproducibility.
dt_tuned = DecisionTreeClassifier(**rsearch.best_params_, random_state=42)

In [None]:
# Fit the tuned decision tree on the training split.
dt_tuned.fit(X_train, y_train)

In [None]:
# Print the test metrics and draw the ROC curve for the tuned decision tree.
evaluate_model(dt_tuned)

* 4.1.2 RANDOM FOREST CLASSIFIER HYPERPARAMETER TUNING

In [None]:
# List the random forest's parameters to see what can be tuned.
rf.get_params()

In [None]:
# Search space for the random forest.
params = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [5,10,15,20,30,50,100],
    'max_depth': [5,10,15,20]
}

# Randomized search: 15 sampled configurations, 10-fold cross-validation each.
# Both the search and the estimator are seeded so `best_params_` is
# reproducible across re-runs.
rsearch_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    params,
    n_iter = 15,
    cv = 10,
    random_state = 42,
)
rsearch_rf.fit(X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the random-forest search.
rsearch_rf.best_params_

In [None]:
# The original cell left `n_estimators` without a value, which is a syntax
# error. Build the tuned forest directly from the search result so every
# hyper-parameter comes from `rsearch_rf`. Seeded for reproducibility.
rf_tuned = RandomForestClassifier(**rsearch_rf.best_params_, random_state=42)

In [None]:
# Fit the tuned random forest on the training split.
rf_tuned.fit(X_train, y_train)

In [None]:
# Print the test metrics and draw the ROC curve for the tuned random forest.
evaluate_model(rf_tuned)

In [None]:
#cross validation 
from sklearn.model_selection import cross_val_score

In [None]:
# 20-fold cross-validation of the tuned decision tree on the training split;
# the cell displays the mean score across folds.
scores = cross_val_score(dt_tuned, X_train, y_train, cv=20)
scores.mean()

### 5. Choosing a model / conclusion


* Since the cross-validation score of the models that gained 100% accuracy on both the training and test datasets is still 100%, we can conclude that the best model for this data is either the DecisionTree, RandomForest, or KNN classifier algorithm

In [None]:
# Golden Rule
# When the data is clean, the model is good.
# Choose the best-performing model, i.e. the one with the highest score.