# Mushroom Project

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
%matplotlib inline
import warnings
warnings.simplefilter('ignore')

In [None]:
pd.options.display.max_columns = None
df = pd.read_csv('agaricus-lepiota.csv')
df.head()

We cannot infer any information looking at the dataset since the columns data are encoded. we will have to see 
each coolumn and give a proper name as per the data present in the column. we will refer agaricus-lepiota.names 
present in the UCI repository.
https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/

In [None]:
# unique values in column p
df['p'].unique()

In column p it is classified that if the mushroom is edible or poisonous as per the information avaialable in the UCI repository

In [None]:
df.rename(columns={'p':'class'}, inplace=True)
df.head()

let us also label encode the column class

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['class'])
print(encoder.classes_)
df['class'] = encoder.transform(df['class'])
df.head()

In the column 'class' 0 is encoded as Edible and 1 is encoded as Poison

Looking at the rest columns and from the information from the UCI repository we can see that the lables in the data represent the color, structure, place where grown etc of the mushrooms. All the columns are catogorical so lets encode then to convert into numerical one.

since p.2 have only one value p (0 after encoding) we can drop it as it will not affect our analysis

In [None]:
df.drop(['p.2'],axis=1,inplace=True)

In [None]:
# creating a list of columns to be converted into the numerical varibale
cols=['x', 's', 'n', 't', 'p.1', 'f', 'c', 'n.1', 'k', 'e', 'e.1','s.1', 's.2', 'w', 'w.1', 'w.2', 
      'o', 'p.3', 'k.1', 's.3', 'u']

In [None]:
# using for loop we will pass each column and encode it.
for i in range(len(cols)):
    encoder = LabelEncoder()
    encoder.fit(df[cols[i]])
    df[cols[i]] = encoder.transform(df[cols[i]])

In [None]:
df.head()

In [None]:
corr = df.corr()
plt.figure(figsize=(20,15))
sns.heatmap(corr,vmax=0.6, square=True,annot=True,fmt='.2f')
plt.show()

The columns which shows very less correlation with the class column are:
'x','n'
Hence, we can remove these columns from our dataframe.

In [None]:
df_new = df.drop(['x','n'],axis=1)
df_new.head()

In [None]:
# let us check the correltation plot once again to verify we have all the columns which are correlated to the
#target column class
corr = df_new.corr()
plt.figure(figsize=(20,15))
sns.heatmap(corr,vmax=1,linecolor='black',linewidths=0.5,square=True,annot=True,fmt='.2f')
plt.show()

# Splitting the dataset into x and y

In [None]:
x = df_new.drop(['class'], axis=1)
y = df_new['class']

In [None]:
# Finding the best random state
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

best_rstate=0
accu=0

for i in range(30,200):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.25,random_state=i)
    mod=LogisticRegression()
    mod.fit(x_train,y_train)
    y_pred = mod.predict(x_test)
    tempaccu=accuracy_score(y_test,y_pred)
    if tempaccu > accu:
        accu = tempaccu
        best_rstate = i
        
print(f"Best Accuracy {accu*100} found on Random state {best_rstate}")

We found the best random state at 121 with Best Accuracy of 96.1% and will be using it in train_test_split in next step

In [None]:
# creating train test split using the best random state which we found above
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=121)

# Importing Classification libraries for model building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
algo = [LogisticRegression,RandomForestClassifier,SGDClassifier,DecisionTreeClassifier,
        KNeighborsClassifier,GaussianNB]

for each in algo:
    model = each()
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print(f"{model}\n",classification_report(y_test, y_pred))
    print("\n")

LogisticRegression has good accuracy score of 96%, so we will go for LogisticRegression

In [None]:
# creating parameter list to pass in GridSearchCV
parameters = {"max_iter" : [500,700,900,1100,1300,1500],
              "penalty" : ["elasticnet","l1","l2"]}

In [None]:
from sklearn.model_selection import GridSearchCV

GCV = GridSearchCV(LogisticRegression(),parameters,cv=5,scoring='r2') # initializing GridsearchCV
GCV.fit(x_train,y_train)
GCV.best_estimator_   # Finding best estimators
GCV_pred = GCV.best_estimator_.predict(x_test) #predicting the values using best estimators found by gridserchcv
print("Final Accuracy: ",accuracy_score(y_test,GCV_pred)*100) # final accuracy