In [8]:
# Importing required modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.model_selection
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as ttsplit

%matplotlib inline

In [9]:
#Reading CSV and converting to dataframe
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', 
                  sep = ',', 
                  header = None, 
                  usecols = [0,3,5], 
                  names = ["Mushroom_Class", "Cap_Color", "Odor"])
data.head(5)

Unnamed: 0,Mushroom_Class,Cap_Color,Odor
0,p,n,p
1,e,y,a
2,e,w,l
3,p,w,p
4,e,g,n


In [10]:
## Converting data to numeric values
data.replace(to_replace={"Mushroom_Class":{'p':1, 'e':0}}, inplace = True)
data.replace(to_replace={"Cap_Color":{'n':0, 'b':1, 'c':2, 'g':3, 'r':4, 'p':5, 'u':6, 'e':7, 'w':8, 'y':9}}, inplace=True)
data.replace(to_replace={"Odor":{'a':0, 'l':1, 'c':2, 'y':3, 'f':4, 'm':5, 'n':6, 'p':7, 's':8}}, inplace=True)
data.head(5)

Unnamed: 0,Mushroom_Class,Cap_Color,Odor
0,1,0,7
1,0,9,0
2,0,8,1
3,1,8,7
4,0,3,6


In [11]:
# Counting edible and poisionus mushrooms
count = data['Mushroom_Class'].value_counts()
count

0    4208
1    3916
Name: Mushroom_Class, dtype: int64

In [12]:
# Converting two predictor into dummy variables
m_color = pd.Series(data['Cap_Color'])
c = pd.get_dummies(m_color)

odor = pd.Series(data['Odor'])
o = pd.get_dummies(odor)

# Combining both into a new column
mushroom_data = pd.concat([c, o, data['Mushroom_Class']], axis = 1)

mushroom_data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,Mushroom_Class
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [13]:
# counting data shapes. result = 8124 rows and 20 columns
mushroom_data.shape

(8124, 20)

In [15]:
x = mushroom_data.iloc[:, :-1].values # defining x values for training model
y = mushroom_data.iloc[:, 1].values # defining y value for training model
X_train, X_test, Y_train, Y_test = ttsplit(x,y, random_state=1)

print(X_train.shape)
print(X_test.shape)

(6093, 19)
(2031, 19)


In [16]:
print(Y_train.shape)
print(Y_test.shape)

(6093,)
(2031,)


In [17]:
lr = sklearn.linear_model.LinearRegression()
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
t = [1,0]
p = [1,0]
print(sklearn.metrics.mean_absolute_error(t,p))
print(sklearn.metrics.mean_squared_error(t, p))
print(np.sqrt(sklearn.metrics.mean_squared_error(t, p)))

0.0
0.0
0.0


In [18]:
print(np.sqrt(metrics.mean_squared_error(Y_test, y_pred)))

3.7554262791171435e-16


In [19]:
# Train and Test with "CAP COLOR" feature
X = mushroom_data.iloc[:, 0:9].values
Y = mushroom_data.iloc[:, 1].values

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, random_state=1)
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

3.873135371241152e-16


In [20]:
# Train and Test with "ODOR" feature 
X = mushroom_data.iloc[:, 10:18].values
Y = mushroom_data.iloc[:, 1].values

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X, Y, random_state=1)
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

print(np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

0.14928001774973673


In [21]:
# The ODOR feature can be used to predict edible or poisonous mushroom 
# because the square root is less than COLOR error. 
# Less error means a better or more accurate prediction.