In [1]:
# Import statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn import metrics

In [2]:
# Data dictionary: maps the single-letter codes found in selected categorical columns
# to human-readable labels. Kept purely as a reference while reading the data.
# Spelling of three labels corrected: 'silky', 'rhizomorphs', 'evanescent'.
data_dict = {
    'cap-shape': {'b': 'bell', 'c': 'conical', 'x': 'convex', 'f': 'flat', 's': 'sunken', 'p': 'spherical', 'o': 'others'},
    'cap-surface': {'i': 'fibrous', 'g': 'grooves', 'y': 'scaly', 's': 'smooth', 'd': 'dry', 'h': 'shiny', 'l': 'leathery',
                    'k': 'silky', 't': 'sticky', 'w': 'wrinkled', 'e': 'fleshy'},
    'stem-root': {'b': 'bulbous', 's': 'swollen', 'c': 'club', 'u': 'cup', 'e': 'equal', 'z': 'rhizomorphs', 'r': 'rooted'},
    'ring-type': {'c': 'cobwebby', 'e': 'evanescent', 'r': 'flaring', 'g': 'grooved', 'l': 'large', 'p': 'pendant',
                  's': 'sheathing', 'z': 'zone', 'y': 'scaly', 'm': 'movable', 'f': 'none', '?': 'unknown'},
    'season': {'s': 'spring', 'u': 'summer', 'a': 'autumn', 'w': 'winter'},
    'habitat': {'g': 'grasses', 'l': 'leaves', 'm': 'meadows', 'p': 'paths', 'h': 'heaths', 'u': 'urban', 'w': 'waste', 'd': 'woods'},
}

In [3]:
# Load the raw mushroom data from the CSV file.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
mushroom_baseDF = pd.read_csv(
    'mushroom_overload.csv',
    low_memory=False,
)

In [4]:
# Inspect which columns the raw frame has and how large it is
column_index = mushroom_baseDF.columns
frame_shape = mushroom_baseDF.shape
print(f"Columns: {column_index}", "\n")
print(f"DataFrame shape: {frame_shape}")

Columns: Index(['class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object') 

DataFrame shape: (6723116, 21)


In [5]:
# Report, column by column, how many entries are missing in the raw frame
for column_name in mushroom_baseDF.columns:
    missing_count = mushroom_baseDF[column_name].isna().sum()
    print(f"Number of null-values in {column_name}: {missing_count}")

Number of null-values in class: 0
Number of null-values in cap-diameter: 0
Number of null-values in cap-shape: 0
Number of null-values in cap-surface: 1579337
Number of null-values in cap-color: 0
Number of null-values in does-bruise-or-bleed: 0
Number of null-values in gill-attachment: 1066721
Number of null-values in gill-spacing: 2800709
Number of null-values in gill-color: 0
Number of null-values in stem-height: 0
Number of null-values in stem-width: 0
Number of null-values in stem-root: 5763499
Number of null-values in stem-surface: 4263506
Number of null-values in stem-color: 0
Number of null-values in veil-type: 6367343
Number of null-values in veil-color: 5893024
Number of null-values in has-ring: 0
Number of null-values in ring-type: 276729
Number of null-values in spore-print-color: 6049365
Number of null-values in habitat: 0
Number of null-values in season: 0


In [6]:
# Narrow the raw frame down to the columns relevant to this analysis
selected_columns = ['cap-shape', 'cap-surface', 'stem-root', 'ring-type', 'season', 'habitat']
mushroom_DF = mushroom_baseDF[selected_columns]
mushroom_DF.head()

Unnamed: 0,cap-shape,cap-surface,stem-root,ring-type,season,habitat
0,f,g,s,g,a,d
1,x,h,s,g,u,d
2,f,h,s,p,a,d
3,f,g,s,g,u,d
4,x,h,s,p,w,d


In [7]:
# Keep only complete rows. The goal is to test whether the selected categorical variables
# can predict the dependent variable, so any row with a missing value in one of them is discarded.
mushroom_DF_cleaned = mushroom_DF.dropna(axis='index', how='any')

In [8]:
# Checking the shape of the cleaned frame, i.e. how many rows are left after dropping rows with nulls
mushroom_DF_cleaned.shape

(643352, 6)

In [9]:
# Confirm the cleaned frame has no missing values left in any column
for column_name in mushroom_DF_cleaned.columns:
    remaining_nulls = mushroom_DF_cleaned[column_name].isna().sum()
    print(f"Number of null-values in {column_name}: {remaining_nulls}")

Number of null-values in cap-shape: 0
Number of null-values in cap-surface: 0
Number of null-values in stem-root: 0
Number of null-values in ring-type: 0
Number of null-values in season: 0
Number of null-values in habitat: 0


In [10]:
# Separate the independent variables (X) from the dependent variable (y)
feature_columns = ['cap-shape', 'cap-surface', 'stem-root', 'ring-type', 'season']
X = mushroom_DF_cleaned[feature_columns]
# y stays a one-column DataFrame so it can still be addressed as y['habitat'].
# .copy() makes it an independent frame: assigning into a bare column selection
# of mushroom_DF_cleaned can raise pandas' SettingWithCopyWarning.
y = mushroom_DF_cleaned[['habitat']].copy()

In [11]:
# Label-encode the dependent variable and print the code -> category mapping.
label_encoder = LabelEncoder()
# Build y as a fresh one-column frame from the cleaned data instead of assigning into the
# existing y with .loc: that assignment writes into a column selection (SettingWithCopyWarning)
# and, depending on the pandas version, may keep the encoded integers in an object-dtype column.
# Reading from mushroom_DF_cleaned also makes this cell safe to re-run, because the encoder
# always sees the original letter codes rather than already-encoded values.
y = pd.DataFrame(
    {'habitat': label_encoder.fit_transform(mushroom_DF_cleaned['habitat'])},
    index=mushroom_DF_cleaned.index,
)
print("Original categories and their encoded values:", dict(enumerate(label_encoder.classes_)))

Original categories and their encoded values: {0: 'd', 1: 'g', 2: 'h', 3: 'l', 4: 'm'}


In [12]:
# One-hot encode the categorical predictors, then expose the result as a NumPy array
X = pd.get_dummies(data=X)
X_array = X.to_numpy()

In [13]:
# Set up a shuffled 10-fold splitter with a fixed seed, and a generator over its train/test index pairs
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
data_split_object = kf.split(X=X_array, y=y)

In [16]:
# Just visualizing the first train/test split.
# Draw from a fresh kf.split(...) call instead of the shared data_split_object generator:
# each next() on the shared generator consumes a fold, so re-running this cell would show a
# different fold every time and eventually raise StopIteration. Because kf has a fixed
# random_state, a fresh call yields the same first fold.
my_split_data = next(kf.split(X_array, y))
print("Train indicies are: ", my_split_data[0][0:10])
print("Test indicies are: ", my_split_data[1][0:10])

Train indicies are:  [ 0  1  2  3  4  5  8  9 10 11]
Test indicies are:  [  6   7  14  36  52  54  63  76  81 102]


In [17]:
# Cross-validate a logistic-regression classifier over the folds produced by kf.
# After the loop, predicted_y and expected_y hold the out-of-fold predictions and the
# matching true labels, in the order the folds were processed.
predicted_y = []
expected_y = []

# Work with the target as a 1-D array rather than a one-column DataFrame:
#  - scikit-learn expects a 1-D y and warns when it is handed a column vector;
#  - iterating a DataFrame yields its column names, not its rows (see expected_y below).
# infer_objects() restores an integer dtype in case the encoded labels are stored in an
# object-dtype column.
y_labels = y['habitat'].infer_objects().to_numpy()

for train_index, test_index in kf.split(X_array, y_labels):
    X_train, X_test = X_array[train_index], X_array[test_index]
    y_train, y_test = y_labels[train_index], y_labels[test_index]

    # NOTE(review): newer scikit-learn releases deprecate the multi_class argument;
    # confirm against the installed version before removing it.
    classifier = LogisticRegression(multi_class='multinomial', random_state=42)
    classifier.fit(X_train, y_train)

    predicted_this_fold = classifier.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predicted_this_fold)
    print(f"Accuracy from this fold is: {accuracy}")

    predicted_y.extend(predicted_this_fold)
    # y_test is a 1-D array here, so this appends the true labels themselves.
    # Extending with the original one-column DataFrame appended only the string
    # 'habitat' once per fold.
    expected_y.extend(y_test)