In [1]:
#import modules needed to handle data and encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [50]:
#ml project read data from csv file
df = pd.read_csv("mushrooms.csv")

In [51]:
#check general info for data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [52]:
#check number of unique categories for each column
df.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [53]:
#check data frame description
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [35]:
#prepare data for ml
X = df.drop(columns="class")

In [36]:
#prepare target for ml
y = df["class"]

In [54]:
#function which takes data frame, column name and encoding function and return data frame 
#with encoded column using specified encoder
#for int and floats function expects MinMaxScaler or StandardScaler
def data_encoder(data, column_header, encoding_function_name):
    if encoding_function_name==OneHotEncoder:
        column = np.array(data[column_header]).reshape(-1,1)
        encoder = encoding_function_name(sparse=False)
        encoder.fit(column)
        categories = encoder.categories_
        new_data = encoder.transform(column)
        for i in range(len(categories[0])):
            data[column_header + "_" + categories[0][i]] = pd.Series(new_data[:,i])
        data.drop(columns=column_header, inplace=True)
    elif (data[column_header].dtype.name != 'category') and ((data[column_header].dtype.name != 'object')):
        column = np.array(data[column_header]).reshape(-1,1)
        encoder = encoding_function_name()
        encoder.fit(column)
        new_data = encoder.transform(column)
        data[column_header + "_scal"] = pd.Series(new_data[:,0])
        data.drop(columns=column_header, inplace=True)
    else:
        print("Type of data does not match encoder!")
    return data

In [42]:
#function which normalize categories number; for more than 4 categories in one column 
#it calculates percentage value and for less than 5.5% category name is replaced with 'other'
def categories_normalization(data, column):
    if data[column].nunique()>4:
        cat = data[column].unique()
        number_for_cat = data[column].value_counts()
        data_len = len(data[column])
        list_to_replace = []
        for i in range(len(cat)):
            cat_in_perc = round(number_for_cat[i]/data_len*100, 2)
            if cat_in_perc <= 5.5:
                list_to_replace.append(cat[i])
            else:
                pass
        data[column].replace(to_replace=list_to_replace, value='other', inplace=True)
    else:
        pass
    return data

In [43]:
#change categories if they're very low in percentage for whole data frame
for column in X.columns:
    categories_normalization(X, column)

In [47]:
#encode whole df with OneHotEncoder as we only have categorical data
for column in X.columns:
    data_encoder(X, column, OneHotEncoder)

In [48]:
X.nunique()

cap-shape_b        2
cap-shape_f        2
cap-shape_other    2
cap-shape_s        2
cap-shape_x        2
                  ..
habitat_d          2
habitat_g          2
habitat_m          2
habitat_other      2
habitat_u          2
Length: 87, dtype: int64