In [1]:
#import modules needed to handle data and encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
# Load the mushroom dataset from "mushrooms.csv" (a path relative to the
# notebook's working directory) into the raw data frame `df`.
df = pd.read_csv("mushrooms.csv")

In [3]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
# Count the distinct values in each column. These counts drive the later
# decisions about merging rare categories and one-hot encoding.
df.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

In [5]:
# Feature matrix: every column except the target "class".
# drop() is called without inplace, so `df` itself is left unchanged here.
X = df.drop(columns="class")

In [6]:
def categories_normalization(data, column, max_categories=4, min_percent=5.5):
    """Merge rare categories of one column into a single 'other' category.

    The column is only touched when it has more than ``max_categories``
    distinct values. In that case every category whose share of the rows
    (in percent, rounded to two decimals) is less than or equal to
    ``min_percent`` is replaced with the string 'other'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the categorical column to normalize.
    max_categories : int, default 4
        Columns with at most this many distinct values are left as they are.
    min_percent : float, default 5.5
        Categories at or below this percentage of rows are merged.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    values = data[column]
    if values.nunique() <= max_categories:
        return data

    # value_counts() keeps every count attached to its own category label, so
    # the percentage looked up for a category is always that category's share
    # (pairing unique() with value_counts() by position mixes them up, because
    # the two are ordered differently).
    percentages = (values.value_counts() / len(values) * 100).round(2)
    rare_categories = percentages.index[percentages <= min_percent].tolist()

    if rare_categories:
        # Assign the result back to the frame instead of replacing in place on
        # the selected column, which may operate on a temporary copy.
        data[column] = values.mask(values.isin(rare_categories), 'other')
    return data

In [7]:
def data_scaler(data, column_header, scaler_name):
    """Scale one column with the given scaler class and replace it in the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_header : str
        Name of the column to scale.
    scaler_name : type
        Scaler class (not an instance) that can be built without arguments and
        offers ``fit`` and ``transform`` on a 2-D array.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, where ``column_header`` has been dropped and
        the scaled values are stored in ``column_header + "_scal"``.
    """
    # Scalers expect a 2-D (n_samples, 1) array.
    column = np.array(data[column_header]).reshape(-1, 1)
    scaler = scaler_name()
    scaler.fit(column)
    scaled = np.asarray(scaler.transform(column))
    # Assign the raw array so values are placed by row position. Wrapping them
    # in a new Series would align on a fresh 0..n-1 index and produce NaN or
    # misplaced values whenever `data` has a different index.
    data[column_header + "_scal"] = scaled[:, 0]
    data.drop(columns=column_header, inplace=True)
    return data

In [8]:
def data_encoder(data, column_header, encoding_function_name):
    """Encode one categorical column and replace it in the frame.

    Supported encoder classes and the columns they produce:

    * ``OneHotEncoder``  -> one ``"<column>_<category>"`` column per category
    * ``LabelEncoder``   -> a single ``"<column>_enc"`` column
    * ``OrdinalEncoder`` -> a single ``"<column>_enc"`` column

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column_header : str
        Name of the column to encode.
    encoding_function_name : type
        Encoder class (not an instance); one of the three classes above.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. For a supported encoder the original column
        is dropped and the encoded column(s) are appended. For any other value
        a message is printed and the frame is returned unchanged.
    """
    if encoding_function_name is OneHotEncoder:
        column = np.array(data[column_header]).reshape(-1, 1)
        # Build the encoder with its defaults and densify the result afterwards:
        # the keyword that requests dense output is `sparse` in older
        # scikit-learn releases and `sparse_output` in newer ones, so passing
        # either one breaks on the other.
        encoder = encoding_function_name()
        encoder.fit(column)
        encoded = encoder.transform(column)
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        encoded = np.asarray(encoded)
        # The f-string also copes with categories that are not strings.
        new_columns = {
            f"{column_header}_{category}": encoded[:, position]
            for position, category in enumerate(encoder.categories_[0])
        }
    elif encoding_function_name is LabelEncoder:
        # LabelEncoder works on the 1-D column directly.
        column = data[column_header]
        encoder = encoding_function_name()
        encoder.fit(column)
        new_columns = {column_header + "_enc": np.asarray(encoder.transform(column))}
    elif encoding_function_name is OrdinalEncoder:
        column = np.array(data[column_header]).reshape(-1, 1)
        encoder = encoding_function_name()
        encoder.fit(column)
        new_columns = {column_header + "_enc": np.asarray(encoder.transform(column))[:, 0]}
    else:
        print("Encoder is not specified in function")
        return data

    # Plain arrays are assigned by row position. Wrapping them in a new Series
    # would align on a fresh 0..n-1 index and yield NaN for any other index.
    for new_name, new_values in new_columns.items():
        data[new_name] = new_values
    data.drop(columns=column_header, inplace=True)
    return data

In [9]:
# Build the ML target: label-encode the "class" column of df and keep the
# resulting "class_enc" column as y. data_encoder hands back the frame it was
# given, with "class" dropped, so this cell cannot be re-run without reloading df.
y = data_encoder(df, "class", LabelEncoder)["class_enc"]

In [10]:
# Merge low-frequency categories into 'other' for every feature column of X.
# The helper returns the frame it was given, so X keeps pointing at the same object.
for column in X.columns:
    X = categories_normalization(X, column)

In [11]:
# All features are categorical, so one-hot encode each of them in turn.
# The column names are snapshotted first because data_encoder adds and drops
# columns of X while the loop is running.
for column in list(X.columns):
    X = data_encoder(X, column, OneHotEncoder)