In [40]:
#import modules needed to handle data and encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

class Model:
    def __init__(self, model_type=None):
        if model_type=='rf':
            self.user_defined_model = RandomForestClassifier(oob_score=True)
        elif model_type=='SVC':
            self.user_defined_model = SVC(kernel='rbf')
        elif model_type=='dt':
            self.user_defined_model = DecisionTreeClassifier(max_depth=4 , min_samples_leaf=3)
        elif model_type=='KNN':
            self.user_defined_model = KNeighborsClassifier(n_neighbors=2,leaf_size=20, algorithm='kd_tree',p=1)
        else:
            self.user_defined_model = LogisticRegression(random_state=42)
    
    #read data from file, create data frame
    def df(self, datafile):
        self.df = pd.read_csv(datafile)
    
    #create target, encode categories
    def create_target(self, column):
        self.y = self.df[column]
        encoder = LabelEncoder()
        encoder.fit(self.y)
        new_data = encoder.transform(self.y)
        new_data.reshape(-1,1)
    
    #create data frame with features, drop columns which are unnecessary or are target
    def create_X(self, column):
        self.X = self.df.drop(columns=column)
    
    #function which normalize categories number; for more than 4 categories in one column 
    #it calculates percentage value and for less than 5.5% category name is replaced with 'other'
    def categories_normalization(self):
        for column in self.X.columns:
            if self.X[column].nunique()>4:
                cat = self.X[column].unique()
                number_for_cat = self.X[column].value_counts()
                data_len = len(self.X[column])
                list_to_replace = []
                for i in range(len(cat)):
                    cat_in_perc = round(number_for_cat[i]/data_len*100, 2)
                    if cat_in_perc <= 5.5:
                        list_to_replace.append(cat[i])
                    else:
                        pass
                self.X[column].replace(to_replace=list_to_replace, value='other', inplace=True)
            else:
                pass

    #function which takes data frame, column name and scaler name and return data frame 
    #with encoded column using specified scaler        
    def data_scaler(self, column_header, scaler_name):
        column = np.array(self.X[column_header]).reshape(-1,1)
        scaler = scaler_name()
        scaler.fit(column)
        new_data = scaler.transform(column)
        self.X[column_header + "_scal"] = pd.Series(new_data[:,0])
        self.X.drop(columns=column_header, inplace=True)
    
    #function which takes data frame, column name and encoder name and return data frame 
    #with encoded column using specified encoder
    def data_encoder(self, encoding_function_name):
        for column in self.X.columns:
            if encoding_function_name==OneHotEncoder:
                column_ = np.array(self.X[column]).reshape(-1,1)
                encoder = encoding_function_name(sparse=False)
                encoder.fit(column_)
                categories = encoder.categories_
                new_data = encoder.transform(column_)
                for i in range(len(categories[0])):
                    self.X[column + "_" + categories[0][i]] = pd.Series(new_data[:,i])
                self.X.drop(columns=column, inplace=True)
            elif encoding_function_name==LabelEncoder:
                column_ = self.X[column]
                encoder = encoding_function_name()
                encoder.fit(column_)
                new_data = encoder.transform(column_)
                new_data.reshape(-1,1)
                self.X[column + "_enc"] = pd.Series(new_data)
                self.X.drop(columns=column, inplace=True)
            elif encoding_function_name==OrdinalEncoder:
                column_ = np.array(self.X[column]).reshape(-1,1)
                encoder = encoding_function_name()
                encoder.fit(column_)
                new_data = encoder.transform(column_)
                self.X[column + "_enc"] = pd.Series(new_data[:,0])
                self.X.drop(columns=column, inplace=True)
            else:
                print("Encoder is not specified in function")
        
    def split(self, test_size):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = test_size, random_state = 42, shuffle=True)

    def fit(self):
        self.model = self.user_defined_model.fit(self.X_train, self.y_train)
    
    def predict(self):
        result = self.user_defined_model.predict(self.X_test)
        return result
    
    #checking score of chosen model on test data
    def score(self):
        m_score = self.model.score(self.X_test, self.y_test)
        return f"Accuracy score for model is {m_score}"

In [34]:
#Random forest model
model_instance = Model(model_type='rf')
model_instance.df("mushrooms.csv")
model_instance.create_target("class")
model_instance.create_X(["class", "gill-attachment", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "spore-print-color"])
model_instance.categories_normalization()
model_instance.data_encoder(OneHotEncoder)
model_instance.split(0.2)
model_instance.fit()
model_instance.predict()
model_instance.score()

'Accuracy score for model is 1.0'

In [35]:
#Logistic regression model
model_instance2 = Model()
model_instance2.df("mushrooms.csv")
model_instance2.create_target("class")
model_instance2.create_X(["class", "gill-attachment", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "spore-print-color"])
model_instance2.categories_normalization()
model_instance2.data_encoder(OneHotEncoder)
model_instance2.split(0.2)
model_instance2.fit()
model_instance2.predict()
model_instance2.score()

'Accuracy score for model is 0.9981538461538462'

In [36]:
#SVC rbf model
model_instance3 = Model("SVC")
model_instance3.df("mushrooms.csv")
model_instance3.create_target("class")
model_instance3.create_X(["class", "gill-attachment", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "spore-print-color"])
model_instance3.categories_normalization()
model_instance3.data_encoder(OneHotEncoder)
model_instance3.split(0.2)
model_instance3.fit()
model_instance3.predict()
model_instance3.score()

'Accuracy score for model is 1.0'

In [37]:
#KNN model
model_instance4 = Model("KNN")
model_instance4.df("mushrooms.csv")
model_instance4.create_target("class")
model_instance4.create_X(["class", "gill-attachment", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "spore-print-color"])
model_instance4.categories_normalization()
model_instance4.data_encoder(OneHotEncoder)
model_instance4.split(0.2)
model_instance4.fit()
model_instance4.predict()
model_instance4.score()

'Accuracy score for model is 0.9993846153846154'

In [42]:
#Decision Tree model
model_instance5 = Model("dt")
model_instance5.df("mushrooms.csv")
model_instance5.create_target("class")
model_instance5.create_X(["class", "gill-attachment", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", 
                      "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "spore-print-color"])
model_instance5.categories_normalization()
model_instance5.data_encoder(OneHotEncoder)
model_instance5.split(0.2)
model_instance5.fit()
model_instance5.predict()
model_instance5.score()

'Accuracy score for model is 0.9895384615384616'