In [10]:
import numpy as np
import sklearn as skl
import pandas as pd
import itertools

In [11]:
def process_raw_data(csv_path='mushrooms.csv'):
    """Load the mushroom data set and one-hot encode labels and features.

    Rows containing any missing value are discarded before encoding.  Both
    calls to ``pd.get_dummies`` use ``drop_first=True``, so the first category
    of every encoded column is left out (a column with *k* categories yields
    *k - 1* indicator columns).

    Parameters
    ----------
    csv_path : str or path-like, optional
        CSV file to read.  It must contain a ``class`` column, which is used
        as the label; all remaining columns are used as features.  Defaults
        to ``'mushrooms.csv'``, the path that used to be hard-coded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_one_hot_class, df_one_hot_features)`` -- the encoded label
        column(s) followed by the encoded feature columns.  The row index of
        the CSV is preserved (it is not reset after dropping rows).
    """
    # Chained, non-mutating calls: no intermediate frame is modified in place.
    raw = pd.read_csv(csv_path).dropna()

    labels = raw['class']
    features = raw.drop(columns=['class'])

    df_one_hot_class = pd.get_dummies(labels, drop_first=True)
    df_one_hot_features = pd.get_dummies(features, drop_first=True)
    return df_one_hot_class, df_one_hot_features

In [12]:
def exclude_correlated_attributes(df_one_hot_features, threshold=0.75,
                                  columns_path='remaining_columns.txt'):
    """Drop features that are strongly positively correlated with a later one.

    A column is removed when its Pearson correlation with at least one column
    positioned *after* it is ``>= threshold``; of each correlated pair the
    earlier column is the one discarded.  Only positive correlation is
    tested -- strongly negative correlations do not trigger a drop.  The
    surviving columns are sorted by name and their names are written, one per
    line, to ``columns_path``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df_one_hot_features : pandas.DataFrame
        Feature matrix whose columns are compared pairwise.
    threshold : float, optional
        Correlation at or above which the earlier column of a pair is
        dropped.  Defaults to ``0.75``.
    columns_path : str or path-like, optional
        File that receives the names of the remaining columns.  Defaults to
        ``'remaining_columns.txt'``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input restricted to the remaining columns, ordered by
        column name.
    """
    corr = df_one_hot_features.corr()

    # The correlation of a pair of columns does not depend on which other
    # columns are present, so one matrix is enough.  Repeatedly dropping the
    # first column of the first offending pair and rescanning removes exactly
    # the columns that have an offending partner to their right, which is
    # what the mask below selects.  NaN correlations compare as False.
    later_pairs = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    too_correlated = corr.ge(threshold).values & later_pairs
    columns_to_drop = corr.index[too_correlated.any(axis=1)]

    # drop() without inplace returns a new frame, leaving the caller's intact.
    remaining = df_one_hot_features.drop(columns=columns_to_drop)
    sorted_remaining_columns = remaining.columns.sort_values()
    remaining = remaining[sorted_remaining_columns]

    # The with-block closes the file; no explicit close() is needed.
    with open(columns_path, 'w') as f:
        f.writelines('{}\n'.format(column) for column in sorted_remaining_columns)

    return remaining

In [13]:
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

def train_model(df_one_hot_features, df_one_hot_class,
                model_path='mushroom_model.pkl', min_accuracy=0.9):
    """Grid-search a multinomial naive Bayes classifier and optionally save it.

    The data is split 80/20 (``random_state=42``).  A 4-fold cross-validated
    grid search over the smoothing parameter ``alpha`` is run on the training
    part, and the best estimator is scored on the held-out part.  When that
    accuracy is strictly greater than ``min_accuracy`` the estimator is
    refitted on the complete data set and pickled to ``model_path``.

    Parameters
    ----------
    df_one_hot_features : pandas.DataFrame
        Feature matrix.
    df_one_hot_class : pandas.DataFrame or array-like
        Labels; they are flattened to a 1-D array of length ``len(y)``, so a
        frame is expected to hold a single column.
    model_path : str or path-like, optional
        Destination of the pickled model.  Defaults to
        ``'mushroom_model.pkl'``.
    min_accuracy : float, optional
        Held-out accuracy the model must exceed to be retrained on all data
        and saved.  Defaults to ``0.9``.

    Returns
    -------
    sklearn.naive_bayes.MultinomialNB
        The best estimator found by the grid search.  It is returned whether
        or not it passed the accuracy check; only a passing model has been
        refitted on the full data and written to disk.
    """
    def reshape_classes(y):
        # Flatten the labels to shape (len(y),); raises if y holds more
        # than one value per row.
        return np.array(y).reshape(len(y))

    X_train, X_test, y_train, y_test = train_test_split(
        df_one_hot_features, df_one_hot_class,
        test_size=0.2, random_state=42
    )
    params = {'alpha': [1, 0.7, 0.4, 0.1]}

    grid = GridSearchCV(MultinomialNB(), param_grid=params, scoring='accuracy', cv=4)
    grid.fit(X=X_train, y=reshape_classes(y_train))

    best_model = grid.best_estimator_
    held_out_accuracy = accuracy_score(
        y_true=reshape_classes(y_test),
        y_pred=best_model.predict(X_test),
    )

    if held_out_accuracy > min_accuracy:
        # Good enough: retrain on the whole data set before persisting.
        best_model.fit(X=df_one_hot_features, y=reshape_classes(df_one_hot_class))
        # The with-block guarantees the pickle is flushed and the handle
        # closed, which a bare open() passed to pickle.dump did not.
        with open(model_path, 'wb') as model_file:
            pickle.dump(best_model, model_file)
    return best_model

In [14]:
if __name__ == '__main__':
    # Pipeline: load and encode the data, prune correlated features, then
    # fit (and possibly persist) the classifier.
    (df_class, df_features) = process_raw_data()
    df_features = exclude_correlated_attributes(df_one_hot_features=df_features)
    trained_model = train_model(
        df_one_hot_features=df_features,
        df_one_hot_class=df_class,
    )
    # Print the second row (position 1) of the pruned feature matrix.
    print([*df_features.iloc[1]])

[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]
