In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack
import lightgbm as lgb
import numpy as np

In [3]:
class MushroomClassifier:
    def __init__(self, train_file, test_file, submission_file):
        self.train_file = train_file
        self.test_file = test_file
        self.submission_file = submission_file
        self.model = lgb.LGBMClassifier(n_estimators=100, max_depth=10, num_leaves=2**10 - 1, random_state=42)

        self.encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')

    def load_data(self):
        self.train_data = pd.read_csv(self.train_file)
        self.test_data = pd.read_csv(self.test_file)
        self.sample_submission = pd.read_csv(self.submission_file)

    def preprocess_data(self):
        # Fill missing values
        for column in self.train_data.columns:
            if self.train_data[column].dtype == 'object':
                self.train_data[column] = self.train_data[column].fillna(self.train_data[column].mode()[0])
            else:
                self.train_data[column] = self.train_data[column].fillna(self.train_data[column].mean())
        for column in self.test_data.columns:
            if self.test_data[column].dtype == 'object':
                self.test_data[column] = self.test_data[column].fillna(self.test_data[column].mode()[0])
            else:
                self.test_data[column] = self.test_data[column].fillna(self.test_data[column].mean())

        # Separate the target column
        y = self.train_data['class']
        self.train_data = self.train_data.drop(['class'], axis=1)

        # Identify categorical columns
        categorical_columns = self.train_data.select_dtypes(include=['object']).columns

        # Fit the encoder on the training data
        self.encoder.fit(self.train_data[categorical_columns])

        # Transform the training data
        train_encoded = self.encoder.transform(self.train_data[categorical_columns])
        train_non_categorical = self.train_data.drop(categorical_columns, axis=1)
        train_non_categorical_sparse = csr_matrix(train_non_categorical.values)
        self.train_data = hstack([train_non_categorical_sparse, train_encoded])

        # Transform the test data
        test_encoded = self.encoder.transform(self.test_data[categorical_columns])
        test_non_categorical = self.test_data.drop(categorical_columns, axis=1)
        test_non_categorical_sparse = csr_matrix(test_non_categorical.values)
        self.test_data = hstack([test_non_categorical_sparse, test_encoded])

        # Convert target column to numeric
        y = y.map({'e': 0, 'p': 1})
        self.y = y.values

    def train_model(self):
        X_train, X_val, y_train, y_val = train_test_split(self.train_data, self.y, test_size=0.2, random_state=42)
        self.model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        y_pred = self.model.predict(X_val)
        mcc = matthews_corrcoef(y_val, y_pred)
        print(f'Matthews Correlation Coefficient on validation set: {mcc}')

    def predict(self):
        predictions = self.model.predict(self.test_data)
        submission = pd.DataFrame({'id': self.test_data[:, 0].toarray().flatten(), 'class': predictions})
        submission['class'] = submission['class'].map({0: 'e', 1: 'p'})
        submission.to_csv('submission.csv', index=False)
        print('Submission file created: submission.csv')

In [4]:
if __name__ == "__main__":
    train_file = '../data/train.csv'
    test_file = '../data/test.csv'
    submission_file = '../data/sample_submission.csv'
    
    classifier = MushroomClassifier(train_file, test_file, submission_file)
    classifier.load_data()
    classifier.preprocess_data()
    classifier.train_model()
    classifier.predict()

[LightGBM] [Info] Number of positive: 1364404, number of negative: 1129152
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.517876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1576
[LightGBM] [Info] Number of data points in the train set: 2493556, number of used features: 282
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547172 -> initscore=0.189251
[LightGBM] [Info] Start training from score 0.189251
Matthews Correlation Coefficient on validation set: 0.9801885866854735
Submission file created: submission.csv
