In [19]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Custom Transformer for Adding Custom Aggregations
class CustomAggregations(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        X['mean_feature'] = X.mean(axis=1)
        X['sum_feature'] = X.sum(axis=1)
        X['median_feature'] = X.median(axis=1)
        X['std_feature'] = X.std(axis=1)
        X['max_feature'] = X.max(axis=1)
        X['min_feature'] = X.min(axis=1)
        return X.values

# Load Data
training_data = pd.read_csv('../data/training_data.csv', delimiter=';')
test_data = pd.read_csv('../data/test_data_no_target.csv', delimiter=';')

# Convert Numerical Columns from strings to floats
def convert_to_float(df):
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['Group', 'Class', 'Perform']:
            df[col] = df[col].str.replace(',', '.').astype(float)
    return df

training_data = convert_to_float(training_data)
test_data = convert_to_float(test_data)

# Handle Missing Values using Median Imputation
training_data.fillna(training_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

# One-Hot Encoding for the 'Group' column
training_data = pd.get_dummies(training_data, columns=['Group'])
test_data = pd.get_dummies(test_data, columns=['Group'])

# Ensure the test set has the same columns as the training set
missing_cols = set(training_data.columns) - set(test_data.columns) - {'Class', 'Perform'}
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[training_data.columns.drop(['Class', 'Perform'])]

# Separate features and target
X_train = training_data.drop(columns=['Class', 'Perform'])
y_train = training_data['Class'] + 1


In [20]:
# Feature Engineering
pipeline = Pipeline([
    ('feature_engineering', FeatureUnion([
        ('scaler', StandardScaler()),
        # ('custom_aggregations', CustomAggregations())
    ])),
    ('xgb_model', XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='mlogloss'))
])

# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train XGBoost Model
pipeline.fit(X_train_split, y_train_split)

In [21]:
# Get CatBoost predictions as a new feature
xgb_train_pred = pipeline.predict_proba(X_train_split)[:, 1]
xgb_val_pred = pipeline.predict_proba(X_val_split)[:, 1]
xgb_test_pred = pipeline.predict_proba(test_data)[:, 1]

# Add CatBoost predictions as a feature
X_train_split['xgb_pred'] = xgb_train_pred
X_val_split['xgb_pred'] = xgb_val_pred
test_data['xgb_pred'] = xgb_test_pred

In [22]:
print("Training Catboost Model...")
# Train LightGBM Model
cat_model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0)
cat_model.fit(X_train_split, y_train_split)

Training Catboost Model...


<catboost.core.CatBoostClassifier at 0x310dcf310>

In [23]:
# Evaluate the LightGBM model
y_val_pred = cat_model.predict(X_val_split)
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, average='weighted')
recall = recall_score(y_val_split, y_val_pred, average='weighted')
f1 = f1_score(y_val_split, y_val_pred, average='weighted')

print(f'LightGBM Model - Accuracy: {accuracy}')
print(f'LightGBM Model - Precision: {precision}')
print(f'LightGBM Model - Recall: {recall}')
print(f'LightGBM Model - F1 Score: {f1}')

LightGBM Model - Accuracy: 0.448125
LightGBM Model - Precision: 0.43430099695927127
LightGBM Model - Recall: 0.448125
LightGBM Model - F1 Score: 0.4300878945193263


In [24]:
from metrics import calculate_custom_error
print(calculate_custom_error(y_val_split, y_val_pred))

0.884375


In [25]:
print(np.unique(y_val_pred, return_counts=True))

(array([0, 1, 2]), array([ 415,  183, 1002]))


In [26]:
# Make predictions on the test set
y_test_pred = cat_model.predict(test_data) - 1
np.savetxt('predictions_lgb.txt', y_test_pred, fmt='%d', newline='\n')

# Export models
joblib.dump(pipeline, 'xgb_model.pkl')
joblib.dump(cat_model, 'lgb_model.pkl')

['lgb_model.pkl']

In [17]:
print(np.unique(y_test_pred, return_counts=True))

(array([-1,  0,  1]), array([ 737,  158, 1105]))
