In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer

import matplotlib.pyplot as plt
%matplotlib inline 

# Seed NumPy's legacy global RNG so code that draws from np.random is repeatable.
# The split and the model further down pass their own explicit random_state values.
np.random.seed(0)

def load_csv(file_path):
    """Load a semicolon-delimited CSV whose numbers use a decimal comma.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``) of the file.

    Returns:
        pandas.DataFrame: Columns that ``read_csv`` left as text are converted to a
        numeric dtype when every value parses as a number once ',' is replaced by
        '.'. Columns that are genuine text are returned exactly as read, commas
        included.
    """
    data = pd.read_csv(file_path, delimiter=';')

    def _decimal_comma_to_numeric(column):
        """Convert one column to numeric if possible, otherwise return it unchanged."""
        # Columns read_csv already parsed as numbers need no work.
        if pd.api.types.is_numeric_dtype(column):
            return column
        candidate = column.replace(',', '.', regex=True)
        try:
            # Explicit try/except replaces the deprecated ``errors='ignore'``.
            return pd.to_numeric(candidate)
        except (ValueError, TypeError):
            # Not a numeric column: keep the original text instead of the
            # comma-substituted version.
            return column

    return data.apply(_decimal_comma_to_numeric)

In [2]:
def drop_columns_with_excessive_nans(dataframe, threshold=200):
    """Return ``dataframe`` without the columns that hold too many missing values.

    A column is removed when its NaN count is strictly greater than
    ``threshold``. The input frame is not modified; a new frame is returned.
    """
    missing_per_column = dataframe.isna().sum()
    is_too_sparse = (missing_per_column > threshold).to_numpy()
    return dataframe.drop(columns=missing_per_column.index[is_too_sparse])

In [3]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Misclassification costs: rows index the true class, columns the predicted
# class, in the label order used by the confusion matrix.
cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])


def calculate_custom_error(preds, gt, cost_matrix=cost_matrix, labels=None):
    """
    Calculate the average misclassification cost of a set of predictions.

    The confusion matrix of ``gt`` versus ``preds`` is multiplied element-wise
    by ``cost_matrix`` and the sum is divided by the number of samples.

    Args:
        preds (array-like): Predicted labels.
        gt (array-like): Ground truth (actual) labels.
        cost_matrix (array-like): Square matrix of costs; entry ``[i, j]`` is the
            cost of predicting class ``j`` when the true class is ``i``.
        labels (array-like, optional): Label order forwarded to
            ``confusion_matrix``. Pass the full label set when ``gt``/``preds``
            may not contain every class, so that the confusion matrix keeps the
            shape of ``cost_matrix``. ``None`` (default) uses the labels that
            occur in ``gt`` or ``preds``, in sorted order.

    Returns:
        float: The calculated error metric.

    Raises:
        ValueError: If ``gt`` is empty, or if the confusion matrix and the cost
            matrix have different shapes.
    """
    # Check for empty input first so the caller gets this message rather than a
    # shape-mismatch error from an empty confusion matrix.
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")

    # Accept nested lists as well as ndarrays.
    cost_matrix = np.asarray(cost_matrix)

    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds, labels=labels)

    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")

    # Cost-weighted count of every (true, predicted) pair, averaged per sample
    weighted_cm = cm * cost_matrix
    error = np.sum(weighted_cm) / total_samples
    return error


## Data Loader

In [18]:
train_data = load_csv('../data/training_data.csv')
# Keep only the selected indicator columns plus the group and outcome columns.
# The explicit .copy() makes the result an independent frame, so the 'Group'
# assignment below does not write into a selection of the loaded frame
# (which triggers SettingWithCopyWarning).
train_data = train_data[['I5', 'I8', 'I9', 'I18', 'I37', 'I38', 'I44', 'I47', 'I57', 'dI5',
       'dI6', 'dI23', 'dI25', 'dI28', 'dI35', 'dI40', 'dI42', 'dI46', 'dI47',
       'dI54', 'dI56', 'dI57', 'dI58', 'Group', 'Class', 'Perform']].copy()

# Encode the group labels as integers.
le = LabelEncoder()
train_data['Group'] = le.fit_transform(train_data['Group'])

# Cut outliers: values above a column's 97% quantile or below its 3% quantile
# are replaced by that quantile.
# NOTE(review): this is applied to every column, including the encoded 'Group'
# and the 'Class'/'Perform' outcome columns — confirm that is intended.
top_quantiles = train_data.quantile(0.97)
outliers_top = (train_data > top_quantiles)

low_quantiles = train_data.quantile(0.03)
outliers_low = (train_data < low_quantiles)

train_data = train_data.mask(outliers_top, top_quantiles, axis=1)
train_data = train_data.mask(outliers_low, low_quantiles, axis=1)

In [19]:
# Fill each missing value with the mean of its column within the same group;
# anything still missing afterwards is set to 0.
# NOTE(review): groupby(...).transform returns only the non-grouping columns,
# so 'Group' is presumably no longer a column after this cell — confirm.
train_data = (
    train_data
    .groupby(['Group'])
    .transform(lambda column: column.fillna(column.mean()))
    .fillna(0)
)

In [20]:
# Features: every remaining column except the two outcome columns.
X = train_data.drop(columns=['Class', 'Perform'])
# Target variable
y = train_data['Class']
# Kept separately; not used as a feature or as the target.
performs = train_data['Perform']

### Train/test split

In [21]:
# Hold out 20% of the rows, stratified on the class label so both parts keep
# the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Class labels and their counts in the training part
np.unique(y_train, return_counts=True)

(array([-1,  0,  1]), array([2477,  909, 3014]))

# Training

### Model Selection

In [22]:
from sklearn.neighbors import KNeighborsClassifier
state = 42

# Active candidate: a 100-tree random forest with balanced class weights.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=state,
    verbose=1,
)
# Other candidates (disabled):
# model = SVC(random_state=state, class_weight='balanced')
# model = KNeighborsClassifier(n_neighbors=3, weights='distance')

In [23]:
# Other classifiers that were tried (disabled):
# model = DecisionTreeClassifier(random_state=42)
# model = GaussianNB()

# Fit the selected model on the training split.
# Disabled variant (presumably a resampled training set): model.fit(X_train_res, y_train_res)
model.fit(X_train, y_train)

# Feature scaling is disabled. NOTE(review): if it is re-enabled it has to run
# before model.fit above; as placed here it would scale the data only after fitting.
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Predict on the held-out test split
y_pred = model.predict(X_test)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [24]:
# Score the held-out predictions: accuracy, per-class report, confusion matrix
# and the cost-weighted custom error.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Custom Error:", calculate_custom_error(y_pred, y_test))

Accuracy: 0.474375
Classification Report:
              precision    recall  f1-score   support

          -1       0.44      0.36      0.39       619
           0       0.33      0.00      0.01       227
           1       0.49      0.71      0.58       754

    accuracy                           0.47      1600
   macro avg       0.42      0.36      0.33      1600
weighted avg       0.45      0.47      0.43      1600

Confusion Matrix:
[[222   2 395]
 [ 70   1 156]
 [218   0 536]]
Custom Error: 0.90875


# Testing

In [87]:

test_data = load_csv('../data/test_data_no_target.csv')
# test_data = drop_columns_with_excessive_nans(test_data, 50)

# Encode the group labels as integers (a fresh encoder, fitted on the test file).
le = LabelEncoder()
test_data['Group'] = le.fit_transform(test_data['Group'])

# Cut outliers: per-column 3% / 97% quantiles of the test file itself.
# Both masks are computed from the unclipped data before either is applied.
low_quantiles = test_data.quantile(0.03)
top_quantiles = test_data.quantile(0.97)
outliers_top = (test_data > top_quantiles)
outliers_low = (test_data < low_quantiles)

test_data = test_data.mask(outliers_top, top_quantiles, axis=1)
test_data = test_data.mask(outliers_low, low_quantiles, axis=1)

In [88]:
# Fill each missing value with the mean of its column within the same group;
# anything still missing afterwards is set to 0.
test_data = (
    test_data
    .groupby(['Group'])
    .transform(lambda column: column.fillna(column.mean()))
    .fillna(0)
)

In [90]:
# Predict with the fitted model, using exactly the feature columns (and column
# order) it was trained on. Only `model` is used: no `model1` is defined anywhere
# in this notebook, and adding the class labels of two classifiers together
# does not produce valid class labels.
predicts = model.predict(test_data[X_train.columns])

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [92]:
# Save the predicted labels as integers to a text file.
np.savetxt('predicts_dummy.txt', np.asarray(predicts), fmt='%d', delimiter='\n')