In [225]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer

import matplotlib.pyplot as plt
%matplotlib inline 

np.random.seed(0)

def filter_outliers(data):
    """Filter out the outliers using IQR method.
    """  
    for column in data:
        if data[column].dtype in ['int64', 'float64']:
            Q1 = data[column].quantile(0.25)
            Q3 = data[column].quantile(0.75)
            IQR = Q3 - Q1
            outliers = data[(data[column] < (Q1 - 1.5 * IQR)) | (data[column] > (Q3 + 1.5 * IQR))]
            # Filter out the outliers
            data = data[(data[column] >= (Q1 - 1.5 * IQR)) & (data[column] <= (Q3 + 1.5 * IQR))]
    return data

def filter_outliers_by_group(data):
    filter_data = None
    groups = data['Group'].unique()
    for group in groups:
        group_data = data[data['Group'] == group]
        group_data = filter_outliers(group_data)
        if filter_data is None:
            filter_data = group_data
        else:
            filter_data = pd.concat([filter_data, group_data])
    return filter_data

def load_csv(file_path, remove_outliners=False, filter_by_group=False):
    # Load the data
    data = pd.read_csv(file_path, delimiter=';')

    # Replace commas in numeric columns and convert them to floats
    data = data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
    # if 'Perform' in data.columns:
    #     data.drop('Perform', axis=1, inplace=True)
    if filter_by_group:
        data = filter_outliers_by_group(data)
    elif remove_outliners:
        data = filter_outliers(data)
    return data

In [226]:
def drop_columns_with_excessive_nans(dataframe, threshold=200):
    """Drop columns from a DataFrame where the number of NaN values exceeds the specified threshold."""
    nan_counts = dataframe.isna().sum()
    columns_to_drop = nan_counts[nan_counts > threshold].index
    return dataframe.drop(columns=columns_to_drop)


In [338]:
import numpy as np
from sklearn.metrics import confusion_matrix

cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])
def calculate_custom_error(preds, gt, cost_matrix=cost_matrix):
    """
    Calculate a custom error metric based on a confusion matrix and a cost matrix.

    Args:
    preds (array-like): Predicted labels.
    gt (array-like): Ground truth (actual) labels.
    cost_matrix (numpy.ndarray): A matrix of costs associated with misclassifications.

    Returns:
    float: The calculated error metric.
    """
    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds)
    
    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")
    
    # Calculate weighted confusion matrix
    weighted_cm = cm * cost_matrix
    
    # Calculate the custom error
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")
    
    error = np.sum(weighted_cm) / total_samples
    return error


In [339]:
def feature_transformations(X):
    # Log transformation
    X = X.drop(columns=['Group'], axis=1)
    df_log_transformed = X.apply(lambda x: np.log(x + 1))  # x+1 to avoid log(0)

    # # Box-Cox or Yeo-Johnson transformation
    pt = PowerTransformer(method='yeo-johnson', standardize=True)  # Box-Cox requires strictly positive values
    X = pd.DataFrame(pt.fit_transform(X), columns=X.columns)
    return X

In [340]:
def group_wise_knn_imputation(df, group_column, n_neighbors=5):
    # Initialize an empty DataFrame to collect the imputed groups
    try:
        df_copy = df.drop(columns=['I21', 'I48', 'I50', 'dI21', 'dI48', 'dI50'], axis=1)
    except:
        df_copy = df

    # We will collect the group imputed dataframes here and concatenate them at the end
    imputed_dfs = []

    # Iterate over each group
    for group_name, group_data in df_copy.groupby(group_column):
        # Create an imputer object
        imputer = KNNImputer(n_neighbors=n_neighbors)

        # Select numeric columns for imputation
        numeric_cols = group_data.select_dtypes(include=[np.number]).columns.tolist()

        # Perform imputation
        group_data.loc[:, numeric_cols] = imputer.fit_transform(group_data[numeric_cols])

        # Append the imputed group data
        imputed_dfs.append(group_data)

    # Concatenate all the imputed group dataframes
    df_imputed = pd.concat(imputed_dfs, ignore_index=False)

    return df_imputed

In [341]:
def group_wise_imputation(X, group_column):
    # Iterate over each group defined by the 'group_column'
    for group, group_data in X.groupby(group_column):
        # Select only numeric columns for imputation, excluding the group column explicitly
        numeric_cols = group_data.select_dtypes(include=[np.number]).columns.tolist()
        if group_column in numeric_cols:
            numeric_cols.remove(group_column)  # Ensure the group column is not in the list

        for column in numeric_cols:
            upper_quartile = group_data[column].quantile(0.75)
            lower_quartile = group_data[column].quantile(0.25)
            IQR = upper_quartile - lower_quartile
            upper_whisker = upper_quartile + 1.5 * IQR
            lower_whisker = lower_quartile - 1.5 * IQR
            
            # Impute outliers with the median of the group
            median_value = group_data[column].median()
            group_data[column] = np.where((group_data[column] > upper_whisker) | 
                                        (group_data[column] < lower_whisker), 
                                        median_value, group_data[column])
        
        # Assign the corrected group data back to the main DataFrame
        X.loc[group_data.index, group_data.columns] = group_data

    return X


In [370]:
train_data = load_csv('../data/training_data.csv')
# train_data.loc[(train_data['Perform'] < 0.1) & (train_data['Perform'] > -0.1), 'Class'] = 0
# train_data.loc[(train_data['Perform'] < 0.1) & (train_data['Perform'] > -0.1), 'Perform'] = 0
train_data = drop_columns_with_excessive_nans(train_data, 200)

# Handle categorical variables - encoding the 'Group' column
le = LabelEncoder()
train_data['Group'] = le.fit_transform(train_data['Group'])
# cols = train_data.columns.tolist()
# cols = cols[1:] + cols[0:1]
# train_data = train_data[cols]

#### Filling misisng values
# train_data = train_data.interpolate()
# Replace NaN values with 0
# train_data = train_data.fillna(0)
# train_data = train_data.fillna(method='ffill')
# Fill NaN values with the mean of each column
# train_data = train_data.apply(lambda x: x.fillna(x.mean()), axis=0)

# Assume 'Class' is the target variable
X = train_data.drop('Class', axis=1)  # Features
y = train_data['Class'] # Target variable

# X = feature_transformations(X)
# X = group_wise_imputation(X, 'Group')
# X = group_wise_knn_imputation(X, 'Group', n_neighbors=5)
X = X.drop(columns=['Perform'], axis=1)
# X = X[feature_importances[feature_importances > 0.001].dropna()[0].index]

In [371]:
state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=state, stratify=y)
np.unique(y_train, return_counts=True)

(array([-1,  0,  1]), array([1239,  454, 1507]))

In [372]:
nan_counts = X.isna().sum().sum()
nan_counts

28933

: 

## Feature Selection

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

estimator = SVC(class_weight='balanced', kernel='linear')
selector = RFE(estimator, n_features_to_select=50, step=1, verbose=3)
selector = selector.fit(X_train, y_train)

# Selected features
selected_features = selector.get_support(indices=True)
print("Selected features:", selected_features)

In [139]:
# Filter the train and test sets to keep only selected features
X_train = X_train.loc[:, selector.support_]
X_test = X_test.loc[:, selector.support_]

In [None]:
# importances = model.feature_importances_

# final_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
# final_df.set_index('Importance')

# final_df = final_df.sort_values('Importance')

# # ax, fig = plt.subplots(figsize=(10, 10))
# final_df.plot(kind='barh', x='Feature', y='Importance', color='blue', edgecolor='black', figsize=(20, 20))

from sklearn.feature_selection import mutual_info_classif

importances = mutual_info_classif(X_train, y_train)
feature_importances = pd.Series(importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)
feature_importances.plot(kind='barh', figsize=(20, 20))

### Exhaustive Feature Selection


In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector

efs = ExhaustiveFeatureSelector(estimator=RandomForestClassifier(n_estimators=100,
                                                                n_jobs=-1),
                                min_features=1,
                                max_features=100,
                                scoring='accuracy',
                                cv=2)
efs = efs.fit(X_train, y_train)
selected_features = X_train.columns[list(efs.best_idx_)]
print(selected_features)


print(efs.best_score_)

In [None]:
from sklearn.feature_selection import RFE

model = RandomForestClassifier(random_state=state, n_estimators=100, class_weight='balanced', verbose=1)

selector = RFE(model, n_features_to_select=50, step=1)  # Select 5 features at a time
selector = selector.fit(X, y)

In [31]:
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

# Sampling Data

In [100]:
from sklearn.utils import resample
# Display original class distribution
print("Original class distribution:", np.bincount(y_train + 1))

# Separate majority and minority classes
X_train_minority = X_train[y_train == 0]
y_train_minority = y_train[y_train == 0]

X_train_majority = X_train[y_train != 0]
y_train_majority = y_train[y_train != 0]

Original class distribution: [2477  909 3014]


In [109]:
# Upsample minority class
X_upsampled, y_upsampled = resample(X_train_minority,
                                    y_train_minority,
                                    replace=True,  # sample with replacement
                                    n_samples=X_train_majority.shape[0],  # to match majority class
                                    random_state=123)  # reproducible results

# Combine the upsampled minority class with the majority class
X_train_balanced = np.vstack((X_train_majority, X_upsampled))
y_train_balanced = np.hstack((y_train_majority, y_upsampled))

# Shuffle the dataset to mix up minority and majority samples
indices = np.arange(X_train_balanced.shape[0])
np.random.shuffle(indices)
X_train_balanced = X_train_balanced[indices]
y_train_balanced = y_train_balanced[indices]


# Display new class distribution
print("New class distribution:", np.bincount(y_train_balanced + 1))
X_train = X_train_balanced
y_train = y_train_balanced

New class distribution: [2477 5491 3014]


In [105]:
# Downsample majority class
X_majority_downsampled, y_majority_downsampled = resample(X_train_majority,
                                                          y_train_majority,
                                                          replace=False,  # sample without replacement
                                                          n_samples=len(y_train_minority),  # match minority class
                                                          random_state=123)  # reproducible results

# Combine the downsampled majority class with the minority class
X_train_balanced = np.vstack((X_majority_downsampled, X_train_minority))
y_train_balanced = np.hstack((y_majority_downsampled, y_train_minority))

# Shuffle the dataset to mix up minority and majority samples
indices = np.arange(X_train_balanced.shape[0])
np.random.shuffle(indices)
X_train_balanced = X_train_balanced[indices]
y_train_balanced = y_train_balanced[indices]

# Display new class distribution
print("New class distribution:", np.bincount(y_train_balanced + 1))

# X_train = X_train_balanced
# y_train = y_train_balanced

New class distribution: [400 909 509]


# Training

### Model Selection

In [367]:
from sklearn.neighbors import KNeighborsClassifier

model = RandomForestClassifier(random_state=state, n_estimators=100, class_weight='balanced', verbose=1)
# model = SVC(random_state=state, class_weight='balanced')
# model = KNeighborsClassifier(n_neighbors=5)

In [368]:
# Initialize the Decision Tree Classifier
# model = DecisionTreeClassifier(random_state=42)
# model = GaussianNB()

# Initialize search
model.fit(X_train, y_train)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Predict on the test set
y_pred = model.predict(X_test)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [369]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Custom Error:", calculate_custom_error(y_pred, y_test))

Accuracy: 0.47020833333333334
Classification Report:
              precision    recall  f1-score   support

          -1       0.44      0.39      0.42      1857
           0       0.19      0.02      0.03       682
           1       0.49      0.67      0.57      2261

    accuracy                           0.47      4800
   macro avg       0.37      0.36      0.34      4800
weighted avg       0.43      0.47      0.43      4800

Confusion Matrix:
[[ 727   22 1108]
 [ 207   13  462]
 [ 710   34 1517]]
Custom Error: 0.9085416666666667


# Testing

In [329]:

test_data = load_csv('../data/test_data_no_target.csv')
test_data = test_data[X.columns]
# test_data = drop_columns_with_excessive_nans(test_data, 50)
le = LabelEncoder()
test_data['Group'] = le.fit_transform(test_data['Group'])


# Replace NaN values with 0
test_data = test_data.fillna(0) 
# test_data = test_data.interpolate()
# test_data = feature_transformations(test_data)
# scaler = StandardScaler()
# test_data = scaler.fit_transform(test_data)


predicts = model.predict(test_data)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [96]:
np.savetxt('predictions_prob_threshold_0.5.txt', np.array(results), fmt='%d', delimiter='\n')

In [330]:
np.savetxt('predicts_RF_margin.txt', np.array(predicts), fmt='%d', delimiter='\n')