In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer


np.random.seed(0)

def load_csv(file_path, remove_outliners=False, filter_by_group=False):
    # Load the data
    data = pd.read_csv(file_path, delimiter=';')

    # Replace commas in numeric columns and convert them to floats
    data = data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
    # if 'Perform' in data.columns:
    #     data.drop('Perform', axis=1, inplace=True)
    return data

In [2]:
import numpy as np
from sklearn.metrics import confusion_matrix

cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])
def calculate_custom_error(preds, gt, cost_matrix=cost_matrix):
    """
    Calculate a custom error metric based on a confusion matrix and a cost matrix.

    Args:
    preds (array-like): Predicted labels.
    gt (array-like): Ground truth (actual) labels.
    cost_matrix (numpy.ndarray): A matrix of costs associated with misclassifications.

    Returns:
    float: The calculated error metric.
    """
    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds)
    
    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")
    
    # Calculate weighted confusion matrix
    weighted_cm = cm * cost_matrix
    
    # Calculate the custom error
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")
    
    error = np.sum(weighted_cm) / total_samples
    return error


In [3]:
def group_wise_knn_imputation(df, group_column, n_neighbors=5):
    # Initialize an empty DataFrame to collect the imputed groups
    try:
        df_copy = df.drop(columns=['I21', 'I48', 'I50', 'dI21', 'dI48', 'dI50'], axis=1)
    except:
        df_copy = df.copy()

    # We will collect the group imputed dataframes here and concatenate them at the end
    imputed_dfs = []

    # Iterate over each group
    for group_name, group_data in df_copy.groupby(group_column):
        # Create an imputer object
        imputer = KNNImputer(n_neighbors=n_neighbors)

        # Select numeric columns for imputation
        numeric_cols = group_data.select_dtypes(include=[np.number]).columns.tolist()

        # Perform imputation
        group_data.loc[:, numeric_cols] = imputer.fit_transform(group_data[numeric_cols])

        # Append the imputed group data
        imputed_dfs.append(group_data)

    # Concatenate all the imputed group dataframes
    df_imputed = pd.concat(imputed_dfs, ignore_index=False)

    return df_imputed

In [4]:
def group_wise_imputation(X, group_column):
    # Iterate over each group defined by the 'group_column'
    for group, group_data in X.groupby(group_column):
        # Select only numeric columns for imputation, excluding the group column explicitly
        numeric_cols = group_data.select_dtypes(include=[np.number]).columns.tolist()
        if group_column in numeric_cols:
            numeric_cols.remove(group_column)  # Ensure the group column is not in the list

        for column in numeric_cols:
            upper_quartile = group_data[column].quantile(0.75)
            lower_quartile = group_data[column].quantile(0.25)
            IQR = upper_quartile - lower_quartile
            upper_whisker = upper_quartile + 1.5 * IQR
            lower_whisker = lower_quartile - 1.5 * IQR
            
            # Impute outliers with the median of the group
            median_value = group_data[column].median()
            group_data[column] = np.where((group_data[column] > upper_whisker) | 
                                        (group_data[column] < lower_whisker), 
                                        median_value, group_data[column])
        
        # Assign the corrected group data back to the main DataFrame
        X.loc[group_data.index, group_data.columns] = group_data

    return X

In [5]:
def drop_columns_with_excessive_nans(dataframe, threshold=200):
    """Drop columns from a DataFrame where the number of NaN values exceeds the specified threshold."""
    nan_counts = dataframe.isna().sum()
    columns_to_drop = nan_counts[nan_counts > threshold].index
    return dataframe.drop(columns=columns_to_drop)


In [10]:
train_data = load_csv('../data/training_data.csv')
train_data = drop_columns_with_excessive_nans(train_data, threshold=200)


# Handle categorical variables - encoding the 'Group' column
le = LabelEncoder()
train_data['Group'] = le.fit_transform(train_data['Group'])

# Assume 'Class' is the target variable
X = train_data.drop('Class', axis=1)  # Features
y = train_data['Class'] # Target variable

# X = feature_transformations(X)
# X = group_wise_imputation(X, 'Group')
X = group_wise_knn_imputation(X, 'Group', n_neighbors=5)
X = group_wise_imputation(X, 'Group')
X = X.drop(columns=['Perform'], axis=1)


In [11]:
state = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state, stratify=y)
np.unique(y_train, return_counts=True)
X_train_group = X_train['Group']
X_test_group = X_test['Group']

In [12]:
X_test.shape

(1600, 52)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

estimator = SVC(kernel='linear')
selector = RFE(estimator, n_features_to_select=50, step=1, verbose=3)
selector = selector.fit(X_train, y_train)

# Selected features
selected_features = selector.get_support(indices=True)
print("Selected features:", selected_features)

X_train = X_train.iloc[:, selected_features]
X_test = X_test.iloc[:, selected_features]

In [14]:
X_train['Group'] = X_train_group
X_test['Group'] = X_test_group

In [15]:
models = {}

for group_name, group_data in X_train.groupby('Group'):
    # Initialize the model
    model = RandomForestClassifier(random_state=state, class_weight='balanced')
    
    y_train_group = y_train.values[np.where(X_train.values[:, 0] == group_name)[0]]
    
    # Fit the model
    model.fit(group_data, y_train_group)

    # Save the model
    models[group_name] = model

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required by RandomForestClassifier.

In [17]:
y_preds = []

for group_name, group_data in X_test.groupby('Group'):
    # Get the model
    model = models[group_name]
    
    # Get the ground truth labels
    y_test_group = y_test.values[np.where(X_test.values[:, 0] == group_name)[0]]
    
    # Get the predictions
    y_pred = model.predict(group_data)
    y_preds.extend(y_pred)
    
# Calculate the custom error
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_preds)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_preds))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_preds))
print("Custom Error:", calculate_custom_error(y_preds, y_test))

Accuracy: 0.435
Classification Report:
              precision    recall  f1-score   support

          -1       0.38      0.35      0.36       619
           0       0.10      0.01      0.02       227
           1       0.48      0.63      0.54       754

    accuracy                           0.43      1600
   macro avg       0.32      0.33      0.31      1600
weighted avg       0.38      0.43      0.40      1600

Confusion Matrix:
[[216  10 393]
 [ 93   2 132]
 [267   9 478]]
Custom Error: 0.9775


: 

In [61]:
train_data[train_data['Class'] == 0]['Group'].value_counts()

Group
3     170
7     155
5     154
0     118
6     116
8     103
10     94
9      82
4      67
2      45
1      32
Name: count, dtype: int64

In [62]:
train_data[train_data['Class'] == 1]['Group'].value_counts()

Group
3     705
5     535
7     497
0     413
6     387
10    360
8     285
9     259
4     129
2     128
1      70
Name: count, dtype: int64

In [64]:
train_data[train_data['Class'] == -1]['Group'].value_counts()

Group
3     499
7     367
5     354
0     349
6     326
10    299
8     289
9     244
4     187
2     124
1      58
Name: count, dtype: int64