In [1]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import KNNImputer

import matplotlib.pyplot as plt
%matplotlib inline 

np.random.seed(0)

def load_csv(file_path):
    # Load the data
    data = pd.read_csv(file_path, delimiter=';')

    # Replace commas in numeric columns and convert them to floats
    data = data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
    return data

In [2]:
def drop_columns_with_excessive_nans(dataframe, threshold=200):
    """Drop columns from a DataFrame where the number of NaN values exceeds the specified threshold."""
    nan_counts = dataframe.isna().sum()
    columns_to_drop = nan_counts[nan_counts > threshold].index
    return dataframe.drop(columns=columns_to_drop)

## Data Loader

In [133]:
train_data = load_csv('../data/training_data.csv')

le = LabelEncoder()
train_data['Group'] = le.fit_transform(train_data['Group'])

# Cut outliers
top_quantiles = train_data.quantile(0.97)
outliers_top = (train_data > top_quantiles)

low_quantiles = train_data.quantile(0.03)
outliers_low = (train_data < low_quantiles)

train_data = train_data.mask(outliers_top, top_quantiles, axis=1)
train_data = train_data.mask(outliers_low, low_quantiles, axis=1)

In [136]:
train_data = train_data.groupby(['Group']).transform(lambda x: x.fillna(x.mean()))
train_data = train_data.fillna(0)


In [137]:
X = train_data.drop(columns=['Class', 'Perform']).iloc[:, :20]
y = pd.DataFrame(train_data['Perform'])

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Training 

In [139]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge



# Initialize and train the regressor
# model = RandomForestRegressor(n_estimators=10, random_state=42)
model = Ridge()

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# scaler = StandardScaler()
# y_train = scaler.fit_transform(y_train)
# y_test = scaler.transform(y_test)

model.fit(X_train, y_train)

In [140]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the mean squared error and R² score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R² Score:", r2)


Mean Squared Error: 0.019573058240432283
R² Score: 0.008359843664085154


: 

In [58]:
pd.DataFrame([y_pred, y_test])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599
0,0.166162,0.148337,-0.115815,0.095798,0.033458,0.190387,0.175568,0.039992,-0.066106,-0.095139,...,0.09524,0.128636,-0.043666,0.035407,-0.119868,0.026269,0.296182,0.080284,-0.005941,0.109091
1,0.152285,0.317457,-0.193163,0.146018,-0.028856,-0.044617,0.064821,0.153098,-0.228221,-0.196243,...,0.012605,-0.101496,0.13925,0.01434,0.177339,-0.139877,-0.023002,0.178263,-0.096864,-0.047704


In [60]:
test_data = load_csv('./test_data_no_target.csv')

# Fill missing values or drop rows/columns with missing values
# For example, to fill missing values with the mean:
test_data.fillna(0)

# Encode categorical variables if any, example using get_dummies:
test_data = pd.get_dummies(test_data)

  data = data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')


In [19]:
y_pred = model.predict(test_data)

In [20]:
np.savetxt('test_data_predictions.csv', y_pred, fmt='%f', newline='\n')

# Classify

In [32]:
import numpy as np
from sklearn.metrics import confusion_matrix

cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])
def calculate_custom_error(preds, gt, cost_matrix=cost_matrix):
    """
    Calculate a custom error metric based on a confusion matrix and a cost matrix.

    Args:
    preds (array-like): Predicted labels.
    gt (array-like): Ground truth (actual) labels.
    cost_matrix (numpy.ndarray): A matrix of costs associated with misclassifications.

    Returns:
    float: The calculated error metric.
    """
    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds)
    
    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")
    
    # Calculate weighted confusion matrix
    weighted_cm = cm * cost_matrix
    
    # Calculate the custom error
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")
    
    error = np.sum(weighted_cm) / total_samples
    return error


In [29]:
train_data = pd.read_csv('training_data.csv', delimiter=';')
train_data = train_data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
train_data = train_data[['Perform', 'Class']]

test_data = pd.read_csv('test_data_predictions.csv', header=None)

X = train_data.drop('Class', axis=1)  # Features
y = train_data['Class'] # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  train_data = train_data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')


In [30]:
model = SVC(random_state=42)

# Initialize search
model.fit(X_train, y_train)

# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


In [33]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Custom Error:", calculate_custom_error(y_pred, y_test))

Accuracy: 0.998125
Classification Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       619
           0       1.00      0.99      0.99       227
           1       1.00      1.00      1.00       754

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weighted avg       1.00      1.00      1.00      1600

Confusion Matrix:
[[619   0   0]
 [  1 224   2]
 [  0   0 754]]
Custom Error: 0.001875


In [34]:
predicts = model.predict(test_data)



In [35]:
np.savetxt('regression.txt', predicts, fmt='%d', newline='\n')