In [48]:
import warnings
# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning)
    
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np

# Step 1: Load Data
training_data = pd.read_csv('../data/training_data.csv', delimiter=';')
test_data = pd.read_csv('../data/test_data_no_target.csv', delimiter=';')

# Step 2: Convert Numerical Columns from strings to floats
def convert_to_float(df):
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['Group', 'Class', 'Perform']:
            df[col] = df[col].str.replace(',', '.').astype(float)
    return df

# Apply conversion to the training data
training_data = convert_to_float(training_data)
test_data = convert_to_float(test_data)

# Step 3: Handle Missing Values (fill with the median)
training_data.fillna(training_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

In [49]:
# Step 4: One-Hot Encoding for the 'Group' column
training_data = pd.get_dummies(training_data, columns=['Group'])
test_data = pd.get_dummies(test_data, columns=['Group'])

training_data = training_data[['I5', 'I8', 'I9', 'I18', 'I37', 'I38', 'I44', 'I47', 'I57', 'dI5',
        'dI6', 'dI23', 'dI25', 'dI28', 'dI35', 'dI40', 'dI42', 'dI46', 'dI47',
        'dI54', 'dI56', 'dI57', 'dI58', 'Class', 'Perform']]

# Ensure the test set has the same columns as the training set
missing_cols = set(training_data.columns) - set(test_data.columns) - {'Class', 'Perform'}
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[training_data.columns.drop(['Class', 'Perform'])]

In [50]:
import numpy as np
from sklearn.metrics import confusion_matrix

cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])
def calculate_custom_error(preds, gt, cost_matrix=cost_matrix):
    """
    Calculate a custom error metric based on a confusion matrix and a cost matrix.

    Args:
    preds (array-like): Predicted labels.
    gt (array-like): Ground truth (actual) labels.
    cost_matrix (numpy.ndarray): A matrix of costs associated with misclassifications.

    Returns:
    float: The calculated error metric.
    """
    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds)
    
    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")
    
    # Calculate weighted confusion matrix
    weighted_cm = cm * cost_matrix
    
    # Calculate the custom error
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")
    
    error = np.sum(weighted_cm) / total_samples
    return error


In [51]:
# Step 5: Separate features and target
X_train = training_data.drop(columns=['Class', 'Perform'])
y_train = training_data['Class']

# Step 6: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(test_data)

# Convert scaled data back to DataFrame for easier manipulation
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_train.columns)

# Step 7: Perform Exhaustive Feature Selection
# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.1, random_state=42)

# Initialize the Logistic Regression model
# model = LogisticRegression(max_iter=1000)
# model = SVC(kernel='linear')
model = RandomForestClassifier(n_estimators=100)

In [52]:
# Train the model on selected features
model.fit(X_train_split, y_train_split)
y_val_pred = model.predict(X_val_split)

In [53]:
# Evaluate the model
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, average='weighted')
recall = recall_score(y_val_split, y_val_pred, average='weighted')
f1 = f1_score(y_val_split, y_val_pred, average='weighted')
report = classification_report(y_val_split, y_val_pred)

print(f"Error: {calculate_custom_error(y_val_pred, y_val_split)}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(report)

Error: 0.93625
Accuracy: 0.455
Precision: 0.38341472672064775
Recall: 0.455
F1 Score: 0.41483851115720377
Classification Report:
              precision    recall  f1-score   support

          -1       0.41      0.42      0.41       297
           0       0.00      0.00      0.00       121
           1       0.49      0.63      0.55       382

    accuracy                           0.46       800
   macro avg       0.30      0.35      0.32       800
weighted avg       0.38      0.46      0.41       800



In [27]:
y_test_pred = model.predict(X_test_scaled)
np.savetxt('recursive_search.txt', y_test_pred.astype(int), fmt='%d', newline='\n')

In [1]:
import warnings

warnings.filterwarnings('ignore', category=FutureWarning, module='sklearn')

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import numpy as np
import joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Step 1: Load Data
training_data = pd.read_csv('../data/training_data.csv', delimiter=';')
test_data = pd.read_csv('../data/test_data_no_target.csv', delimiter=';')

# Step 2: Convert Numerical Columns from strings to floats
def convert_to_float(df):
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['Group', 'Class', 'Perform']:
            df[col] = df[col].str.replace(',', '.').astype(float)
    return df

# Apply conversion to the training data
training_data = convert_to_float(training_data)
test_data = convert_to_float(test_data)

# Step 3: One-Hot Encoding for the 'Group' column
training_data = pd.get_dummies(training_data, columns=['Group'])
test_data = pd.get_dummies(test_data, columns=['Group'])

# Ensure the test set has the same columns as the training set
missing_cols = set(training_data.columns) - set(test_data.columns) - {'Class', 'Perform'}
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[training_data.columns.drop(['Class', 'Perform'])]

# Step 4: Separate features and target
X_train = training_data.drop(columns=['Class', 'Perform'])
y_train = training_data['Class']

# Step 5: Handle Missing Values with KNN Imputer
# Apply ColumnTransformer to apply KNNImputer to numerical columns only
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['uint8']).columns.tolist()  # One-hot encoded columns

In [2]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', KNNImputer(n_neighbors=5), numerical_cols),
        ('cat', 'passthrough', categorical_cols)
    ],
    remainder='drop'
)

# Step 6: Scale the features
pipeline = Pipeline(steps=[
    ('imputer', preprocessor),
    ('scaler', StandardScaler())
])

X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(test_data)

# Convert processed data back to DataFrame for easier manipulation
X_train_processed = pd.DataFrame(X_train_processed, columns=numerical_cols + categorical_cols)
X_test_processed = pd.DataFrame(X_test_processed, columns=numerical_cols + categorical_cols)

# Step 7: Perform Sequential Feature Selection
# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model

In [3]:

model = RandomForestClassifier()

# Perform SFS
sfs = SFS(model, 
        k_features='best', 
        forward=True, 
        floating=False, 
        scoring='accuracy', 
        cv=5, 
        n_jobs=-1, 
        verbose=2)
sfs = sfs.fit(X_train_split, y_train_split)

# Print the selected features and their performance
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

TypeError: 'NoneType' object is not iterable