**Import packages**

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

**Data Preprocessing**

In [10]:
# Read the labelled dataset and separate the target column from the features
data = pd.read_csv('labelled_dysx.csv')
X = data.drop(columns=['Label'])
y = data['Label']
columns = X.columns  # feature names, reused for the hand-made samples below

# Hand-made samples: each row becomes a one-row DataFrame carrying the feature names
sample_rows = [
    [0.5, 0.1, 0.2, 0.8, 0.3, 0.5],
    [0.7, 0.9, 0.4, 0.9, 0.3, 0.8],
    [0.1, 0.7, 0.2, 0.6, 0.9, 0.6],
    [0.3, 0.4, 0.5, 0.3, 0.3, 0.5],
]
test1, test2, test3, test4 = [
    pd.DataFrame([row], columns=columns) for row in sample_rows
]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# One zero-filled, five-slot prediction list per sample (a slot per model, filled in later)
label_1, label_2, label_3, label_4 = [[0] * 5 for _ in range(4)]

# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split and to the hand-made samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
test1, test2, test3, test4 = [
    sc.transform(sample) for sample in (test1, test2, test3, test4)
]

**Training model - Random Forest with GridSearchCV**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

# -------------------------
# Step 1: Data Preprocessing
# -------------------------

# NOTE(review): this step repeats the preprocessing already performed in the
# earlier "Data Preprocessing" cell; consider keeping a single copy.

# Read the labelled dataset and separate the target column from the features
data = pd.read_csv('labelled_dysx.csv')
X = data.drop(columns=['Label'])
y = data['Label']
columns = X.columns  # feature names, reused for the hand-made samples below

# Hand-made samples: each row becomes a one-row DataFrame carrying the feature names
sample_rows = [
    [0.5, 0.1, 0.2, 0.8, 0.3, 0.5],
    [0.7, 0.9, 0.4, 0.9, 0.3, 0.8],
    [0.1, 0.7, 0.2, 0.6, 0.9, 0.6],
    [0.3, 0.4, 0.5, 0.3, 0.3, 0.5],
]
test1, test2, test3, test4 = [
    pd.DataFrame([row], columns=columns) for row in sample_rows
]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# One zero-filled, five-slot prediction list per sample
label_1, label_2, label_3, label_4 = [[0] * 5 for _ in range(4)]

# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split and to the hand-made samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
test1, test2, test3, test4 = [
    sc.transform(sample) for sample in (test1, test2, test3, test4)
]

# -------------------------
# Step 2: Model Training and Evaluation
# -------------------------

# Random Forest with GridSearchCV for hyperparameter tuning.
# Each candidate n_estimators is cross-validated on the training split and
# ranked by macro-averaged F1; with GridSearchCV's default refit=True the best
# candidate is refitted on the whole training split, so rf_grid.predict(...)
# below uses that refitted model.
n_est = {'n_estimators': [10, 100, 500, 1000]}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=0), n_est, scoring='f1_macro')
rf_grid.fit(X_train, y_train)

# Print the best parameters
print('Best value of n_estimators for RandomForest model is:', rf_grid.best_params_)

# Evaluate model on test data and store error metrics.
# NOTE(review): mean absolute error treats the class labels as numbers, so it
# is only meaningful if the labels are ordinal - confirm against the dataset.
pred_rf_grid = rf_grid.predict(X_test)
test_error = round(mean_absolute_error(y_test, pred_rf_grid), 3)
test_f1 = round(f1_score(y_test, pred_rf_grid, average='macro'), 3)
test_accuracy = round(accuracy_score(y_test, pred_rf_grid), 3)

print('Metrics on Test Set:')
print(f'Mean Absolute Error: {test_error}')
print(f'F1 Score: {test_f1}')
print(f'Accuracy: {test_accuracy}')

# Make predictions on validation samples (each result is a one-element array)
ans_1 = rf_grid.predict(test1)
ans_2 = rf_grid.predict(test2)
ans_3 = rf_grid.predict(test3)
ans_4 = rf_grid.predict(test4)

# Slot of every label_N list that holds the RandomForest (GridSearch) prediction
RF_GRID_INDEX = 3

# Store predictions in the respective label lists. tolist() converts the NumPy
# scalar to a native Python value so the lists print as e.g. [0, 0, 0, 2, 0]
# instead of [0, 0, 0, np.int64(2), 0] under NumPy >= 2.
label_lists = (label_1, label_2, label_3, label_4)
for labels, ans in zip(label_lists, (ans_1, ans_2, ans_3, ans_4)):
    labels[RF_GRID_INDEX] = ans.tolist()[0]

# Print predictions for each validation sample
print('\nPredicted Labels for Validation Samples:')
for sample_name, labels in zip(('test1', 'test2', 'test3', 'test4'), label_lists):
    print(f'Labels for {sample_name}:', labels)

# Optional: Calculate and print F1 score and accuracy on the validation samples.
# The labels below are PLACEHOLDERS for demonstration purposes - replace them
# with the real labels of test1, test2, test3, test4 before trusting the
# numbers printed at the end of this cell.
actual_labels = [1, 0, 1, 0]
predictions = [labels[RF_GRID_INDEX] for labels in label_lists]

validation_f1 = round(f1_score(actual_labels, predictions, average='macro'), 3)
validation_accuracy = round(accuracy_score(actual_labels, predictions), 3)

# Previously these two metrics were computed and then silently discarded
print('\nMetrics on Validation Samples:')
print(f'F1 Score: {validation_f1}')
print(f'Accuracy: {validation_accuracy}')



Best value of n_estimators for RandomForest model is: {'n_estimators': 100}
Metrics on Test Set:
Mean Absolute Error: 0.04
F1 Score: 0.965
Accuracy: 0.96

Predicted Labels for Validation Samples:
Labels for test1: [0, 0, 0, 0, 0]
Labels for test2: [0, 0, 0, 2, 0]
Labels for test3: [0, 0, 0, 0, 0]
Labels for test4: [0, 0, 0, 1, 0]
