# Ensemble Learning Models


## VOTING - HARD - ENSEMBLE- HYPERPARAMETER TUNING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

# Fixed seed shared by every stochastic estimator so that a Restart & Run All
# reproduces the same scores and the different voting variants stay comparable.
RANDOM_STATE = 42

# Load the pre-split datasets.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Scale the features, then combine three tree ensembles by majority (hard) vote.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('voting', VotingClassifier(estimators=[
        ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),      # Random Forest
        ('GB', GradientBoostingClassifier(random_state=RANDOM_STATE)),  # Gradient Boosting
        ('AB', AdaBoostClassifier(random_state=RANDOM_STATE))           # AdaBoost
    ], voting='hard'))
])

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'voting__RF__n_estimators': [50, 100, 200],
    'voting__GB__n_estimators': [50, 100, 200],
    'voting__AB__n_estimators': [50, 100, 200],
}

# 3-fold grid search on the training data, selecting on weighted F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training set by GridSearchCV).
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set.
y_validate_pred = best_model.predict(X_validate)

precision = precision_score(y_validate, y_validate_pred, average='weighted')
recall = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted = f1_score(y_validate, y_validate_pred, average='weighted')
mcc = matthews_corrcoef(y_validate, y_validate_pred)
f1_perclass = f1_score(y_validate, y_validate_pred, average=None)

print('Validation Results:')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Evaluate the same model on the held-out test set.
y_test_pred = best_model.predict(X_test)

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_test_pred)
test_f1_perclass = f1_score(y_test, y_test_pred, average=None)

print('\nTest Results:')
print(f'Precision (Weighted): {test_precision:.2f}')
print(f'Recall (Weighted): {test_recall:.2f}')
print(f'F1-Score (Weighted): {test_f1_weighted:.2f}')
print(f'F1-Score (Per Class): {test_f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {test_mcc:.2f}')

# Per-class precision / recall / F1 for the test set.
report = classification_report(y_test, y_test_pred)
print("\nClassification Report (Test Set Metrics):\n", report)


Fitting 3 folds for each of 27 candidates, totalling 81 fits




Validation Results:
Precision (Weighted): 0.63
Recall (Weighted): 0.64
F1-Score (Weighted): 0.55
F1-Score (Per Class): [0.125      0.77439024 0.16666667 0.21052632]
Matthews Correlation Coefficient (MCC): 0.19

Test Results:
Precision (Weighted): 0.60
Recall (Weighted): 0.61
F1-Score (Weighted): 0.51
F1-Score (Per Class): [0.21818182 0.7480315  0.17241379 0.13953488]
Matthews Correlation Coefficient (MCC): 0.18

Classification Report (Test Set Metrics):
               precision    recall  f1-score   support

           0       0.75      0.13      0.22        47
           1       0.61      0.97      0.75       294
           2       0.50      0.10      0.17        48
           3       0.56      0.08      0.14       113

    accuracy                           0.61       502
   macro avg       0.61      0.32      0.32       502
weighted avg       0.60      0.61      0.51       502



## VOTING - SOFT - ENSEMBLE- HYPERPARAMETER TUNING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

# Fixed seed shared by every stochastic estimator so that a Restart & Run All
# reproduces the same scores and the different voting variants stay comparable.
RANDOM_STATE = 42

# Load the pre-split datasets.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Scale the features, then combine three tree ensembles by averaging their
# predicted class probabilities (soft vote).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('voting', VotingClassifier(estimators=[
        ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),      # Random Forest
        ('GB', GradientBoostingClassifier(random_state=RANDOM_STATE)),  # Gradient Boosting
        ('AB', AdaBoostClassifier(random_state=RANDOM_STATE))           # AdaBoost
    ], voting='soft'))
])

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations.
param_grid = {
    'voting__RF__n_estimators': [50, 100, 200],
    'voting__GB__n_estimators': [50, 100, 200],
    'voting__AB__n_estimators': [50, 100, 200],
}

# 3-fold grid search on the training data, selecting on weighted F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training set by GridSearchCV).
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set.
y_validate_pred = best_model.predict(X_validate)

precision = precision_score(y_validate, y_validate_pred, average='weighted')
recall = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted = f1_score(y_validate, y_validate_pred, average='weighted')
mcc = matthews_corrcoef(y_validate, y_validate_pred)
f1_perclass = f1_score(y_validate, y_validate_pred, average=None)

print('Validation Results:')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Evaluate the same model on the held-out test set.
y_test_pred = best_model.predict(X_test)

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_test_pred)
test_f1_perclass = f1_score(y_test, y_test_pred, average=None)

print('\nTest Results:')
print(f'Precision (Weighted): {test_precision:.2f}')
print(f'Recall (Weighted): {test_recall:.2f}')
print(f'F1-Score (Weighted): {test_f1_weighted:.2f}')
print(f'F1-Score (Per Class): {test_f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {test_mcc:.2f}')

# Per-class precision / recall / F1 for the test set.
report = classification_report(y_test, y_test_pred)
print("\nClassification Report (Test Set Metrics):\n", report)


Fitting 3 folds for each of 27 candidates, totalling 81 fits




Validation Results:
Precision (Weighted): 0.62
Recall (Weighted): 0.65
F1-Score (Weighted): 0.58
F1-Score (Per Class): [0.17142857 0.78125    0.20408163 0.3015873 ]
Matthews Correlation Coefficient (MCC): 0.24

Test Results:
Precision (Weighted): 0.59
Recall (Weighted): 0.61
F1-Score (Weighted): 0.53
F1-Score (Per Class): [0.33333333 0.73854447 0.14035088 0.22068966]
Matthews Correlation Coefficient (MCC): 0.19

Classification Report (Test Set Metrics):
               precision    recall  f1-score   support

           0       0.77      0.21      0.33        47
           1       0.61      0.93      0.74       294
           2       0.44      0.08      0.14        48
           3       0.50      0.14      0.22       113

    accuracy                           0.61       502
   macro avg       0.58      0.34      0.36       502
weighted avg       0.59      0.61      0.53       502



## VOTING - HARD - ENSEMBLE- NO TUNING

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# The three tree ensembles used below must be imported here: without them this
# cell only runs if an earlier cell happened to import them first.
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import classification_report

# Fixed seed shared by every stochastic estimator so that a Restart & Run All
# reproduces the same scores and the different voting variants stay comparable.
RANDOM_STATE = 42

# Load the pre-split datasets.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Scale the features, then combine three tree ensembles (library-default
# hyperparameters) by majority (hard) vote.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('voting', VotingClassifier(estimators=[
        ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),      # Random Forest
        ('GB', GradientBoostingClassifier(random_state=RANDOM_STATE)),  # Gradient Boosting
        ('AB', AdaBoostClassifier(random_state=RANDOM_STATE))           # AdaBoost
    ], voting='hard'))
])

# Fit the pipeline on the training data.
pipeline.fit(X_train, y_train)

# Evaluate on the validation set.
y_validate_pred = pipeline.predict(X_validate)

precision = precision_score(y_validate, y_validate_pred, average='weighted')
recall = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted = f1_score(y_validate, y_validate_pred, average='weighted')
mcc = matthews_corrcoef(y_validate, y_validate_pred)
f1_perclass = f1_score(y_validate, y_validate_pred, average=None)

print('Validation Results:')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Evaluate on the held-out test set.
y_test_pred = pipeline.predict(X_test)

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_test_pred)
test_f1_perclass = f1_score(y_test, y_test_pred, average=None)

print('\nTest Results:')
print(f'Precision (Weighted): {test_precision:.2f}')
print(f'Recall (Weighted): {test_recall:.2f}')
print(f'F1-Score (Weighted): {test_f1_weighted:.2f}')
print(f'F1-Score (Per Class): {test_f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {test_mcc:.2f}')

# Per-class precision / recall / F1 for the test set.
report = classification_report(y_test, y_test_pred)
print("\nClassification Report (Test Set Metrics):\n", report)




Validation Results:
Precision (Weighted): 0.58
Recall (Weighted): 0.63
F1-Score (Weighted): 0.52
F1-Score (Per Class): [0.         0.77198212 0.17391304 0.11764706]
Matthews Correlation Coefficient (MCC): 0.15

Test Results:
Precision (Weighted): 0.67
Recall (Weighted): 0.61
F1-Score (Weighted): 0.49
F1-Score (Per Class): [0.11764706 0.74711168 0.1509434  0.09917355]
Matthews Correlation Coefficient (MCC): 0.17

Classification Report (Test Set Metrics):
               precision    recall  f1-score   support

           0       0.75      0.06      0.12        47
           1       0.60      0.99      0.75       294
           2       0.80      0.08      0.15        48
           3       0.75      0.05      0.10       113

    accuracy                           0.61       502
   macro avg       0.73      0.30      0.28       502
weighted avg       0.67      0.61      0.49       502



## VOTING - SOFT - ENSEMBLE - NO TUNING

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# The three tree ensembles used below must be imported here: without them this
# cell only runs if an earlier cell happened to import them first.
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import classification_report

# Fixed seed shared by every stochastic estimator so that a Restart & Run All
# reproduces the same scores and the different voting variants stay comparable.
RANDOM_STATE = 42

# Load the pre-split datasets.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Scale the features, then combine three tree ensembles (library-default
# hyperparameters) by averaging their predicted class probabilities (soft vote).
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('voting', VotingClassifier(estimators=[
        ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),      # Random Forest
        ('GB', GradientBoostingClassifier(random_state=RANDOM_STATE)),  # Gradient Boosting
        ('AB', AdaBoostClassifier(random_state=RANDOM_STATE))           # AdaBoost
    ], voting='soft'))
])

# Fit the pipeline on the training data.
pipeline.fit(X_train, y_train)

# Evaluate on the validation set.
y_validate_pred = pipeline.predict(X_validate)

precision = precision_score(y_validate, y_validate_pred, average='weighted')
recall = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted = f1_score(y_validate, y_validate_pred, average='weighted')
mcc = matthews_corrcoef(y_validate, y_validate_pred)
f1_perclass = f1_score(y_validate, y_validate_pred, average=None)

print('Validation Results:')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Evaluate on the held-out test set.
y_test_pred = pipeline.predict(X_test)

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_test_pred)
test_f1_perclass = f1_score(y_test, y_test_pred, average=None)

print('\nTest Results:')
print(f'Precision (Weighted): {test_precision:.2f}')
print(f'Recall (Weighted): {test_recall:.2f}')
print(f'F1-Score (Weighted): {test_f1_weighted:.2f}')
print(f'F1-Score (Per Class): {test_f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {test_mcc:.2f}')

# Per-class precision / recall / F1 for the test set.
report = classification_report(y_test, y_test_pred)
print("\nClassification Report (Test Set Metrics):\n", report)




Validation Results:
Precision (Weighted): 0.62
Recall (Weighted): 0.64
F1-Score (Weighted): 0.56
F1-Score (Per Class): [0.11764706 0.77488515 0.13333333 0.27118644]
Matthews Correlation Coefficient (MCC): 0.21

Test Results:
Precision (Weighted): 0.64
Recall (Weighted): 0.61
F1-Score (Weighted): 0.52
F1-Score (Per Class): [0.22222222 0.74901445 0.18518519 0.17777778]
Matthews Correlation Coefficient (MCC): 0.20

Classification Report (Test Set Metrics):
               precision    recall  f1-score   support

           0       0.86      0.13      0.22        47
           1       0.61      0.97      0.75       294
           2       0.83      0.10      0.19        48
           3       0.55      0.11      0.18       113

    accuracy                           0.61       502
   macro avg       0.71      0.33      0.33       502
weighted avg       0.64      0.61      0.52       502



## VOTING - HARD - Basic - HYPERPARAMETER TUNING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier

# Fixed seed for the stochastic estimators (decision tree tie-breaking, SVC
# probability calibration) so that a Restart & Run All reproduces the scores.
RANDOM_STATE = 42

# Load the pre-split datasets.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Create a pipeline for preprocessing and model
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaler (although data is already scaled, retained for consistency)
    ('voting', VotingClassifier(estimators=[
        ('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)),
        # probability=True is kept from the original; NOTE(review): it is only
        # needed if voting is switched to 'soft' - confirm whether it can be dropped.
        ('SVC', SVC(gamma='auto', probability=True, random_state=RANDOM_STATE)),
        ('DTC', DecisionTreeClassifier(random_state=RANDOM_STATE)),  # Decision Tree Classifier
        ('KNN', KNeighborsClassifier(n_neighbors=5)),  # K-Nearest Neighbors with default k=5
        ('NB', GaussianNB())  # Naive Bayes
    ], voting='hard'))  # Using hard voting (majority rule)
])

# Hyperparameter grid: 5 x 5 x 4 x 4 x 2 = 800 candidate combinations.
param_grid = {
    'voting__LR__C': [0.01, 0.1, 1, 10, 100],  # Logistic Regression regularization parameter
    'voting__SVC__C': [0.01, 0.1, 1, 10, 100],  # SVC regularization parameter
    'voting__DTC__max_depth': [None, 5, 10, 15],  # Decision Tree max depth
    'voting__KNN__n_neighbors': [3, 5, 7, 9],  # Number of neighbors for KNN
    'voting__KNN__weights': ['uniform', 'distance'],  # KNN weight function
    # GaussianNB is left at its defaults (no parameters are searched for it).
}

# 3-fold grid search on the training data, selecting on weighted F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training set by GridSearchCV).
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set.
y_validate_pred = best_model.predict(X_validate)

precision = precision_score(y_validate, y_validate_pred, average='weighted')
recall = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted = f1_score(y_validate, y_validate_pred, average='weighted')
mcc = matthews_corrcoef(y_validate, y_validate_pred)
f1_perclass = f1_score(y_validate, y_validate_pred, average=None)

print('Validation Results:')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Evaluate the same model on the held-out test set.
y_test_pred = best_model.predict(X_test)

test_precision = precision_score(y_test, y_test_pred, average='weighted')
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1_weighted = f1_score(y_test, y_test_pred, average='weighted')
test_mcc = matthews_corrcoef(y_test, y_test_pred)
test_f1_perclass = f1_score(y_test, y_test_pred, average=None)

print('\nTest Results:')
print(f'Precision (Weighted): {test_precision:.2f}')
print(f'Recall (Weighted): {test_recall:.2f}')
print(f'F1-Score (Weighted): {test_f1_weighted:.2f}')
print(f'F1-Score (Per Class): {test_f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {test_mcc:.2f}')

# Per-class precision / recall / F1 for the test set.
report = classification_report(y_test, y_test_pred)
print("\nClassification Report (Test Set Metrics):\n", report)


Fitting 3 folds for each of 800 candidates, totalling 2400 fits




Validation Results:
Precision (Weighted): 0.55
Recall (Weighted): 0.63
F1-Score (Weighted): 0.51
F1-Score (Per Class): [0.12121212 0.77037037 0.17391304 0.02083333]
Matthews Correlation Coefficient (MCC): 0.12

Test Results:
Precision (Weighted): 0.45
Recall (Weighted): 0.59
F1-Score (Weighted): 0.45
F1-Score (Per Class): [0.07272727 0.73751601 0.1509434  0.        ]
Matthews Correlation Coefficient (MCC): 0.08

Classification Report (Test Set Metrics):
               precision    recall  f1-score   support

           0       0.25      0.04      0.07        47
           1       0.59      0.98      0.74       294
           2       0.80      0.08      0.15        48
           3       0.00      0.00      0.00       113

    accuracy                           0.59       502
   macro avg       0.41      0.28      0.24       502
weighted avg       0.45      0.59      0.45       502



## VOTING - HARD - Basic - NO TUNING

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize

# Extra estimator imports for this cell, gathered in one place instead of
# being scattered between the statements that use them.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

# Fixed seed for the stochastic estimators (decision tree tie-breaking, SVC
# probability calibration) so that a Restart & Run All reproduces the scores.
RANDOM_STATE = 42

# Load the pre-split datasets; the last column of each file is used as the target.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column but the last; the target is the last column.
# Rows with any missing feature value are dropped, and the target is
# re-aligned to the surviving rows by index label.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

# The validation split is prepared for parity with the other cells, but this
# cell only scores the test split.
X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Non-ensemble base estimators, all at (near-)default hyperparameters.
estimator = [
    ('LR', LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=200)),
    ('SVC', SVC(gamma='auto', probability=True, random_state=RANDOM_STATE)),
    ('DTC', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('KNN', KNeighborsClassifier()),
    ('GNB', GaussianNB()),
    ('Ridge', RidgeClassifier()),
]

# Majority-rule (hard) voting over the base estimators.
ensemble_model = VotingClassifier(estimators=estimator, voting='hard')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Predict using the test set
y_pred = ensemble_model.predict(X_test)

# Precision, Recall, F1-Weighted, and MCC.
# zero_division=0 keeps the value sklearn already substitutes (0.0) for classes
# that are never predicted, but without emitting UndefinedMetricWarning.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)

# F1-Perclass
f1_perclass = f1_score(y_test, y_pred, average=None)

# AUC is intentionally not reported here: it needs class probabilities, and
# this ensemble only produces hard class labels.

# Classification report for per-class metrics
report = classification_report(y_test, y_pred, zero_division=0)

# Output results
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')
print("\nClassification Report (Per Class Metrics):\n", report)




Precision (Weighted): 0.34
Recall (Weighted): 0.58
F1-Score (Weighted): 0.43
F1-Score (Per Class): [0.         0.73392182 0.         0.        ]
Matthews Correlation Coefficient (MCC): -0.04

Classification Report (Per Class Metrics):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        47
           1       0.58      0.99      0.73       294
           2       0.00      0.00      0.00        48
           3       0.00      0.00      0.00       113

    accuracy                           0.58       502
   macro avg       0.15      0.25      0.18       502
weighted avg       0.34      0.58      0.43       502



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## ENSEMBLE - MIXED - HYPERPARAMETER TUNED

In [None]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize

# Hyperparameter tuning function using GridSearchCV
def tune_model(model, param_grid, X_train, y_train, scoring='accuracy', cv=3, n_jobs=-1):
    """Grid-search ``model`` over ``param_grid`` and return the best estimator.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) to tune.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training features and targets used for the cross-validated search.
    scoring : str or callable, default 'accuracy'
        Model-selection metric. The default preserves the original behaviour;
        pass e.g. ``'f1_weighted'`` to select on the metric the voting cells use.
    cv : int or CV splitter, default 3
        Cross-validation strategy.
    n_jobs : int, default -1
        Number of parallel jobs (-1 uses all available cores).

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Get a list of base models with scaling and hyperparameter tuning
def get_models(X_train, y_train):
    """Build the list of ``(name, pipeline)`` base models for the ensemble.

    Every estimator is wrapped in a ``StandardScaler`` -> ``model`` pipeline.
    All of them except Gaussian Naive Bayes are grid-searched via
    ``tune_model`` on ``(X_train, y_train)``; Naive Bayes is returned as an
    unfitted pipeline.
    """
    def scaled(estimator):
        # Two-step pipeline: standardise the features, then apply the estimator.
        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    def boosting_grid():
        # Grid shared (by value) by the three gradient-boosting libraries.
        return {
            'model__n_estimators': [100, 200],
            'model__learning_rate': [0.01, 0.1],
            'model__max_depth': [3, 5],
        }

    # (name, estimator, grid); a grid of None means "do not tune".
    candidates = [
        ('lr', LogisticRegression(), {'model__C': [0.1, 1, 10]}),
        ('cart', DecisionTreeClassifier(), {'model__max_depth': [5, 10, 20]}),
        ('bayes', GaussianNB(), None),
        ('rf', RandomForestClassifier(),
         {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20]}),
        ('gb', GradientBoostingClassifier(), boosting_grid()),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'), boosting_grid()),
        ('lgb', LGBMClassifier(), boosting_grid()),
    ]

    prepared = []
    for label, estimator, grid in candidates:
        pipe = scaled(estimator)
        if grid is not None:
            pipe = tune_model(pipe, grid, X_train, y_train)
        prepared.append((label, pipe))
    return prepared

# Evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
    """Fit every base model and return their validation accuracies.

    ``models`` is an iterable of ``(name, estimator)`` pairs. Each estimator is
    fitted on ``(X_train, y_train)`` and scored on ``(X_val, y_val)``. The
    returned list holds one accuracy per model, in input order, for use as
    that model's weight.
    """
    def _fit_and_score(estimator):
        # Train, predict on the validation split, and report plain accuracy.
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_val)
        return accuracy_score(y_val, predictions)

    return [_fit_and_score(estimator) for _label, estimator in models]

# Evaluate the ensemble model
def evaluate_ensemble(ensemble, X_test, y_test):
    """Print classification metrics for a fitted ensemble on a held-out set.

    Parameters
    ----------
    ensemble : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test : features to score.
    y_test : true labels for ``X_test``.

    Prints accuracy, weighted precision / recall / F1, per-class F1,
    macro one-vs-rest AUC and the Matthews correlation coefficient.
    Returns ``None``.
    """
    # Predictions and probabilities
    yhat = ensemble.predict(X_test)
    y_prob = ensemble.predict_proba(X_test)

    # Metrics
    precision = precision_score(y_test, yhat, average='weighted')
    recall = recall_score(y_test, yhat, average='weighted')
    f1_weighted = f1_score(y_test, yhat, average='weighted')
    f1_perclass = f1_score(y_test, yhat, average=None)
    mcc = matthews_corrcoef(y_test, yhat)

    # AUC: binarize with the ensemble's own class order so that indicator
    # column i matches column i of predict_proba. A plain set() of the test
    # labels has no guaranteed order and drops classes absent from y_test,
    # which would silently misalign (or mis-shape) the two matrices.
    classes = getattr(ensemble, 'classes_', None)
    if classes is None:
        # Fallback for estimators without classes_: use a deterministic order.
        classes = sorted(set(y_test))
    y_test_bin = label_binarize(y_test, classes=classes)
    auc = roc_auc_score(y_test_bin, y_prob, average='macro', multi_class='ovr')

    # Accuracy
    accuracy = accuracy_score(y_test, yhat)

    # Print metrics
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision (Weighted): {precision:.2f}')
    print(f'Recall (Weighted): {recall:.2f}')
    print(f'F1-Score (Weighted): {f1_weighted:.2f}')
    print(f'F1-Score (Per Class): {f1_perclass}')
    print(f'AUC: {auc:.2f}')
    print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')

# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# In every split the features are all columns but the last and the target is
# the last column. Rows with any missing feature are dropped, and the target
# is re-aligned on the surviving row labels.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Tune the base learners on the training split.
models = get_models(X_train, y_train)

# Validation accuracy of each tuned learner doubles as its voting weight.
scores = evaluate_models(models, X_train, X_validate, y_train, y_validate)
print("Model Scores (Used as Weights):", scores)

# Soft-voting ensemble weighted by those validation accuracies.
ensemble = VotingClassifier(
    estimators=models,
    voting='soft',
    weights=scores,
)

# Fit on the training split, then report metrics on the held-out test split.
ensemble.fit(X_train, y_train)
evaluate_ensemble(ensemble, X_test, y_test)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002483 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.500773
[LightGBM] [Info] Start training from score -0.459947
[LightGBM] [Info] Start training from score -2.475834
[LightGBM] [Info] Start training from score -1.596678


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.500773
[LightGBM] [Info] Start training from score -0.459947
[LightGBM] [Info] Start training from score -2.475834
[LightGBM] [Info] Start training from score -1.596678
Model Scores (Used as Weights): [0.6235294117647059, 0.6070588235294118, 0.5764705882352941, 0.6470588235294118, 0.6258823529411764, 0.6282352941176471, 0.6423529411764706]


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.500773
[LightGBM] [Info] Start training from score -0.459947
[LightGBM] [Info] Start training from score -2.475834
[LightGBM] [Info] Start training from score -1.596678
Accuracy: 60.36%
Precision (Weighted): 0.62
Recall (Weighted): 0.60
F1-Score (Weighted): 0.49
F1-Score (Per Class): [0.11764706 0.74580645 0.18518519 0.09677419]
AUC: 0.64
Matthews Correlation Coefficient (MCC): 0.16


In [None]:
import pandas as pd
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Function to tune models
def tune_model(model, params, X_train, y_train):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    The search uses 5-fold cross-validation scored by accuracy and runs on
    all available cores (``n_jobs=-1``).
    """
    search = GridSearchCV(
        model,
        params,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)
    return search.best_estimator_

# Get base models for Bagging with tuning
def get_bagging_models(X_train, y_train):
    """Tune one scaler+classifier pipeline per base learner.

    Each classifier is wrapped in ``Pipeline([('scaler', StandardScaler()),
    ('model', clf)])`` and handed to ``tune_model`` with its own grid.

    Returns
    -------
    list of (str, estimator)
        ``(name, tuned_pipeline)`` pairs in the order
        cart, lr, rf, gb, xgb, lgb, etc, knn.
    """
    def scaled(estimator):
        # Every candidate shares the same preprocessing step.
        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    # (name, untuned classifier, grid keyed on the pipeline's 'model' step)
    candidates = [
        ('cart', DecisionTreeClassifier(),
         {'model__max_depth': [5, 10, 20]}),
        ('lr', LogisticRegression(),
         {'model__C': [0.1, 1, 10]}),
        ('rf', RandomForestClassifier(),
         {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20]}),
        ('gb', GradientBoostingClassifier(),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        # NOTE(review): the recorded output reports use_label_encoder as an
        # unused XGBoost parameter; it is kept so behaviour is unchanged.
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        ('lgb', LGBMClassifier(),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        ('etc', ExtraTreesClassifier(),
         {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20]}),
        ('knn', KNeighborsClassifier(),
         {'model__n_neighbors': [3, 5, 7], 'model__weights': ['uniform', 'distance']}),
    ]

    return [
        (label, tune_model(scaled(estimator), grid, X_train, y_train))
        for label, estimator, grid in candidates
    ]

# Define the bagging ensemble
def get_bagging_ensemble(base_model, n_estimators=10, random_state=42):
    """Wrap ``base_model`` in an unfitted ``BaggingClassifier``.

    Parameters
    ----------
    base_model : estimator used as the bagging base estimator.
    n_estimators : int, default 10
        Number of bagged copies of ``base_model``.
    random_state : int, default 42
        Seed forwarded to ``BaggingClassifier``.

    Returns
    -------
    BaggingClassifier
        The configured, not yet fitted, ensemble.
    """
    # base_model stays positional: its keyword name differs between
    # scikit-learn releases.
    return BaggingClassifier(base_model, n_estimators=n_estimators, random_state=random_state)

# Evaluate the bagging ensemble
def evaluate_bagging_ensemble(bagging_ensemble, X_train_full, y_train_full, X_test, y_test):
    """Fit a bagging ensemble and print its test-set metrics.

    Parameters
    ----------
    bagging_ensemble : unfitted ensemble; fitted in place on the full
        training data.
    X_train_full, y_train_full : features and labels used for fitting.
    X_test, y_test : held-out features and labels used for scoring.

    Prints accuracy, weighted precision / recall / F1, per-class F1 and MCC.
    Returns ``None``.
    """
    bagging_ensemble.fit(X_train_full, y_train_full)
    yhat = bagging_ensemble.predict(X_test)

    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(y_test, yhat)
    precision = precision_score(y_test, yhat, **weighted)
    recall = recall_score(y_test, yhat, **weighted)
    f1_weighted = f1_score(y_test, yhat, **weighted)
    f1_perclass = f1_score(y_test, yhat, average=None)
    mcc = matthews_corrcoef(y_test, yhat)

    report = (
        f'Accuracy: {accuracy * 100:.2f}%',
        f'Precision (Weighted): {precision:.2f}',
        f'Recall (Weighted): {recall:.2f}',
        f'F1-Score (Weighted): {f1_weighted:.2f}',
        f'F1-Score (Per Class): {f1_perclass}',
        f'Matthews Correlation Coefficient (MCC): {mcc:.2f}',
    )
    for line in report:
        print(line)


# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are all columns but the last; the target is the last column.
# Rows with a missing feature are dropped and the target is re-aligned on
# the surviving row labels.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Final fitting set: training rows followed by validation rows.
X_train_full = pd.concat([X_train, X_validate], axis=0)
y_train_full = pd.concat([y_train, y_validate], axis=0)

# Hyperparameters are tuned on the training split only ...
models = get_bagging_models(X_train, y_train)

# ... while each bagged ensemble is fitted on train + validation and scored
# on the test split.
for name, model in models:
    print(f'\nEvaluating Bagging for {name}...')
    bagging_ensemble = get_bagging_ensemble(model)
    evaluate_bagging_ensemble(
        bagging_ensemble,
        X_train_full,
        y_train_full,
        X_test,
        y_test,
    )


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.500773
[LightGBM] [Info] Start training from score -0.459947
[LightGBM] [Info] Start training from score -2.475834
[LightGBM] [Info] Start training from score -1.596678

Evaluating Bagging for cart...
Accuracy: 58.96%
Precision (Weighted): 0.50
Recall (Weighted): 0.59
F1-Score (Weighted): 0.44
F1-Score (Per Class): [0.         0.74148802 0.         0.03448276]
Matthews Correlation Coefficient (MCC): 0.07

Evaluating Bagging for lr...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 58.17%
Precision (Weighted): 0.34
Recall (Weighted): 0.58
F1-Score (Weighted): 0.43
F1-Score (Per Class): [0.         0.73737374 0.         0.        ]
Matthews Correlation Coefficient (MCC): -0.00

Evaluating Bagging for rf...
Accuracy: 62.95%
Precision (Weighted): 0.66
Recall (Weighted): 0.63
F1-Score (Weighted): 0.55
F1-Score (Per Class): [0.3        0.75797872 0.18518519 0.24637681]
Matthews Correlation Coefficient (MCC): 0.26

Evaluating Bagging for gb...
Accuracy: 62.35%
Precision (Weighted): 0.67
Recall (Weighted): 0.62
F1-Score (Weighted): 0.53
F1-Score (Per Class): [0.25454545 0.7565445  0.1509434  0.1969697 ]
Matthews Correlation Coefficient (MCC): 0.24

Evaluating Bagging for xgb...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 61.95%
Precision (Weighted): 0.64
Recall (Weighted): 0.62
F1-Score (Weighted): 0.52
F1-Score (Per Class): [0.28070175 0.7542707  0.18518519 0.16666667]
Matthews Correlation Coefficient (MCC): 0.23

Evaluating Bagging for lgb...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2839, number of used features: 10
[LightGBM] [Info] Start training from score -2.553044
[LightGBM] [Info] Start training from score -0.480983
[LightGBM] [Info] Start training from score -2.429746
[LightGBM] [Info] Start training from score -1.532842
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000331 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140

In [None]:
import pandas as pd
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are all columns but the last; the target is the last column.
# Rows with a missing feature are dropped and the target is re-aligned on
# the surviving row labels.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Function to tune models
def tune_model(model, params, X_train, y_train):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    The search uses 5-fold cross-validation scored by accuracy and runs on
    all available cores (``n_jobs=-1``).
    """
    search = GridSearchCV(
        model,
        params,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    ).fit(X_train, y_train)
    return search.best_estimator_

# Get base models for Bagging with tuning
def get_bagging_models(X_train, y_train):
    """Tune one scaler+classifier pipeline per base learner.

    Each classifier is wrapped in ``Pipeline([('scaler', StandardScaler()),
    ('model', clf)])`` and handed to ``tune_model`` with its own grid.

    Returns
    -------
    list of (str, estimator)
        ``(name, tuned_pipeline)`` pairs in the order
        cart, lr, rf, gb, xgb, lgb, etc, knn.
    """
    def scaled(estimator):
        # Every candidate shares the same preprocessing step.
        return Pipeline([('scaler', StandardScaler()), ('model', estimator)])

    # (name, untuned classifier, grid keyed on the pipeline's 'model' step)
    candidates = [
        ('cart', DecisionTreeClassifier(),
         {'model__max_depth': [5, 10, 20]}),
        ('lr', LogisticRegression(),
         {'model__C': [0.1, 1, 10]}),
        ('rf', RandomForestClassifier(),
         {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20]}),
        ('gb', GradientBoostingClassifier(),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        # NOTE(review): the recorded output reports use_label_encoder as an
        # unused XGBoost parameter; it is kept so behaviour is unchanged.
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        ('lgb', LGBMClassifier(),
         {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]}),
        ('etc', ExtraTreesClassifier(),
         {'model__n_estimators': [100, 200], 'model__max_depth': [10, 20]}),
        ('knn', KNeighborsClassifier(),
         {'model__n_neighbors': [3, 5, 7], 'model__weights': ['uniform', 'distance']}),
    ]

    return [
        (label, tune_model(scaled(estimator), grid, X_train, y_train))
        for label, estimator, grid in candidates
    ]

# Define the bagging ensemble
def get_bagging_ensemble(base_model, n_estimators=10, random_state=42):
    """Wrap ``base_model`` in an unfitted ``BaggingClassifier``.

    Parameters
    ----------
    base_model : estimator used as the bagging base estimator.
    n_estimators : int, default 10
        Number of bagged copies of ``base_model``.
    random_state : int, default 42
        Seed forwarded to ``BaggingClassifier``.

    Returns
    -------
    BaggingClassifier
        The configured, not yet fitted, ensemble.
    """
    # base_model stays positional: its keyword name differs between
    # scikit-learn releases.
    return BaggingClassifier(base_model, n_estimators=n_estimators, random_state=random_state)

# Evaluate the bagging ensemble
def evaluate_bagging_ensemble(bagging_ensemble, X_train, y_train, X_test, y_test):
    """Fit a bagging ensemble and print its test-set metrics.

    Parameters
    ----------
    bagging_ensemble : unfitted ensemble; fitted in place on the training data.
    X_train, y_train : features and labels used for fitting.
    X_test, y_test : held-out features and labels used for scoring.

    Prints accuracy, weighted precision / recall / F1, per-class F1 and MCC.
    Returns ``None``.
    """
    bagging_ensemble.fit(X_train, y_train)
    yhat = bagging_ensemble.predict(X_test)

    weighted = {'average': 'weighted'}
    accuracy = accuracy_score(y_test, yhat)
    precision = precision_score(y_test, yhat, **weighted)
    recall = recall_score(y_test, yhat, **weighted)
    f1_weighted = f1_score(y_test, yhat, **weighted)
    f1_perclass = f1_score(y_test, yhat, average=None)
    mcc = matthews_corrcoef(y_test, yhat)

    report = (
        f'Accuracy: {accuracy * 100:.2f}%',
        f'Precision (Weighted): {precision:.2f}',
        f'Recall (Weighted): {recall:.2f}',
        f'F1-Score (Weighted): {f1_weighted:.2f}',
        f'F1-Score (Per Class): {f1_perclass}',
        f'Matthews Correlation Coefficient (MCC): {mcc:.2f}',
    )
    for line in report:
        print(line)

# Tune every base learner on the training split.
models = get_bagging_models(X_train, y_train)

# Bag each tuned learner, fit it on the training split and report its
# test-set metrics.
for name, model in models:
    print(f'\nEvaluating Bagging for {name}...')
    bagging_ensemble = get_bagging_ensemble(model)
    evaluate_bagging_ensemble(
        bagging_ensemble,
        X_train,
        y_train,
        X_test,
        y_test,
    )


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1403
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.500773
[LightGBM] [Info] Start training from score -0.459947
[LightGBM] [Info] Start training from score -2.475834
[LightGBM] [Info] Start training from score -1.596678

Evaluating Bagging for cart...
Accuracy: 59.16%
Precision (Weighted): 0.52
Recall (Weighted): 0.59
F1-Score (Weighted): 0.45
F1-Score (Per Class): [0.         0.74336283 0.07843137 0.0173913 ]
Matthews Correlation Coefficient (MCC): 0.10

Evaluating Bagging for lr...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 58.17%
Precision (Weighted): 0.34
Recall (Weighted): 0.58
F1-Score (Weighted): 0.43
F1-Score (Per Class): [0.         0.73737374 0.         0.        ]
Matthews Correlation Coefficient (MCC): -0.00

Evaluating Bagging for rf...
Accuracy: 62.55%
Precision (Weighted): 0.68
Recall (Weighted): 0.63
F1-Score (Weighted): 0.54
F1-Score (Per Class): [0.31578947 0.75265957 0.15384615 0.25174825]
Matthews Correlation Coefficient (MCC): 0.24

Evaluating Bagging for gb...
Accuracy: 60.56%
Precision (Weighted): 0.61
Recall (Weighted): 0.61
F1-Score (Weighted): 0.51
F1-Score (Per Class): [0.25       0.74338624 0.1509434  0.17266187]
Matthews Correlation Coefficient (MCC): 0.18

Evaluating Bagging for xgb...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Accuracy: 60.16%
Precision (Weighted): 0.58
Recall (Weighted): 0.60
F1-Score (Weighted): 0.51
F1-Score (Per Class): [0.28070175 0.74202128 0.14814815 0.15602837]
Matthews Correlation Coefficient (MCC): 0.17

Evaluating Bagging for lgb...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1382
[LightGBM] [Info] Number of data points in the train set: 2414, number of used features: 10
[LightGBM] [Info] Start training from score -2.574105
[LightGBM] [Info] Start training from score -0.471164
[LightGBM] [Info] Start training from score -2.446706
[LightGBM] [Info] Start training from score -1.546817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1380
[LightGBM] [Info] Number of data points in the train set: 2414

# MLP

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are all columns but the last; the target is the last column.
# Rows with a missing feature are dropped and the target is re-aligned on
# the surviving row labels.

# Training split
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

# Validation split
X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

# Test split
X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Base network; the grid below varies architecture, activation, solver,
# L2 penalty and learning-rate schedule.
mlp = MLPClassifier(max_iter=500, random_state=42)

# 4 * 4 * 3 * 5 * 3 = 720 candidate settings.
# NOTE(review): the features are passed to the network unscaled here, unlike
# the StandardScaler pipelines used in the earlier cells - confirm that this
# is intended.
param_grid = dict(
    hidden_layer_sizes=[(10,), (50,), (10, 10), (50, 50)],
    activation=['identity', 'logistic', 'tanh', 'relu'],
    solver=['lbfgs', 'sgd', 'adam'],
    alpha=np.logspace(-5, 3, 5),
    learning_rate=['constant', 'invscaling', 'adaptive'],
)

# Exhaustive search with 2-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration.
print("Best Parameters:", grid_search.best_params_)

# Predict the test split with the best configuration found by the search.
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_test)

# Test-set metrics (weighted averages plus per-class F1).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_perclass = f1_score(y_test, y_pred, average=None)
mcc = matthews_corrcoef(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# One report entry per line, in the order computed above.
print(
    f'Accuracy: {accuracy * 100:.2f}%',
    f'Precision (Weighted): {precision:.2f}',
    f'Recall (Weighted): {recall:.2f}',
    f'F1-Score (Weighted): {f1_weighted:.2f}',
    f'F1-Score (Per Class): {f1_perclass}',
    f'Matthews Correlation Coefficient (MCC): {mcc:.2f}',
    'Confusion Matrix:',
    conf_matrix,
    sep='\n',
)


Fitting 2 folds for each of 720 candidates, totalling 1440 fits
Best Parameters: {'activation': 'relu', 'alpha': 10.0, 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Accuracy: 58.57%
Precision (Weighted): 0.38
Recall (Weighted): 0.59
F1-Score (Weighted): 0.44
F1-Score (Per Class): [0.         0.74017744 0.07407407 0.        ]
Matthews Correlation Coefficient (MCC): 0.05
Confusion Matrix:
[[  0  46   0   1]
 [  0 292   2   0]
 [  0  46   2   0]
 [  0 111   2   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# RANDOM FOREST

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are all columns but the last; the target is the last column.
# Rows with a missing feature are dropped and the target is re-aligned on
# the surviving row labels.

# Training split
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1].loc[X_train.index]

# Validation split
X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1].loc[X_validate.index]

# Test split
X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1].loc[X_test.index]

# Create the RandomForestClassifier (seeded so the search is repeatable)
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning.
# 'auto' is deliberately absent from max_features: the recorded run of this
# cell shows 768 of 2304 fits failing with InvalidParameterError, because
# scikit-learn >= 1.3 no longer accepts it. For classifiers it was an alias
# of 'sqrt', so dropping it removes only failing / duplicate candidates.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 100, None],
    'max_features': ['sqrt', None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 10],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

# Set up the GridSearchCV with 2-fold cross-validation on all cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=1)

# Fit the model using the training set
grid_search.fit(X_train, y_train)

# Best parameters from grid search
print("Best Parameters:", grid_search.best_params_)

# Predict the validation split with the best forest found by the search.
best_rf = grid_search.best_estimator_
y_validate_pred = best_rf.predict(X_validate)

# Validation metrics (weighted averages plus per-class F1).
accuracy_validate = accuracy_score(y_validate, y_validate_pred)
precision_validate = precision_score(y_validate, y_validate_pred, average='weighted')
recall_validate = recall_score(y_validate, y_validate_pred, average='weighted')
f1_weighted_validate = f1_score(y_validate, y_validate_pred, average='weighted')
f1_perclass_validate = f1_score(y_validate, y_validate_pred, average=None)
mcc_validate = matthews_corrcoef(y_validate, y_validate_pred)
conf_matrix_validate = confusion_matrix(y_validate, y_validate_pred)

# Validation report, one entry per line.
print(
    f'Validation Accuracy: {accuracy_validate * 100:.2f}%',
    f'Validation Precision (Weighted): {precision_validate:.2f}',
    f'Validation Recall (Weighted): {recall_validate:.2f}',
    f'Validation F1-Score (Weighted): {f1_weighted_validate:.2f}',
    f'Validation F1-Score (Per Class): {f1_perclass_validate}',
    f'Validation Matthews Correlation Coefficient (MCC): {mcc_validate:.2f}',
    'Validation Confusion Matrix:',
    conf_matrix_validate,
    sep='\n',
)

# Same model, now scored on the held-out test split.
y_test_pred = best_rf.predict(X_test)

# Test metrics (weighted averages plus per-class F1).
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_weighted_test = f1_score(y_test, y_test_pred, average='weighted')
f1_perclass_test = f1_score(y_test, y_test_pred, average=None)
mcc_test = matthews_corrcoef(y_test, y_test_pred)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)

# Test report, one entry per line.
print(
    f'Test Accuracy: {accuracy_test * 100:.2f}%',
    f'Test Precision (Weighted): {precision_test:.2f}',
    f'Test Recall (Weighted): {recall_test:.2f}',
    f'Test F1-Score (Weighted): {f1_weighted_test:.2f}',
    f'Test F1-Score (Per Class): {f1_perclass_test}',
    f'Test Matthews Correlation Coefficient (MCC): {mcc_test:.2f}',
    'Test Confusion Matrix:',
    conf_matrix_test,
    sep='\n',
)


Fitting 2 folds for each of 1152 candidates, totalling 2304 fits


768 fits failed out of a total of 2304.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
384 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

Best Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Validation Accuracy: 65.41%
Validation Precision (Weighted): 0.68
Validation Recall (Weighted): 0.65
Validation F1-Score (Weighted): 0.56
Validation F1-Score (Per Class): [0.06451613 0.78614458 0.13043478 0.23853211]
Validation Matthews Correlation Coefficient (MCC): 0.24
Validation Confusion Matrix:
[[  1  26   2   1]
 [  0 261   0   3]
 [  0  36   3   0]
 [  0  77   2  13]]
Test Accuracy: 62.55%
Test Precision (Weighted): 0.69
Test Recall (Weighted): 0.63
Test F1-Score (Weighted): 0.53
Test F1-Score (Per Class): [0.31578947 0.75555556 0.1509434  0.18604651]
Test Matthews Correlation Coefficient (MCC): 0.25
Test Confusion Matrix:
[[  9  38   0   0]
 [  1 289   1   3]
 [  0  43   4   1]
 [  0 101   0  12]]


# NN

In [None]:
from tensorflow.keras.metrics import AUC, Precision, Recall, F1Score, TruePositives, TrueNegatives, FalsePositives, FalseNegatives

# Define the model
# NOTE: Sequential, Dense, num_features and num_classes are not defined or
# imported in this cell; they must already exist from an earlier cell.
# Architecture: three ReLU hidden layers (64 -> 32 -> 16 units) followed by a
# softmax output layer with one unit per class.
model = Sequential([
    Dense(64, activation='relu', input_shape=(num_features,)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile the model with additional metrics
# The metric names given here ('precision', 'auc', 'tp', ...) are the names
# that show up in the training log and in the evaluation results.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  # presumably the *_encoded targets are one-hot — TODO confirm
    metrics=[
        'accuracy',  # Explicitly include accuracy
        Precision(name='precision'),
        Recall(name='recall'),
        AUC(name='auc'),
        F1Score(name='f1_score', average='weighted'),
        TruePositives(name='tp'),
        TrueNegatives(name='tn'),
        FalsePositives(name='fp'),
        FalseNegatives(name='fn')
    ]
)

# Train the model
# X_*_scaled / y_*_encoded are produced by an earlier cell (not visible here).
# The validation split is only used for per-epoch monitoring; no early
# stopping callback is configured, so all 500 epochs always run.
history = model.fit(
    X_train_scaled, y_train_encoded,
    epochs=500,
    batch_size=32,
    validation_data=(X_validate_scaled, y_validate_encoded),
    verbose=1
)

# Evaluate the model on the test set.
# return_dict=True makes Keras hand back {metric_name: value}. Pairing
# model.metrics_names with the positional result list is unreliable: recent
# Keras versions report only ['loss', 'compile_metrics'] there, which truncates
# the zip and hides the 'accuracy' entry.
test_metrics = model.evaluate(X_test_scaled, y_test_encoded, verbose=0, return_dict=True)

# Keep the positional list under the original name for any later cell that
# still indexes into `test_results`.
test_results = list(test_metrics.values())

# Print all metrics
print("\nTest Results:")
for name, value in test_metrics.items():
    print(f"{name}: {float(value):.4f}")

# Safely get accuracy if it exists
if 'accuracy' in test_metrics:
    test_accuracy = test_metrics['accuracy']
    print(f"\nTest accuracy: {test_accuracy:.4f}")
else:
    print("\nAccuracy metric not found in model metrics.")

# Make predictions on the test set
# `predictions` holds the softmax output (one score per class and row);
# `predicted_classes` is the index of the highest score in each row.
predictions = model.predict(X_test_scaled)
predicted_classes = np.argmax(predictions, axis=1)

print("\nSample predictions:")
# Guard against test sets with fewer than five rows.
for i in range(min(5, len(predicted_classes))):
    print(f"True class: {y_test.iloc[i]}, Predicted class: {predicted_classes[i]}")

# Calculate additional evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

precision_weighted = precision_score(y_test, predicted_classes, average='weighted')
recall_weighted = recall_score(y_test, predicted_classes, average='weighted')
f1_weighted = f1_score(y_test, predicted_classes, average='weighted')
f1_per_class = f1_score(y_test, predicted_classes, average=None)

# For AUC, the true labels are binarized and scored against the predicted
# class probabilities. Scoring against one-hot *hard* predictions (as before)
# collapses every ROC curve to a single operating point and understates AUC.
y_test_bin = to_categorical(y_test, num_classes=num_classes)
# Retained only so later cells that reference `predictions_bin` keep working.
predictions_bin = to_categorical(predicted_classes, num_classes=num_classes)
auc_weighted = roc_auc_score(y_test_bin, predictions, average='weighted', multi_class='ovr')

mcc = matthews_corrcoef(y_test, predicted_classes)

print("\nAdditional Evaluation Metrics:")
print(f"Precision (Weighted): {precision_weighted:.4f}")
print(f"Recall (Weighted): {recall_weighted:.4f}")
print(f"F1 (Weighted): {f1_weighted:.4f}")
print(f"F1 Per-class: {', '.join([f'{score:.4f}' for score in f1_per_class])}")
print(f"AUC (Weighted): {auc_weighted:.4f}")
print(f"MCC: {mcc:.4f}")

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 24ms/step - accuracy: 0.3642 - auc: 0.6788 - f1_score: 0.3465 - fn: 1163.4546 - fp: 50.4416 - loss: 1.2851 - precision: 0.4018 - recall: 0.0399 - tn: 3690.9092 - tp: 83.6623 - val_accuracy: 0.6212 - val_auc: 0.7982 - val_f1_score: 0.4760 - val_fn: 203.0000 - val_fp: 140.0000 - val_loss: 1.0520 - val_precision: 0.6133 - val_recall: 0.5224 - val_tn: 1135.0000 - val_tp: 222.0000
Epoch 2/500
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6405 - auc: 0.8055 - f1_score: 0.5003 - fn: 515.1818 - fp: 398.5325 - loss: 1.0243 - precision: 0.6518 - recall: 0.5885 - tn: 3342.8181 - tp: 731.9351 - val_accuracy: 0.6235 - val_auc: 0.8023 - val_f1_score: 0.4813 - val_fn: 169.0000 - val_fp: 158.0000 - val_loss: 1.0331 - val_precision: 0.6184 - val_recall: 0.6024 - val_tn: 1117.0000 - val_tp: 256.0000
Epoch 3/500
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.639

# Genetic Algorithm Feature Selection

In [None]:
!pip install deap

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.1


In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from deap import base, creator, tools, algorithms
import random
import pandas as pd

# Read the three pre-split CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column except the last. Rows with a missing feature are
# dropped, and the target (last column) is re-aligned on the surviving index.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1][X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1][X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1][X_test.index]

# Genetic Algorithm Setup
# One gene (bit) per feature column.
NUM_FEATURES = X_train.shape[1]

# Create custom Fitness class with accuracy as the metric (single objective,
# maximised). DEAP stores these classes in the global `creator` module, so
# re-running this cell would otherwise re-create them and emit a warning each
# time; only create them when they do not exist yet.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Generate random feature selection (0 or 1 for each feature)
def create_individual():
    """Return a new Individual: a random 0/1 list of length NUM_FEATURES."""
    return creator.Individual(np.random.randint(0, 2, NUM_FEATURES).tolist())

# Evaluate individual: train classifier using selected features and return accuracy
def evaluate(individual):
    """Fitness of a feature mask: validation accuracy of a random forest.

    Args:
        individual: sequence of 0/1 flags, one per feature column.

    Returns:
        A one-element tuple ``(accuracy,)`` as DEAP expects. An individual that
        selects no feature gets fitness 0.

    The fitness is computed on the validation split, not on the test split:
    selecting features against test accuracy leaks the test set into model
    selection and makes the final test metrics optimistic.
    """
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if not selected_features:  # If no features are selected, return 0 fitness
        return 0,

    # Check if X_train is a DataFrame or NumPy array
    if isinstance(X_train, pd.DataFrame):
        X_train_selected = X_train.iloc[:, selected_features]
        X_validate_selected = X_validate.iloc[:, selected_features]
    else:
        X_train_selected = X_train[:, selected_features]
        X_validate_selected = X_validate[:, selected_features]

    # Fixed seed: the same mask must always receive the same fitness, otherwise
    # selection is driven partly by forest-to-forest noise.
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_selected, y_train)
    y_pred = clf.predict(X_validate_selected)
    accuracy = accuracy_score(y_validate, y_pred)
    return accuracy,

# Crossover (two-point crossover)
def crossover(ind1, ind2):
    """Apply DEAP's two-point crossover to both parents and return them as a pair."""
    parents = (ind1, ind2)
    tools.cxTwoPoint(*parents)
    return parents

# Mutation: flip bit (0 to 1 or 1 to 0) with probability 0.05
def mutate(individual):
    """Flip each bit of `individual` in place with probability 0.05.

    Returns a one-element tuple holding the same (mutated) individual.
    """
    for position, bit in enumerate(individual):
        if random.random() < 0.05:
            individual[position] = 1 - bit
    return (individual,)

# Set up the toolbox for the Genetic Algorithm
# Wire the GA building blocks into a DEAP toolbox.
toolbox = base.Toolbox()
toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Variation and fitness operators are plain functions defined in this cell.
for alias, operator in (("mate", crossover), ("mutate", mutate), ("evaluate", evaluate)):
    toolbox.register(alias, operator)
# Parents are picked by tournaments of three individuals.
toolbox.register("select", tools.selTournament, tournsize=3)

# Genetic Algorithm main process
def genetic_algorithm_feature_selection():
    """Run the GA and return the best feature mask found.

    Returns:
        (best_individual, selected_features): the best 0/1 mask seen during the
        whole run and the list of column indexes whose bit is 1.
    """
    population = toolbox.population(n=50)
    ngen = 20  # Number of generations
    cxpb = 0.5  # Crossover probability
    mutpb = 0.2  # Mutation probability

    # eaSimple replaces the whole population each generation (no elitism), so
    # the best individual can be lost before the final generation. The hall of
    # fame keeps a copy of the best individual ever evaluated.
    hall_of_fame = tools.HallOfFame(1)

    # Run the algorithm
    algorithms.eaSimple(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        halloffame=hall_of_fame, verbose=False)

    # Select the best individual (with highest fitness score over the run)
    best_individual = hall_of_fame[0]
    print("Best Individual:", best_individual)

    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
    print(f"Selected Features (Indexes): {selected_features}")

    return best_individual, selected_features

# Run the GA
# Executes the full evolutionary search defined above; the function prints the
# winning mask and returns it together with the selected column indexes.
best_individual, selected_features = genetic_algorithm_feature_selection()

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef

# Final evaluation with selected features
# Same DataFrame / ndarray branching as in evaluate(): positional column
# selection on whichever container X_train happens to be.
if isinstance(X_train, pd.DataFrame):
    X_train_selected = X_train.iloc[:, selected_features]
    X_test_selected = X_test.iloc[:, selected_features]
else:
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

# Refit a fresh forest on the selected columns and predict the test split.
# No random_state is set, so the numbers below vary from run to run.
clf = RandomForestClassifier()
clf.fit(X_train_selected, y_train)
y_pred = clf.predict(X_test_selected)

# Calculate metrics
# 'weighted' averages per-class scores by class support; average=None returns
# one F1 value per class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_perclass = f1_score(y_test, y_pred, average=None)
mcc = matthews_corrcoef(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print metrics
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')
print('Confusion Matrix:')
print(conf_matrix)



Best Individual: [0, 1, 1, 0, 1, 0, 1, 1, 1, 0]
Selected Features (Indexes): [1, 2, 4, 6, 7, 8]
Accuracy: 63.35%
Precision (Weighted): 0.63
Recall (Weighted): 0.63
F1-Score (Weighted): 0.57
F1-Score (Per Class): [0.38095238 0.75623269 0.25396825 0.32051282]
Matthews Correlation Coefficient (MCC): 0.28
Confusion Matrix:
[[ 12  31   1   3]
 [  4 273   5  12]
 [  0  37   8   3]
 [  0  87   1  25]]


## Genetic Algorithm Feature Selection with Hyperparameter Tuning

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from deap import base, creator, tools, algorithms
import random

# Read the three pre-split CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column except the last. Rows with a missing feature are
# dropped, and the target (last column) is re-aligned on the surviving index.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1][X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1][X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1][X_test.index]

# Genetic Algorithm Setup
# One gene (bit) per feature column.
NUM_FEATURES = X_train.shape[1]

# DEAP keeps these classes in the global `creator` module. An earlier cell in
# this notebook already creates them with the same definition, so creating them
# again unconditionally only triggers an overwrite warning.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

def create_individual():
    """Return a new Individual: a random 0/1 list of length NUM_FEATURES."""
    return creator.Individual(np.random.randint(0, 2, NUM_FEATURES).tolist())

def evaluate(individual):
    """Fitness of a feature mask: validation accuracy of a tuned random forest.

    A randomized hyperparameter search (50 candidates, 3-fold CV on the
    training split) is run on the selected columns; the refit best estimator is
    then scored on the validation split. Returns a one-element tuple, or
    ``(0,)`` when the mask selects no feature.
    """
    chosen = [idx for idx, flag in enumerate(individual) if flag == 1]
    if not chosen:
        return (0,)

    search_space = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    search = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=search_space,
        n_iter=50,
        cv=3,
        verbose=0,
        random_state=42,
        n_jobs=-1,
    )
    search.fit(X_train.iloc[:, chosen], y_train)

    validation_pred = search.best_estimator_.predict(X_validate.iloc[:, chosen])
    return (accuracy_score(y_validate, validation_pred),)

def crossover(ind1, ind2):
    """Two-point crossover; returns whatever DEAP's cxTwoPoint returns for the pair."""
    children = tools.cxTwoPoint(ind1, ind2)
    return children

def mutate(individual):
    """Bit-flip mutation via DEAP's mutFlipBit with per-gene probability 0.05."""
    mutated = tools.mutFlipBit(individual, indpb=0.05)
    return mutated

# Wire the GA building blocks into a DEAP toolbox.
toolbox = base.Toolbox()
toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Variation and fitness operators are plain functions defined in this cell.
for alias, operator in (("mate", crossover), ("mutate", mutate), ("evaluate", evaluate)):
    toolbox.register(alias, operator)
# Parents are picked by tournaments of three individuals.
toolbox.register("select", tools.selTournament, tournsize=3)

def genetic_algorithm_feature_selection():
    """Run the GA (with per-generation logging) and return the best feature mask.

    Returns:
        (best_individual, selected_features): the best 0/1 mask seen during the
        whole run and the list of column indexes whose bit is 1.
    """
    population = toolbox.population(n=50)
    ngen = 20
    cxpb = 0.5
    mutpb = 0.2

    # A Statistics object only reports the functions registered on it; without
    # any, verbose=True prints just the generation and evaluation counters.
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("max", np.max)

    # eaSimple has no elitism, so the best individual can disappear from the
    # population; the hall of fame keeps a copy of the best one ever evaluated.
    hall_of_fame = tools.HallOfFame(1)

    algorithms.eaSimple(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        stats=stats, halloffame=hall_of_fame, verbose=True)

    best_individual = hall_of_fame[0]
    print("Best Individual:", best_individual)

    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
    print(f"Selected Features (Indexes): {selected_features}")

    return best_individual, selected_features

# Run the GA; note that every fitness evaluation performs its own randomized
# hyperparameter search, so this call is expensive.
best_individual, selected_features = genetic_algorithm_feature_selection()

# Restrict train and test to the columns the GA selected.
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]

# Same search space as the one used inside evaluate().
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Tune the forest on the training split (3-fold CV, 50 sampled candidates) and
# keep the refit best estimator for the final test evaluation.
clf = RandomForestClassifier(random_state=42)
clf_random = RandomizedSearchCV(estimator=clf, param_distributions=param_dist,
                                n_iter=50, cv=3, verbose=0, random_state=42, n_jobs=-1)
clf_random.fit(X_train_selected, y_train)
best_clf = clf_random.best_estimator_

y_pred = best_clf.predict(X_test_selected)

# Test-set metrics; 'weighted' averages per-class scores by class support and
# average=None returns one F1 value per class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
f1_perclass = f1_score(y_test, y_pred, average=None)
mcc = matthews_corrcoef(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision (Weighted): {precision:.2f}')
print(f'Recall (Weighted): {recall:.2f}')
print(f'F1-Score (Weighted): {f1_weighted:.2f}')
print(f'F1-Score (Per Class): {f1_perclass}')
print(f'Matthews Correlation Coefficient (MCC): {mcc:.2f}')
print('Confusion Matrix:')
print(conf_matrix)



KeyboardInterrupt: 

# RL - FEATURE TRANSFORMATION

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import random

# Define transformation functions
def identity(x):
    """Return x unchanged."""
    return x

def square(x):
    """Element-wise x squared."""
    return x ** 2

def sqrt(x):
    """Square root of |x| (defined for negative inputs too)."""
    return np.sqrt(np.abs(x))

def cube(x):
    """Element-wise x cubed."""
    return x ** 3

def half(x):
    """x divided by two."""
    return x / 2

def double(x):
    """x multiplied by two."""
    return x * 2

def add_five(x):
    """x shifted up by five."""
    return x + 5

def subtract_five(x):
    """x shifted down by five."""
    return x - 5

def log(x):
    """Natural log of |x| + 1 (finite for every real x)."""
    return np.log(np.abs(x) + 1)

def exp(x):
    """Element-wise e ** x."""
    return np.exp(x)

def sin(x):
    """Element-wise sine."""
    return np.sin(x)

def cos(x):
    """Element-wise cosine."""
    return np.cos(x)

def tan(x):
    """Element-wise tangent."""
    return np.tan(x)

def reciprocal(x):
    """Return 1 / (x + 1e-8) without ever dividing by zero.

    The epsilon shift protects x == 0 but moves the singularity to
    x == -1e-8; that single point is mapped to the same value as x == 0
    instead of producing inf. All other inputs give the same result as
    before. The result is a NumPy value/array (np.where is used).
    """
    denom = x + 1e-8
    return 1 / np.where(denom == 0, 1e-8, denom)

def sigmoid(x):
    """Logistic function 1 / (1 + e ** -x)."""
    return 1 / (1 + np.exp(-x))

def tanh(x):
    """Element-wise hyperbolic tangent."""
    return np.tanh(x)

def abs_value(x):
    """Element-wise absolute value."""
    return np.abs(x)

def negative(x):
    """Element-wise negation."""
    return -x

def power_three_halves(x):
    """Sign-preserving |x| ** 1.5."""
    return np.sign(x) * (np.abs(x) ** 1.5)

def log_square(x):
    """Square of log(|x| + 1)."""
    return np.log(np.abs(x) + 1) ** 2

# List of transformation functions
# The position of a function in this list is its action index in the Q-table.
transformations = [
    identity, square, sqrt, cube, half, double, add_five, subtract_five,
    log, exp, sin, cos, tan, reciprocal, sigmoid, tanh, abs_value,
    negative, power_three_halves, log_square
]

class FeatureTransformAgent:
    """Tabular epsilon-greedy Q-learning agent.

    The Q-table has one row per feature (the state) and one column per
    available action.
    """

    def __init__(self, n_features, n_actions):
        self.n_features, self.n_actions = n_features, n_actions
        self.q_table = np.zeros((n_features, n_actions))
        self.epsilon = 0.1  # exploration probability
        self.alpha = 0.1    # learning rate
        self.gamma = 0.9    # discount factor

    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = random.uniform(0, 1) < self.epsilon
        if not explore:
            return np.argmax(self.q_table[state])
        return random.randint(0, self.n_actions - 1)

    def update_q_table(self, state, action, reward, next_state):
        """Blend the stored Q-value with the one-step bootstrapped target."""
        current = self.q_table[state, action]
        best_next = np.max(self.q_table[next_state])
        self.q_table[state, action] = (
            (1 - self.alpha) * current + self.alpha * (reward + self.gamma * best_next)
        )

def transform_features(X, agent):
    """Return a copy of X with each column replaced by a transformed version.

    For every column index the agent is asked for an action (epsilon-greedy,
    so the choice can be random), and the matching entry of the module-level
    `transformations` list is applied to that column. X is not modified.
    """
    result = X.copy()
    for col, values in enumerate(X.T):
        chosen = agent.get_action(col)
        result[:, col] = transformations[chosen](values)
    return result

def train_agent(X_train, y_train, X_val, y_val, agent, n_episodes):
    """Train the transformation agent with validation F1 as the reward signal.

    Each episode samples one action per feature, applies exactly those actions
    to both the training and the validation matrix, fits a random forest and
    rewards the sampled actions with the weighted-F1 gain over an untransformed
    baseline.

    Sampling is done once per episode on purpose: asking the epsilon-greedy
    agent again for the validation transform or for the Q-update can return
    different actions, so the model would be scored on differently transformed
    data and the reward credited to actions that were never applied.

    Args:
        X_train, X_val: 2-D feature matrices (scaled inside this function).
        y_train, y_val: matching targets.
        agent: object exposing get_action(state) and
            update_q_table(state, action, reward, next_state).
        n_episodes: number of training episodes.

    Returns:
        The same agent, updated in place.
    """
    def _apply_actions(X, actions):
        # Apply transformations[actions[col]] to every column of a copy of X.
        transformed = X.copy()
        for col, action in enumerate(actions):
            transformed[:, col] = transformations[action](X[:, col])
        return transformed

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Baseline: same model on the untransformed (scaled) features.
    base_model = RandomForestClassifier(n_estimators=50, random_state=42)
    base_model.fit(X_train_scaled, y_train)
    base_score = f1_score(y_val, base_model.predict(X_val_scaled), average='weighted')

    n_features = X_train_scaled.shape[1]

    for episode in range(n_episodes):
        actions = [agent.get_action(feature) for feature in range(n_features)]
        X_transformed = _apply_actions(X_train_scaled, actions)

        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(X_transformed, y_train)

        X_val_transformed = _apply_actions(X_val_scaled, actions)
        episode_score = f1_score(y_val, model.predict(X_val_transformed), average='weighted')

        reward = episode_score - base_score

        # Credit the shared episode reward to the actions that were applied;
        # the "next state" of a feature is the following feature (wrapping).
        for feature, action in enumerate(actions):
            next_state = (feature + 1) % n_features
            agent.update_q_table(feature, action, reward, next_state)

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}/{n_episodes}, Weighted F1 Score: {episode_score:.4f}")

    return agent

# Load and prepare your data
# Replace this with your actual data loading code
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column except the last. Rows with a missing feature are
# dropped, and the target (last column) is re-aligned on the surviving index.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1][X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1][X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1][X_test.index]
# Initialize and train the agent
agent = FeatureTransformAgent(n_features=X_train.shape[1], n_actions=len(transformations))
trained_agent = train_agent(X_train, y_train, X_validate, y_validate, agent, n_episodes=1000)

# Freeze the learned policy: the greedy (highest-Q) action of every feature.
# Asking the agent through get_action() here would still explore with
# probability epsilon, so the train and test matrices could receive different
# random transformations and neither would match the policy printed below.
learned_actions = np.argmax(trained_agent.q_table, axis=1)

def apply_learned_policy(X):
    """Return a copy of X with each column transformed by its greedy action."""
    X_out = X.copy()
    for feature, action in enumerate(learned_actions):
        X_out[:, feature] = transformations[action](X[:, feature])
    return X_out

# Apply learned transformations to train and test set (scaler fit on train only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_transformed = apply_learned_policy(X_test_scaled)

# Train final model on transformed data
final_model = RandomForestClassifier(n_estimators=500, random_state=42)
X_train_transformed = apply_learned_policy(X_train_scaled)
final_model.fit(X_train_transformed, y_train)

# Evaluate final model
final_score = f1_score(y_test, final_model.predict(X_test_transformed), average='weighted')
print(f"Final Weighted F1 Score on test set: {final_score:.4f}")

# Print learned transformation policy (the same actions that were applied above)
print("\nLearned Transformation Policy:")
for feature in range(X_train.shape[1]):
    best_action = learned_actions[feature]
    print(f"Feature {feature}: {transformations[best_action].__name__}")

Episode 10/1000, Weighted F1 Score: 0.5227
Episode 20/1000, Weighted F1 Score: 0.5377
Episode 30/1000, Weighted F1 Score: 0.5491
Episode 40/1000, Weighted F1 Score: 0.5643
Episode 50/1000, Weighted F1 Score: 0.5736
Episode 60/1000, Weighted F1 Score: 0.4944
Episode 70/1000, Weighted F1 Score: 0.5380
Episode 80/1000, Weighted F1 Score: 0.5299
Episode 90/1000, Weighted F1 Score: 0.5126
Episode 100/1000, Weighted F1 Score: 0.5340
Episode 110/1000, Weighted F1 Score: 0.4959
Episode 120/1000, Weighted F1 Score: 0.5477
Episode 130/1000, Weighted F1 Score: 0.5406
Episode 140/1000, Weighted F1 Score: 0.5337
Episode 150/1000, Weighted F1 Score: 0.5126
Episode 160/1000, Weighted F1 Score: 0.5549
Episode 170/1000, Weighted F1 Score: 0.5838
Episode 180/1000, Weighted F1 Score: 0.5366
Episode 190/1000, Weighted F1 Score: 0.4910
Episode 200/1000, Weighted F1 Score: 0.5863
Episode 210/1000, Weighted F1 Score: 0.5219
Episode 220/1000, Weighted F1 Score: 0.5833
Episode 230/1000, Weighted F1 Score: 0.56

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import random

# NOTE: this cell reuses the transformation functions and the `transformations`
# list defined in the previous cell; run that cell first.

class FeatureTransformAgent:
    """Tabular epsilon-greedy Q-learning agent that also remembers its best episode.

    The Q-table has one row per feature (the state) and one column per
    available action. `best_actions` / `best_score` are plain attributes that
    callers update when an episode beats the previous best.
    """

    def __init__(self, n_features, n_actions):
        self.n_features, self.n_actions = n_features, n_actions
        self.q_table = np.zeros((n_features, n_actions))
        self.epsilon = 0.1  # exploration probability
        self.alpha = 0.1    # learning rate
        self.gamma = 0.9    # discount factor
        self.best_actions = None
        self.best_score = -np.inf

    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = random.uniform(0, 1) < self.epsilon
        if not explore:
            return np.argmax(self.q_table[state])
        return random.randint(0, self.n_actions - 1)

    def update_q_table(self, state, action, reward, next_state):
        """Blend the stored Q-value with the one-step bootstrapped target."""
        current = self.q_table[state, action]
        best_next = np.max(self.q_table[next_state])
        self.q_table[state, action] = (
            (1 - self.alpha) * current + self.alpha * (reward + self.gamma * best_next)
        )

    def get_best_actions(self):
        """Greedy action index for every feature (row-wise argmax of the Q-table)."""
        return self.q_table.argmax(axis=1)

def transform_features(X, actions):
    """Return a copy of X where column i is transformed by transformations[actions[i]].

    `transformations` is the module-level list of functions; X itself is left
    untouched.
    """
    result = X.copy()
    n_columns = X.shape[1]
    for col in range(n_columns):
        func = transformations[actions[col]]
        result[:, col] = func(X[:, col])
    return result

def train_agent(X_train, y_train, X_val, y_val, agent, n_episodes):
    """Train the agent; the reward is the weighted-F1 gain over an untransformed baseline.

    Every episode samples one action per feature, applies the same actions to
    the scaled training and validation matrices, fits a 100-tree random forest
    and updates the Q-table with the shared reward. The best-scoring action
    list is stored on the agent (`best_actions`, `best_score`). Returns the
    agent, which is updated in place.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    val_scaled = scaler.transform(X_val)

    # Baseline score on the scaled but otherwise untransformed features.
    baseline = RandomForestClassifier(n_estimators=100, random_state=42)
    baseline.fit(train_scaled, y_train)
    base_score = f1_score(y_val, baseline.predict(val_scaled), average='weighted')

    n_states = X_train.shape[1]

    for episode in range(n_episodes):
        actions = [agent.get_action(i) for i in range(agent.n_features)]

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(transform_features(train_scaled, actions), y_train)

        val_pred = model.predict(transform_features(val_scaled, actions))
        episode_score = f1_score(y_val, val_pred, average='weighted')

        # Remember the best action set seen so far.
        if episode_score > agent.best_score:
            agent.best_score = episode_score
            agent.best_actions = actions

        reward = episode_score - base_score

        # Every feature receives the same episode reward; its "next state" is
        # the following feature, wrapping around at the end.
        for feature in range(n_states):
            agent.update_q_table(feature, actions[feature], reward, (feature + 1) % n_states)

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}/{n_episodes}, Weighted F1 Score: {episode_score:.4f}")

    return agent

# Load and prepare your data
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column except the last. Rows with a missing feature are
# dropped, and the target (last column) is re-aligned on the surviving index.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1][X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1][X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1][X_test.index]

# Initialize and train the agent
# One Q-table row per feature column, one action per entry of `transformations`.
agent = FeatureTransformAgent(n_features=X_train.shape[1], n_actions=len(transformations))
trained_agent = train_agent(X_train, y_train, X_validate, y_validate, agent, n_episodes=1000)

# Apply learned transformations to all sets
# The scaler is fit on the training split only and reused for the other two.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

# Use the best action list recorded during training (not the greedy Q-table
# policy), applied identically to every split.
best_actions = trained_agent.best_actions
X_train_transformed = transform_features(X_train_scaled, best_actions)
X_val_transformed = transform_features(X_val_scaled, best_actions)
X_test_transformed = transform_features(X_test_scaled, best_actions)

# Train final model on transformed data
final_model = RandomForestClassifier(n_estimators=500, random_state=42)
final_model.fit(X_train_transformed, y_train)

# Evaluate final model
# NOTE: best_actions was chosen by validation F1, so the validation score is
# an optimistic estimate; the test score is the unbiased one.
val_score = f1_score(y_validate, final_model.predict(X_val_transformed), average='weighted')
test_score = f1_score(y_test, final_model.predict(X_test_transformed), average='weighted')
print(f"Final Weighted F1 Score on validation set: {val_score:.4f}")
print(f"Final Weighted F1 Score on test set: {test_score:.4f}")

# Print learned transformation policy
print("\nLearned Transformation Policy:")
for feature, action in enumerate(best_actions):
    print(f"Feature {feature}: {transformations[action].__name__}")

Episode 10/1000, Weighted F1 Score: 0.5972
Episode 20/1000, Weighted F1 Score: 0.5805
Episode 30/1000, Weighted F1 Score: 0.5972
Episode 40/1000, Weighted F1 Score: 0.5883
Episode 50/1000, Weighted F1 Score: 0.5988
Episode 60/1000, Weighted F1 Score: 0.5972
Episode 70/1000, Weighted F1 Score: 0.5972
Episode 80/1000, Weighted F1 Score: 0.5972
Episode 90/1000, Weighted F1 Score: 0.5994
Episode 100/1000, Weighted F1 Score: 0.5972
Episode 110/1000, Weighted F1 Score: 0.5836
Episode 120/1000, Weighted F1 Score: 0.5812
Episode 130/1000, Weighted F1 Score: 0.5805
Episode 140/1000, Weighted F1 Score: 0.5937
Episode 150/1000, Weighted F1 Score: 0.5709
Episode 160/1000, Weighted F1 Score: 0.5972
Episode 170/1000, Weighted F1 Score: 0.5972
Episode 180/1000, Weighted F1 Score: 0.5972
Episode 190/1000, Weighted F1 Score: 0.5972
Episode 200/1000, Weighted F1 Score: 0.5972
Episode 210/1000, Weighted F1 Score: 0.5972
Episode 220/1000, Weighted F1 Score: 0.5972
Episode 230/1000, Weighted F1 Score: 0.59

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from deap import base, creator, tools, algorithms
import random

# Define transformation functions (same as in the second approach)
def identity(x): return x
def square(x): return x ** 2
def sqrt(x): return np.sqrt(np.abs(x))  # root of |x|, so negatives are allowed
def cube(x): return x ** 3
def half(x): return x / 2
def double(x): return x * 2
def add_five(x): return x + 5
def subtract_five(x): return x - 5
def log(x): return np.log(np.abs(x) + 1)  # finite for every real x
def exp(x): return np.exp(x)
def sin(x): return np.sin(x)
def cos(x): return np.cos(x)
def tan(x): return np.tan(x)
def reciprocal(x):
    """Return 1 / (x + 1e-8) without ever dividing by zero.

    The epsilon shift protects x == 0 but moves the singularity to
    x == -1e-8; that single point is mapped to the same value as x == 0
    instead of producing inf. All other inputs give the same result as
    before. The result is a NumPy value/array (np.where is used).
    """
    denom = x + 1e-8
    return 1 / np.where(denom == 0, 1e-8, denom)
def sigmoid(x): return 1 / (1 + np.exp(-x))
def tanh(x): return np.tanh(x)
def abs_value(x): return np.abs(x)
def negative(x): return -x
def power_three_halves(x): return np.sign(x) * (np.abs(x) ** 1.5)  # sign-preserving
def log_square(x): return np.log(np.abs(x) + 1) ** 2

# The position of a function in this list is its action index in the Q-table.
transformations = [
    identity, square, sqrt, cube, half, double, add_five, subtract_five,
    log, exp, sin, cos, tan, reciprocal, sigmoid, tanh, abs_value,
    negative, power_three_halves, log_square
]

# Read the three pre-split CSV files.
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')
test_df = pd.read_csv('/content/test.csv')

# Features are every column except the last. Rows with a missing feature are
# dropped, and the target (last column) is re-aligned on the surviving index.
X_train = train_df.iloc[:, :-1].dropna()
y_train = train_df.iloc[:, -1][X_train.index]

X_validate = validate_df.iloc[:, :-1].dropna()
y_validate = validate_df.iloc[:, -1][X_validate.index]

X_test = test_df.iloc[:, :-1].dropna()
y_test = test_df.iloc[:, -1][X_test.index]

# Genetic Algorithm Setup
# One gene (bit) per feature column.
NUM_FEATURES = X_train.shape[1]

# Create custom Fitness class with F1 score as the metric (single objective,
# maximised). DEAP keeps these classes in the global `creator` module and
# earlier cells already create them with the same definition, so only create
# them when missing to avoid the overwrite warning.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

def create_individual():
    """Return a new Individual: a random 0/1 list of length NUM_FEATURES."""
    return creator.Individual(np.random.randint(0, 2, NUM_FEATURES).tolist())

def evaluate(individual):
    """Fitness of a feature mask: weighted F1 on the validation split.

    A 500-tree random forest (fixed seed) is fit on the selected training
    columns. Returns a one-element tuple, or ``(0,)`` when no feature is
    selected.
    """
    chosen = [idx for idx, flag in enumerate(individual) if flag == 1]
    if not chosen:
        return (0,)

    forest = RandomForestClassifier(n_estimators=500,random_state=42)
    forest.fit(X_train.iloc[:, chosen], y_train)
    validation_pred = forest.predict(X_validate.iloc[:, chosen])
    return (f1_score(y_validate, validation_pred, average='weighted'),)

def crossover(ind1, ind2):
    """Apply DEAP's two-point crossover to both parents and return them as a pair."""
    parents = (ind1, ind2)
    tools.cxTwoPoint(*parents)
    return parents

def mutate(individual):
    """Flip each bit of `individual` in place with probability 0.05.

    Returns a one-element tuple holding the same (mutated) individual.
    """
    for position, bit in enumerate(individual):
        if random.random() < 0.05:
            individual[position] = 1 - bit
    return (individual,)

# Wire the GA building blocks into a DEAP toolbox.
toolbox = base.Toolbox()
toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Variation and fitness operators are plain functions defined in this cell.
for alias, operator in (("mate", crossover), ("mutate", mutate), ("evaluate", evaluate)):
    toolbox.register(alias, operator)
# Parents are picked by tournaments of three individuals.
toolbox.register("select", tools.selTournament, tournsize=3)

def genetic_algorithm_feature_selection():
    """Run the GA and return the best feature mask found.

    Returns:
        (best_individual, selected_features): the best 0/1 mask seen during the
        whole run and the list of column indexes whose bit is 1.
    """
    population = toolbox.population(n=50)
    ngen = 20  # Number of generations
    cxpb = 0.5  # Crossover probability
    mutpb = 0.2  # Mutation probability

    # eaSimple replaces the whole population each generation (no elitism), so
    # the best individual can be lost before the final generation. The hall of
    # fame keeps a copy of the best individual ever evaluated.
    hall_of_fame = tools.HallOfFame(1)
    algorithms.eaSimple(population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
                        halloffame=hall_of_fame, verbose=False)

    best_individual = hall_of_fame[0]
    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]
    return best_individual, selected_features

best_individual, selected_features = genetic_algorithm_feature_selection()

# Reinforcement Learning for Feature Transformation
class FeatureTransformAgent:
    """Tabular epsilon-greedy Q-learning agent.

    The Q-table has one row per feature (the state) and one column per
    available action.
    """

    def __init__(self, n_features, n_actions):
        self.n_features, self.n_actions = n_features, n_actions
        self.q_table = np.zeros((n_features, n_actions))
        self.epsilon = 0.1  # exploration probability
        self.alpha = 0.1    # learning rate
        self.gamma = 0.9    # discount factor

    def get_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = random.uniform(0, 1) < self.epsilon
        if not explore:
            return np.argmax(self.q_table[state])
        return random.randint(0, self.n_actions - 1)

    def update_q_table(self, state, action, reward, next_state):
        """Blend the stored Q-value with the one-step bootstrapped target."""
        current = self.q_table[state, action]
        best_next = np.max(self.q_table[next_state])
        self.q_table[state, action] = (
            (1 - self.alpha) * current + self.alpha * (reward + self.gamma * best_next)
        )

def transform_features(X, agent):
    """Return a copy of X with each column replaced by a transformed version.

    For every column index the agent is asked for an action (epsilon-greedy,
    so the choice can be random), and the matching entry of the module-level
    `transformations` list is applied to that column. X is not modified.
    """
    result = X.copy()
    for col, values in enumerate(X.T):
        chosen = agent.get_action(col)
        result[:, col] = transformations[chosen](values)
    return result

def _apply_actions(X, actions):
    """Return a copy of X with transformations[actions[i]] applied to column i."""
    X_out = X.copy()
    for col, action in enumerate(actions):
        X_out[:, col] = transformations[action](X[:, col])
    return X_out


def train_agent(X_train, y_train, X_val, y_val, agent, n_episodes):
    """Train ``agent`` to choose a transformation per feature.

    Features are standardised with a scaler fitted on ``X_train``. A baseline
    random forest on the untransformed scaled data gives a reference weighted
    F1 on the validation set. In each episode the agent picks one action per
    feature, the same actions are applied to the training and validation
    matrices, a fresh random forest is fitted and scored, and the reward
    (episode F1 minus baseline F1) is credited to exactly the actions that
    were applied.

    Args:
        X_train, y_train: Training features (2-D array) and labels.
        X_val, y_val: Validation features (2-D array) and labels.
        agent: Object exposing ``get_action(state)`` and
            ``update_q_table(state, action, reward, next_state)``.
        n_episodes: Number of training episodes.

    Returns:
        The same ``agent`` instance, updated in place.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    base_model = RandomForestClassifier(n_estimators=500, random_state=42)
    base_model.fit(X_train_scaled, y_train)
    base_score = f1_score(y_val, base_model.predict(X_val_scaled), average='weighted')

    n_features = X_train.shape[1]

    for episode in range(n_episodes):
        # Sample the episode's actions once so that train and validation data
        # go through identical transformations and the Q-update credits the
        # actions that actually produced the reward.
        actions = [agent.get_action(feature) for feature in range(n_features)]

        X_transformed = _apply_actions(X_train_scaled, actions)
        model = RandomForestClassifier(n_estimators=500, random_state=42)
        model.fit(X_transformed, y_train)
        X_val_transformed = _apply_actions(X_val_scaled, actions)
        episode_score = f1_score(y_val, model.predict(X_val_transformed), average='weighted')
        reward = episode_score - base_score

        for feature, action in enumerate(actions):
            # The "next state" is simply the following feature, wrapping around.
            next_state = (feature + 1) % n_features
            agent.update_q_table(feature, action, reward, next_state)

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}/{n_episodes}, Weighted F1 Score: {episode_score:.4f}")

    return agent

# Restrict every split to the GA-selected columns (as NumPy arrays), then build
# and train the transformation agent on the train/validation pair.
X_train_selected, X_validate_selected, X_test_selected = (
    frame.iloc[:, selected_features].values
    for frame in (X_train, X_validate, X_test)
)

agent = FeatureTransformAgent(
    n_features=X_train_selected.shape[1],
    n_actions=len(transformations),
)
trained_agent = train_agent(
    X_train_selected,
    y_train,
    X_validate_selected,
    y_validate,
    agent,
    n_episodes=1000,
)

# Apply learned transformations to test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Turn exploration off before applying the policy: with epsilon > 0 the agent
# would still pick random actions, so the train and test sets could receive
# different transformations and neither would match the policy printed below.
trained_agent.epsilon = 0.0
X_test_transformed = transform_features(X_test_scaled, trained_agent)

# Train final model on transformed data
final_model = RandomForestClassifier(n_estimators=500, random_state=42)
X_train_transformed = transform_features(X_train_scaled, trained_agent)
final_model.fit(X_train_transformed, y_train)

# Evaluate final model
final_score = f1_score(y_test, final_model.predict(X_test_transformed), average='weighted')
print(f"Final Weighted F1 Score on test set: {final_score:.4f}")

# Print learned transformation policy (greedy action per feature, i.e. the
# same actions that were applied to the train and test sets above)
print("\nLearned Transformation Policy:")
for feature in range(X_train_selected.shape[1]):
    best_action = np.argmax(trained_agent.q_table[feature])
    print(f"Feature {feature}: {transformations[best_action].__name__}")




Episode 10/1000, Weighted F1 Score: 0.5881
Episode 20/1000, Weighted F1 Score: 0.4810
Episode 30/1000, Weighted F1 Score: 0.5726
Episode 40/1000, Weighted F1 Score: 0.5873
Episode 50/1000, Weighted F1 Score: 0.5048
Episode 60/1000, Weighted F1 Score: 0.5848
Episode 70/1000, Weighted F1 Score: 0.4880
Episode 80/1000, Weighted F1 Score: 0.5882
Episode 90/1000, Weighted F1 Score: 0.4885
Episode 100/1000, Weighted F1 Score: 0.5778
Episode 110/1000, Weighted F1 Score: 0.4573
Episode 120/1000, Weighted F1 Score: 0.4712
Episode 130/1000, Weighted F1 Score: 0.5068
Episode 140/1000, Weighted F1 Score: 0.5118
Episode 150/1000, Weighted F1 Score: 0.5625
Episode 160/1000, Weighted F1 Score: 0.5075
Episode 170/1000, Weighted F1 Score: 0.5037
Episode 180/1000, Weighted F1 Score: 0.5726
Episode 190/1000, Weighted F1 Score: 0.5139
Episode 200/1000, Weighted F1 Score: 0.5064
Episode 210/1000, Weighted F1 Score: 0.5012
Episode 220/1000, Weighted F1 Score: 0.5791
Episode 230/1000, Weighted F1 Score: 0.47

# FEATURE PROCESSING
