<center>
<h1 style="font-family: 'Times New Roman', Times, serif; font-size: 60px;">DAA</h1>
<h2 style="font-family: 'Times New Roman', Times, serif; font-size: 40px;">Extra Trees</h2>
</center>

<font face="Times New Roman">

<h3 style="font-family: 'Times New Roman'">Imports</h3>

In [1]:
# Import necessary libraries
import pandas as pd                                                                                 # For data manipulation and analysis
import matplotlib.pyplot as plt                                                                     # For data visualization
import seaborn as sns                                                                               # For enhanced data visualization
from sklearn.model_selection import train_test_split                                                # For splitting data
from sklearn.ensemble import ExtraTreesClassifier                                                   # For the ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer    # For evaluating model performance
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV                # For cross-validation
import os                                                                                           # For interacting with the operating system

<h3 style="font-family: 'Times New Roman'">Load All Datasets</h3>

In [2]:
# Unlabeled set to predict on; read with pandas' default missing-value parsing.
final_teste = pd.read_csv(filepath_or_buffer="Ficheiros/Ficheiros_Models/Final_Teste.csv")

# Labeled training set. na_filter=False turns off NA detection, so cells such as
# "None" or "" are kept as literal strings rather than converted to NaN.
final = pd.read_csv(filepath_or_buffer="Ficheiros/Ficheiros_Models/Final.csv", na_filter=False)

<h3 style="font-family: 'Times New Roman'">Extra Trees</h3>

<h4 style="font-family: 'Times New Roman'">Test Locally</h4>

In [3]:
# Target vector: the 'injection' column.
y = final['injection']
# Feature matrix: every remaining column of the training frame.
X = final.drop('injection', axis=1)

In [4]:
# Hold out 30% of the labeled data for a final local evaluation; the remaining
# 70% is used for the cross-validated hyperparameter search.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2024)

# Extra Trees hyperparameter grid.
# - max_depth previously listed 30 twice, which made GridSearchCV fit every
#   depth-30 candidate two times; the duplicate is replaced by 40 so the values
#   are evenly spaced.
# - 'log_loss' was dropped from criterion: in scikit-learn it is the same
#   Shannon information gain as 'entropy', so it only repeated candidates.
# - random_state is set once on the estimator below instead of in the grid.
# NOTE: this is still 27,648 candidates x 5 folds; shrink the lists (or switch
# to a randomized search) if the cell takes too long to finish.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 1000],  # Number of trees in the ensemble
    'max_depth': [30, 40, 50, 60, 70, 80, 90, 100],  # Maximum depth of each tree
    'min_samples_split': [5, 6, 7, 8, 9, 10, 11, 12],  # Minimum samples required to split an internal node
    'criterion': ['gini', 'entropy'],  # Split quality measure
    'min_samples_leaf': [1, 2, 4, 5],  # Minimum samples required at a leaf
    'max_features': [None, 'sqrt', 'log2'],  # Features considered at each split
    'bootstrap': [True, False],  # Whether each tree is built on a bootstrap sample
}

modelo = ExtraTreesClassifier(random_state=2024)
scoring = make_scorer(accuracy_score)
cv_value = 5
# n_jobs=-1 spreads the fits over all CPU cores; verbose=1 reports progress so a
# long search is not silent.
grid_accuracy = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv_value,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_accuracy.fit(X_train, y_train)

# Best model, refitted on the whole training split (refit=True)
best_model = grid_accuracy.best_estimator_
best_model_params = best_model.get_params()

print("Hyperparameters of the Best Model:")
for param, value in best_model_params.items():
    print(f"{param}: {value}")

# Predict on the held-out test set
y_pred = best_model.predict(X_test)

# Cross-validated accuracy of the selected model. best_score_ is the mean fold
# accuracy of the best candidate; averaging cv_results_['mean_test_score'] would
# instead mix in the scores of every rejected candidate.
print("Mean Accuracy: {:.2f}%".format(grid_accuracy.best_score_ * 100))

# Accuracy on the held-out test set
accuracy_injection = accuracy_score(y_test, y_pred)
print("Extra Trees Model Accuracy: {:.2f}%".format(accuracy_injection * 100))

# Per-class precision / recall / F1
classification_report_str = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_report_str)

# Confusion matrix over every class seen during training, so rows and columns
# stay aligned even if a class happens to be absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=best_model.classes_)

# Custom color palette
color_palette = sns.light_palette("seagreen", as_cmap=True)

# Display names for the classes.
# NOTE(review): assumes the sorted class codes correspond, in order, to
# None < Low < Medium < High < Very High -- confirm against the label encoding.
tick_labels = ['None', 'Low', 'Medium', 'High', 'Very High']

# Plot the confusion matrix as a heatmap. The labels are handed to seaborn so
# they are centred on the cells; setting ticks at integer positions would put
# them on the cell edges.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap=color_palette,
    cbar=False,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)

# Add labels and title
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

plt.show()

KeyboardInterrupt: 

<h4 style="font-family: 'Times New Roman'">Kaggle</h4>

In [None]:
# Unlabeled Kaggle test set, read with pandas' default missing-value parsing.
final_teste_kaggle = pd.read_csv(filepath_or_buffer="Ficheiros/Ficheiros_Models/Final_Teste.csv")

# Full labeled set used to train the submission model. na_filter=False keeps
# cells such as "None" or "" as literal strings instead of NaN.
final_kaggle = pd.read_csv(filepath_or_buffer="Ficheiros/Ficheiros_Models/Final.csv", na_filter=False)

In [None]:
# Target vector: the 'injection' column of the full labeled set.
y_kaggle = final_kaggle['injection']
# Feature matrix: every remaining column.
X_kaggle = final_kaggle.drop('injection', axis=1)

In [None]:
# Single-point "grid": the hyperparameters chosen for the Kaggle submission.
# GridSearchCV is kept so the configuration is still scored with 10-fold
# cross-validation before being refitted on all labeled data.
param_grid_kaggle = {
    'n_estimators': [400],  # Number of trees in the ensemble
    'max_depth': [50],  # Maximum depth of each tree
    'min_samples_split': [7],  # Minimum samples required to split an internal node
    'criterion': ['entropy'],  # Split quality measure
    'min_samples_leaf': [1],  # Minimum samples required at a leaf
}

# NOTE(review): the seed here is 2023 while the local experiment uses 2024 --
# confirm this difference is intentional.
modelo_kaggle = ExtraTreesClassifier(random_state=2023)
scoring_kaggle = make_scorer(accuracy_score)
cv_value_kaggle = 10
grid_accuracy_kaggle = GridSearchCV(estimator=modelo_kaggle, param_grid=param_grid_kaggle, scoring=scoring_kaggle, cv=cv_value_kaggle, refit=True)

grid_accuracy_kaggle.fit(X_kaggle, y_kaggle)

# Model refitted on the entire labeled set (refit=True)
best_model_kaggle = grid_accuracy_kaggle.best_estimator_

# Predict on the test frame loaded in this section. The previous code read
# `final_teste` from the "Test Locally" section, so this cell only worked if
# that earlier cell had been run in the same kernel.
X_teste_kaggle = final_teste_kaggle
y_pred_kaggle = best_model_kaggle.predict(X_teste_kaggle)

In [None]:
# Integer class code -> label expected in the submission file.
reverse_mapping = {
    0: 'None',
    1: 'Low',
    2: 'Medium',
    3: 'High',
    4: 'Very High',
}

# Apply the inverse mapping to the predictions; an unknown code raises KeyError.
# The result overwrites y_pred_kaggle, so this cell is not re-runnable as-is:
# a second run would look up the string labels and fail.
y_pred_kaggle = list(map(reverse_mapping.__getitem__, y_pred_kaggle))

In [None]:
# Save the predictions to 'Outputs/Extra_Trees.csv' as "RowId,Result" rows,
# with RowId counting from 1 in prediction order.
# Create the output folder first: open(..., 'w') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('Outputs', exist_ok=True)

with open('Outputs/Extra_Trees.csv', 'w') as file:
    file.write("RowId,Result\n")
    for row_id, prediction in enumerate(y_pred_kaggle, start=1):
        file.write(f"{row_id},{prediction}\n")

print("Previsões salvas em Extra_Trees.csv")

Previsões salvas em Extra_Trees.csv
