In [1]:
from datasets.airline_cs import AirlineCSDataset
from random_forest import RandomForestClassifier, TournamentRandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    ConfusionMatrixDisplay,
    f1_score,
    precision_score,
    recall_score,
)
from tree import DecisionTreeClassifier, RandomizedDecisionTreeClassifier, TournamentDecisionTreeClassifier, RandomizedTournamentDecisionTreeClassifier
import math, time
import seaborn as sns
from utils.experiments import grid_search

In [2]:
# Relative path to the raw airline customer-satisfaction CSV.
path = "../data/airline/Airline_customer_satisfaction.csv"
# Wrap the CSV in the project's dataset helper; the following cells read
# `dataset.labels` / `dataset.data` and call `dataset.clean()` / `dataset.split()`.
dataset = AirlineCSDataset(path=path)

In [3]:
# Class balance of the target: number of rows per satisfaction label.
dataset.labels.value_counts()

satisfaction
satisfied       71087
dissatisfied    58793
Name: count, dtype: int64

In [4]:
# Summary of the feature frame: column names, non-null counts and dtypes.
dataset.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Customer Type                      129880 non-null  object 
 1   Age                                129880 non-null  int64  
 2   Type of Travel                     129880 non-null  object 
 3   Class                              129880 non-null  object 
 4   Flight Distance                    129880 non-null  int64  
 5   Seat comfort                       129880 non-null  int64  
 6   Departure/Arrival time convenient  129880 non-null  int64  
 7   Food and drink                     129880 non-null  int64  
 8   Gate location                      129880 non-null  int64  
 9   Inflight wifi service              129880 non-null  int64  
 10  Inflight entertainment             129880 non-null  int64  
 11  Online support                     1298

In [5]:
# Project-defined preprocessing step; what it changes is not visible from this notebook.
# NOTE(review): the return value is discarded, so this presumably mutates `dataset`
# in place — confirm that re-running this cell on an already-cleaned dataset is safe.
dataset.clean()

In [6]:
# Split into train and validation parts; the validation part is what the grid
# searches below receive for scoring.
# NOTE(review): test_size=0.4 presumably holds out 40% of the rows and random_state
# presumably seeds the shuffle — confirm against AirlineCSDataset.split.
X_train, X_val, y_train, y_val = dataset.split(test_size=0.4, random_state=42)

## Decision Tree

In [7]:
# Search space for the single decision tree: two candidate depths and one
# fixed `max_split_values` setting (2 combinations in total).
params_matrix = dict(
    max_depth=[6, 7],
    max_split_values=[1000],
)

In [8]:
# Grid-search DecisionTreeClassifier over `params_matrix`, handing the
# train/validation split to the project's `grid_search` helper.
# `n_calls` and `path` are forwarded as-is; their exact semantics are defined
# by `grid_search` (presumably repetitions per setting and the results CSV —
# TODO confirm).
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments while the search runs.
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix,
    DecisionTreeClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
    path="../out/airline_cs/decision_tree.csv",
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:DecisionTreeClassifier(max_depth=6) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=23, split_val=0.5, depth=5) created
INFO:root:Node(split_feature=None, split_val=None, depth=5) created
INFO:root:Node(split_feature=22, split_val=0.5, depth=4) created
INFO:root:Node(split_feature=None, split_val=None, depth=4) created
INFO:root:Node(split_feature=21, split_val=0.5, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=2, split_val=3.5, depth=5) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=None, split_val=None, depth=6) created
INFO:root:Node(split_feature=21, split_val=0.5, depth=5) created
INFO:root:Node(split_feature=17, split_va

Execution time: 173.85856080055237
Best params: {'max_depth': 7, 'max_split_values': 1000}
Best score: 0.8949607329842932
All rEsults: [{'max_depth': 6, 'max_split_values': 1000, 'accuracy': 0.8837773329226979, 'precision': 0.8575933813892529, 'recall': 0.8910212765957447, 'f1': 0.873987812004341}, {'max_depth': 7, 'max_split_values': 1000, 'accuracy': 0.8949607329842932, 'precision': 0.8991990796052922, 'recall': 0.8647234042553191, 'f1': 0.8816243302457754}]





## Random Forest

In [9]:
# Square root of the number of columns in X_train, rounded to an integer;
# used below as the single `max_features` value in the forest search spaces.
n_features = round(math.sqrt(X_train.shape[1]))

In [15]:
# Random-forest search space: 5 forest sizes x 5 depths (25 combinations),
# with `max_split_values` and `max_features` held at a single value each.
params_matrix = dict(
    n_trees=list(range(10, 51, 10)),
    max_depth=list(range(3, 8)),
    max_split_values=[1000],
    max_features=[n_features],
)

In [16]:

# Grid-search RandomForestClassifier over `params_matrix`, handing the
# train/validation split to the project's `grid_search` helper.
# The file name's spelling ("forestt") is kept unchanged so results saved by
# earlier runs stay addressable under the same name.
path = "../out/airline_cs/random_forestt_classifier.csv"
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments during this multi-hour search.
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix,
    RandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
    path=path,
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/25 [00:00<?, ?it/s]INFO:root:RandomForestClassifier: n_trees=10, max_depth=3
INFO:root:DecisionTreeClassifier(max_depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=8, split_val=3.5, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=18, split_val=0.5, depth=2) created
INFO:root:Node(split_feature=6, split_val=4.5, depth=1) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=17, split_val=0.5, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=9, split_val=3.5, d

Execution time: 9502.660490989685
Best params: {'n_trees': 10, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5}
Best score: 0.8072259008315368
All rEsults: [{'n_trees': 10, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6687904219279335, 'precision': 0.5780517450946345, 'recall': 0.9916170212765958, 'f1': 0.7303527494397694}, {'n_trees': 10, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.756756236526024, 'precision': 0.6566040999913503, 'recall': 0.969063829787234, 'f1': 0.782805974253648}, {'n_trees': 10, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7260548198336927, 'precision': 0.6252906505164116, 'recall': 0.9841276595744681, 'f1': 0.7647058823529411}, {'n_trees': 10, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.8013358484755159, 'precision': 0.7003435590283056, 'recall': 0.9802127659574468, 'f1': 0.8169743398769307}, {'n_trees': 10, 'max_depth': 7, 'max_s




All rEsults: [{'n_trees': 10, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6687904219279335, 'precision': 0.5780517450946345, 'recall': 0.9916170212765958, 'f1': 0.7303527494397694}, {'n_trees': 10, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.756756236526024, 'precision': 0.6566040999913503, 'recall': 0.969063829787234, 'f1': 0.782805974253648}, {'n_trees': 10, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7260548198336927, 'precision': 0.6252906505164116, 'recall': 0.9841276595744681, 'f1': 0.7647058823529411}, {'n_trees': 10, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.8013358484755159, 'precision': 0.7003435590283056, 'recall': 0.9802127659574468, 'f1': 0.8169743398769307}, {'n_trees': 10, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.8072259008315368, 'precision': 0.7062494264476461, 'recall': 0.982468085106383, 'f1': 0.8217686106315958}, {'n_trees': 20, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6912149676624576, 'precision': 0.5951566806165153, 'recall': 0.992468085106383, 'f1': 0.7440977539561}, {'n_trees': 20, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.673025100092393, 'precision': 0.5806963201585925, 'recall': 0.9971914893617021, 'f1': 0.7339754130451804}, {'n_trees': 20, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.701320449645827, 'precision': 0.6026752411575562, 'recall': 0.9969787234042553, 'f1': 0.7512304609218436}, {'n_trees': 20, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7559670465044657, 'precision': 0.651977305920683, 'recall': 0.9877872340425532, 'f1': 0.785496751488901}, {'n_trees': 20, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7496535263319988, 'precision': 0.6442751869775627, 'recall': 0.9970638297872341, 'f1': 0.7827553952027795}, 
{'n_trees': 30, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6567600862334463, 'precision': 0.5689068274654736, 'recall': 0.9956595744680851, 'f1': 0.7240824410472241}, {'n_trees': 30, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6661918694179242, 'precision': 0.5757702529776553, 'recall': 0.9956170212765958, 'f1': 0.7296058375951104}, {'n_trees': 30, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6837080381890976, 'precision': 0.5889011873616422, 'recall': 0.9961702127659574, 'f1': 0.7402137481818757}, {'n_trees': 30, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6958153680320296, 'precision': 0.5984296273561984, 'recall': 0.9956595744680851, 'f1': 0.7475518778255243}, {'n_trees': 30, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7472667077302125, 'precision': 0.6423746498984019, 'recall': 0.9954893617021277, 'f1': 0.7808671851530425}, {'n_trees': 40, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6351054819833692, 'precision': 0.5536072498997002, 'recall': 0.9982127659574468, 'f1': 0.712218966799751}, {'n_trees': 40, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6485602094240838, 'precision': 0.5629231286311038, 'recall': 0.9977872340425532, 'f1': 0.7197716180127084}, {'n_trees': 40, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6699645826917154, 'precision': 0.5783747779751333, 'recall': 0.9976595744680851, 'f1': 0.7322443625460678}, {'n_trees': 40, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6822066522944257, 'precision': 0.5875018777226979, 'recall': 0.9985531914893617, 'f1': 0.7397623025755808}, {'n_trees': 40, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.7198567908838929, 'precision': 0.6177477098030957, 'recall': 0.9985957446808511, 'f1': 0.7633034087952121}, 
{'n_trees': 50, 'max_depth': 3, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6301778564829073, 'precision': 0.550247310063527, 'recall': 0.9988510638297873, 'f1': 0.7095935549207213}, {'n_trees': 50, 'max_depth': 4, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6667885740683708, 'precision': 0.5759815355906401, 'recall': 0.9982127659574468, 'f1': 0.7304716085135535}, {'n_trees': 50, 'max_depth': 5, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6607445334154605, 'precision': 0.5714807154155007, 'recall': 0.9993617021276596, 'f1': 0.7271460639368372}, {'n_trees': 50, 'max_depth': 6, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6699068370803819, 'precision': 0.578250905596215, 'recall': 0.9985531914893617, 'f1': 0.7323855745072643}, {'n_trees': 50, 'max_depth': 7, 'max_split_values': 1000, 'max_features': 5, 'accuracy': 0.6866145672928857, 'precision': 0.5907753439126826, 'recall': 0.9996170212765958, 'f1': 0.7426457803139276}

In [None]:
# Tournament random-forest search space: 3 forest sizes x 3 depths x 3
# tournament sizes (27 combinations), with `max_split_values` and
# `max_features` held at a single value each.
params_matrix = dict(
    n_trees=[10, 50, 100],
    max_depth=[3, 5, 7],
    tournament_size=[3, 5, 7],
    max_split_values=[1000],
    max_features=[n_features],
)

In [None]:

# Grid-search TournamentRandomForestClassifier over `params_matrix`, handing
# the train/validation split to the project's `grid_search` helper.
# The output file is named after the tournament variant so its results are not
# stored under (or mistaken for) the plain random-forest results file.
path = "../out/airline_cs/tournament_random_forest_classifier.csv"
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by system
# clock adjustments while the search runs.
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix,
    TournamentRandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
    path=path,
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")