In [1]:
from datasets.airline_cs import AirlineCSDataset
from random_forest import RandomForestClassifier, TournamentRandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    ConfusionMatrixDisplay,
    f1_score,
    precision_score,
    recall_score,
)
from tree import DecisionTreeClassifier, RandomizedDecisionTreeClassifier, TournamentDecisionTreeClassifier, RandomizedTournamentDecisionTreeClassifier
import math, time
import seaborn as sns
from utils.experiments import grid_search

In [2]:
# Load the airline customer-satisfaction CSV through the project's dataset wrapper.
# The path is relative, so it resolves against the notebook's working directory.
path = "../data/airline/Airline_customer_satisfaction.csv"
dataset = AirlineCSDataset(path=path)

In [3]:
# Class-balance check: number of rows per label value.
dataset.labels.value_counts()

satisfaction
satisfied       71087
dissatisfied    58793
Name: count, dtype: int64

In [4]:
# Summarize the feature frame: column names, non-null counts and dtypes.
dataset.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Customer Type                      129880 non-null  object 
 1   Age                                129880 non-null  int64  
 2   Type of Travel                     129880 non-null  object 
 3   Class                              129880 non-null  object 
 4   Flight Distance                    129880 non-null  int64  
 5   Seat comfort                       129880 non-null  int64  
 6   Departure/Arrival time convenient  129880 non-null  int64  
 7   Food and drink                     129880 non-null  int64  
 8   Gate location                      129880 non-null  int64  
 9   Inflight wifi service              129880 non-null  int64  
 10  Inflight entertainment             129880 non-null  int64  
 11  Online support                     1298

In [5]:
# NOTE(review): the return value is discarded, so this presumably cleans/encodes
# the dataset in place — confirm in AirlineCSDataset.clean.
dataset.clean()

In [6]:
# Split into training and validation parts.
# Assumes test_size=0.4 is the fraction of rows held out for validation and that
# random_state=42 makes the partition repeatable — TODO confirm in AirlineCSDataset.split.
X_train, X_val, y_train, y_val = dataset.split(test_size=0.4, random_state=42)

## Decision Tree

In [10]:
# Hyper-parameter grid handed to grid_search for the decision tree:
# each key maps to the list of candidate values to try.
params_matrix = dict(
    max_depth=[6, 7],
    max_split_values=[1000],
)

In [11]:
# Grid-search DecisionTreeClassifier over params_matrix on the train/validation
# split. A CSV path under ../out/ is passed to grid_search (presumably where it
# stores the per-combination results — confirm in utils.experiments).
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way a difference of time.time() values can.
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix,
    DecisionTreeClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
    path="../out/airline_cs/decision_tree.csv",
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:DecisionTreeClassifier(max_depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=3, split_val=0.5, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=10, split_val=4.5, depth=2) created
INFO:root:Node(split_feature=25, split_val=0.5, depth=1) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=25, split_val=0.5, depth=2) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=None, split_val=None, depth=3) created
INFO:root:Node(split_feature=24, split_val=0.5, depth=2) created
INFO:root:Node(split_feature=8, split_val

Execution time: 107.61101412773132
Best params: {'max_depth': 5, 'max_split_values': 1000}
Best score: 0.8735948567908839
All rEsults: [{'max_depth': 3, 'max_split_values': 1000, 'accuracy': 0.8290537419156144, 'precision': 0.8512662790138882, 'recall': 0.7537872340425532, 'f1': 0.7995666794556411}, {'max_depth': 5, 'max_split_values': 1000, 'accuracy': 0.8735948567908839, 'precision': 0.8381865388456161, 'recall': 0.892936170212766, 'f1': 0.864695580508911}]





## Random Forest

In [7]:
# Rounded square root of X_train's second dimension (the column count, assuming
# a 2-D feature table); used below as the only max_features candidate.
n_features = round(math.sqrt(X_train.shape[1]))

In [8]:
# Hyper-parameter grid handed to grid_search for the random forest:
# each key maps to the list of candidate values to try.
params_matrix = dict(
    n_trees=[100, 200],
    max_depth=[3, 5],
    max_split_values=[1000],
    max_features=[n_features],
)

In [9]:

# Output CSV for this search. A dedicated name is used so the dataset `path`
# defined when loading the data is not overwritten.
results_path = "../out/airline_cs/random_forest_classifier.csv"
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way a difference of time.time() values can.
time_start = time.perf_counter()
# Grid-search RandomForestClassifier over params_matrix, passing the output
# path the same way the decision-tree search does.
best_params, score, all_results = grid_search(
    params_matrix,
    RandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
    path=results_path,
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/4 [00:00<?, ?it/s]INFO:root:DecisionTreeClassifier(max_depth=3) created
  0%|          | 0/4 [00:22<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# NOTE(review): this repeats the Random Forest grid search from the preceding
# cell (whose run was interrupted); consider keeping only one of the two.
n_calls = 1
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way a difference of time.time() values can.
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix,
    RandomForestClassifier,
    X_train,
    X_val,
    y_train,
    y_val,
    n_calls,
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")