In [1]:
from datasets.airline_cs import AirlineCSDataset
from random_forest import RandomForestClassifier, TournamentRandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    ConfusionMatrixDisplay,
    f1_score,
    precision_score,
    recall_score,
)
from tree import DecisionTreeClassifier, RandomizedDecisionTreeClassifier, TournamentDecisionTreeClassifier, RandomizedTournamentDecisionTreeClassifier
import math, time
import seaborn as sns
from utils.experiments import grid_search

In [2]:
# Load the airline customer-satisfaction CSV through the project's dataset wrapper.
# NOTE(review): the path is relative, so it presumably resolves against the kernel's
# working directory — confirm how AirlineCSDataset opens it.
path = "../data/airline/Airline_customer_satisfaction.csv"
dataset = AirlineCSDataset(path=path)

In [3]:
# Class balance of the target labels; kept as the cell's last expression so it is displayed.
dataset.labels.value_counts()

satisfaction
satisfied       71087
dissatisfied    58793
Name: count, dtype: int64

In [4]:
# Print column names, non-null counts and dtypes of the feature frame before clean() is called.
dataset.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Customer Type                      129880 non-null  object 
 1   Age                                129880 non-null  int64  
 2   Type of Travel                     129880 non-null  object 
 3   Class                              129880 non-null  object 
 4   Flight Distance                    129880 non-null  int64  
 5   Seat comfort                       129880 non-null  int64  
 6   Departure/Arrival time convenient  129880 non-null  int64  
 7   Food and drink                     129880 non-null  int64  
 8   Gate location                      129880 non-null  int64  
 9   Inflight wifi service              129880 non-null  int64  
 10  Inflight entertainment             129880 non-null  int64  
 11  Online support                     1298

In [5]:
# NOTE(review): clean() is defined in datasets.airline_cs; nothing is assigned here, so it
# presumably preprocesses the dataset in place — confirm that re-running this cell is safe.
dataset.clean()

In [6]:
# Split into training and validation sets. The fixed random_state is there to make the
# split repeatable; test_size=0.4 presumably holds out 40% of the rows for validation
# (the training log below reports 77928 of the 129880 rows) — confirm in dataset.split.
X_train, X_val, y_train, y_val = dataset.split(test_size=0.4, random_state=42)

## Decision Tree

In [10]:
# Hyper-parameter grid for the single decision tree: every key maps to the list of
# candidate values that grid_search will try for that parameter.
params_matrix = dict(
    max_depth=[3],
    max_split_values=[1000],
)

In [11]:
# Grid-search the single decision tree on the train/validation split and report the
# elapsed time together with the best configuration and the full list of results.
# NOTE(review): the meaning of n_calls is defined by utils.experiments.grid_search — confirm.
n_calls = 1
# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration (time.time() offers no such guarantee).
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix, DecisionTreeClassifier, X_train, X_val, y_train, y_val, n_calls
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/1 [00:00<?, ?it/s]INFO:root:DecisionTreeClassifier(max_depth=3) created
DEBUG:root:Feature values: 
DEBUG:root:Split values: [ 7.5  8.5  9.5 10.5 11.5 12.5 13.5 14.5 15.5 16.5 17.5 18.5 19.5 20.5
 21.5 22.5 23.5 24.5 25.5 26.5 27.5 28.5 29.5 30.5 31.5 32.5 33.5 34.5
 35.5 36.5 37.5 38.5 39.5 40.5 41.5 42.5 43.5 44.5 45.5 46.5 47.5 48.5
 49.5 50.5 51.5 52.5 53.5 54.5 55.5 56.5 57.5 58.5 59.5 60.5 61.5 62.5
 63.5 64.5 65.5 66.5 67.5 68.5 69.5 70.5 71.5 72.5 73.5 74.5 75.5 76.5
 77.5 78.5 79.5 82.5]
DEBUG:root:Classes.shape: (77928,)
DEBUG:root:Group created, entropy: 0.9935874530279281
DEBUG:root:Group created, entropy: 0.9941429994757438
DEBUG:root:Group created, entropy: 0.9934499476593011
DEBUG:root:Group created, entropy: 0.9833761901392237
DEBUG:root:Group created, entropy: 0.993194117068451
DEBUG:root:Group created, entropy: 0.9809979390454775
DEBUG:root:Group created, entropy: 0.992943818329453
DEBUG:root:Group created, entropy: 0.9798687566511527
DEBUG:root:Gro

Execution time: 495.2111656665802
Best params: {'max_depth': 3, 'max_split_values': 1000}
Best score: 0.8290537419156144
All rEsults: [{'max_depth': 3, 'max_split_values': 1000, 'accuracy': 0.8290537419156144, 'precision': 0.8512662790138882, 'recall': 0.7537872340425532, 'f1': 0.7995666794556411}]





## Random Forest

In [7]:
# Square root of the number of columns in X_train, rounded to the nearest integer;
# it is supplied to the random-forest grid as the only "max_features" candidate.
# Assumes X_train is 2-D with one column per feature — TODO confirm.
n_features = round(math.sqrt(X_train.shape[1]))

In [8]:
# Hyper-parameter grid for the random forest: every key maps to the list of candidate
# values that grid_search will try (2 forest sizes x 2 depths = 4 combinations).
params_matrix = dict(
    n_trees=[100, 200],
    max_depth=[3, 5],
    max_split_values=[1000],
    max_features=[n_features],
)

In [9]:
# Grid-search the random forest on the train/validation split and report the elapsed
# time together with the best configuration and the full list of results.
# NOTE(review): the meaning of n_calls is defined by utils.experiments.grid_search — confirm.
n_calls = 8
# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration (time.time() offers no such guarantee).
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix, RandomForestClassifier, X_train, X_val, y_train, y_val, n_calls
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")

  0%|          | 0/4 [00:00<?, ?it/s]INFO:root:DecisionTreeClassifier(max_depth=3) created
  0%|          | 0/4 [00:22<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Same random-forest grid search as the n_calls = 8 cell, but with n_calls = 1.
# NOTE(review): the meaning of n_calls is defined by utils.experiments.grid_search — confirm.
n_calls = 1
# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration (time.time() offers no such guarantee).
time_start = time.perf_counter()
best_params, score, all_results = grid_search(
    params_matrix, RandomForestClassifier, X_train, X_val, y_train, y_val, n_calls
)
print(f"Execution time: {time.perf_counter() - time_start}")
print(f"Best params: {best_params}")
print(f"Best score: {score}")
print(f"All results: {all_results}")