### Import packages

In [1]:
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


#### load Dataset and split into train and test 

In [3]:
# import dataset
# one_hot_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data Mining Gruppe A/train_set_ohe.csv")
one_hot_data = pd.read_csv("train_set_ohe.csv")

one_hot_data = one_hot_data[["Departure_FRA", "Departure_STN", "Departure_SXF", "Destination_FRA", "Destination_STN", "Destination_SXF", "Price_Dev_Cat_Falling", 
                                  "Price_Dev_Cat_Rising", "Price_Dev_Cat_Steady", "Price_In_Eur",
                                  "Price_Dev", "Price_Dev_Three_Days", "Same_Day_Request_route_Flight_price",
                                  "Request_Month", "Request_Time", "Request_Day", "Flight_Day", "Departure_hour",
                                  "Hours_to_Flight", "Request_Count", "Request_Count_Sum", "Last_Request_Bool", "Is_Holiday_UK", "Is_Holiday_GER",
                                  "Is_School_Holiday_BER", "Is_School_Holiday_FRA", "Is_School_Holiday_UK", "buy"]]
Y = np.array(one_hot_data["buy"])
X = one_hot_data.drop(
    ["buy"], axis=1
)
X.info()
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, shuffle=True, stratify=Y
)
X = np.array(X)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83624 entries, 0 to 83623
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Departure_FRA                        83624 non-null  float64
 1   Departure_STN                        83624 non-null  float64
 2   Departure_SXF                        83624 non-null  float64
 3   Destination_FRA                      83624 non-null  float64
 4   Destination_STN                      83624 non-null  float64
 5   Destination_SXF                      83624 non-null  float64
 6   Price_Dev_Cat_Falling                83624 non-null  float64
 7   Price_Dev_Cat_Rising                 83624 non-null  float64
 8   Price_Dev_Cat_Steady                 83624 non-null  float64
 9   Price_In_Eur                         83624 non-null  float64
 10  Price_Dev                            83624 non-null  float64
 11  Price_Dev_Three_Days        

#### Functions

In [4]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(max_depth=9,
                                      max_features='auto',
                                      min_samples_leaf=3,
                                      min_samples_split=9,
                                      random_state=42
                                      )
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini


# Function to perform training with entropy.
def train_using_entropy(X_train, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5
    )
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy


# Function to make predictions
def prediction(X_test, clf_object):
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred


# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))


#### Predict

In [None]:
# Build models
clf_gini = train_using_gini(X_train, y_train)
clf_entropy = train_using_entropy(X_train, y_train)

# Prediction using gini
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)

# # Prediction using entropy
# print("Results Using Entropy:")
# y_pred_entropy = prediction(X_test, clf_entropy)
# cal_accuracy(y_test, y_pred_entropy)

Results Using Gini Index:
Predicted values:
[0 0 1 ... 0 1 0]
Confusion Matrix:  [[15664   563]
 [ 3494  1185]]
Accuracy :  80.59408782167799
Report :                precision    recall  f1-score   support

           0       0.82      0.97      0.89     16227
           1       0.68      0.25      0.37      4679

    accuracy                           0.81     20906
   macro avg       0.75      0.61      0.63     20906
weighted avg       0.79      0.81      0.77     20906



#### Tuning

##### RandomizedSearch

In [None]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 100),
    "min_samples_split": range(1, 100),
    "min_samples_leaf": range(1, 50),
    "max_features": ["auto", "sqrt", "log2"],
}
# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=3)

# Fit it to the data
tree_cv.fit(X, Y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


Tuned Decision Tree Parameters: {'min_samples_split': 70, 'min_samples_leaf': 44, 'max_features': 'log2', 'max_depth': 23, 'criterion': 'gini'}
Best score is 0.6261380294176667


In [None]:
# {'min_samples_split': 60, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 3, 'criterion': 'gini'}
params = {
  "criterion":['gini', 'entropy'],
  "max_depth":range(1,3),
  "min_samples_split":range(58,61),
  "min_samples_leaf":range(1,3),
  "max_features": ['auto', 'sqrt', 'log2']
}
grid_random_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3, n_jobs=-1
)
grid_random_search_cv.fit(X_train, y_train)
grid_random_search_cv.best_estimator_


Fitting 3 folds for each of 72 candidates, totalling 216 fits


DecisionTreeClassifier(max_depth=2, max_features='log2', min_samples_split=58,
                       random_state=42)

In [None]:
grid_random_search_cv.best_score_


0.7798877515226889

##### GridSearch

In [None]:
params = {
  "criterion":['gini', 'entropy'],
  "max_depth":range(25,100),
  "min_samples_split":range(25,100),
  "min_samples_leaf":range(10,50),
  "max_features": ['auto', 'sqrt', 'log2']
}
grid_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3, n_jobs=-1
)
grid_search_cv.fit(X_train, y_train)
grid_search_cv.best_estimator_

Fitting 3 folds for each of 1350000 candidates, totalling 4050000 fits


DecisionTreeClassifier(max_depth=28, max_features='auto', min_samples_leaf=10,
                       min_samples_split=31, random_state=42)

In [None]:
print(f'Best Params: {grid_search_cv.best_params_}')
print(f'Best Score: {grid_search_cv.best_score_}')


Best Params: {'criterion': 'gini', 'max_depth': 28, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 31}
Best Score: 0.8272744666602888


#### Monetäres Maß

In [5]:
# The model quality evaluation function expects a Pandas dataframe with at least the following columns:
# Request_Date          int64
# flight_unique_id     object
# Price               float64
# buy                    bool

def model_quality_evaluation(df):
    # Make a copy of the provided dataframe as to not modify the original.
    df = df.copy()

    # Convert 'Price' to whole cents and store as integers to avoid floating point errors.
    df['Price'] = df['Price'] * 100
    df['Price'] = df['Price'].astype(int)

    # Initialize a variable that stores the sum of all our balances.
    sum_balances = 0

    # Get a list of all 'flight_unique_id'.
    flight_unique_ids = df['flight_unique_id'].unique()

    # Iterate over all 'flight_unique_id'.
    for flight_unique_id in flight_unique_ids:
        # Get a subset of the data for the specified 'flight_unique_id'.
        df_subset = df[df['flight_unique_id'] == flight_unique_id]

        # Get all request dates except for the latest request date before departure.
        # At the latest request date before departure we need to buy a ticket anyway,
        # so we don't care about this specific request date.
        request_dates = df_subset[df_subset['Request_Date'] != df_subset['Request_Date'].max()]

        # Make sure request dates are sorted in descending order.
        request_dates.sort_values(by='Request_Date', ascending=False, inplace=True)

        # Get the ticket price from the latest request date before departure,
        # because we certainly have to buy a ticket at this date.
        last_buying_price = df_subset[df_subset['Request_Date'] == df_subset['Request_Date'].max()]['Price'].values[0]

        # Iterate over the remaining request dates
        for _, row in request_dates.iterrows():
            # and check wether the model wants to buy a ticket at the specific request date.
            if(row['buy'] == 1):
                # If the model decides to buy a ticket the last buying price is set to the
                # price point of this request date and the balance doesn't change.
                last_buying_price = row['Price']
            else:
                # If the models decides to not buy a ticket the balance equals the
                # the current ticket price minus the last buying price.
                current_price = row['Price']
                balance = current_price - last_buying_price

                # The balance is added to the sum of all balances.
                sum_balances = sum_balances + balance

    # Return the sum of all our previously calculated balances.
    return sum_balances / 100

In [None]:
import itertools
params = {
    "criterion": "gini",
    "max_depth": [26, 28, 30],
    "min_samples_split": [29, 31, 33],
    "min_samples_leaf": [8, 10, 12],
    "max_features": "auto",
}
money_scores = {}
for param in itertools.product(params["max_depth"], params["min_samples_split"], params["min_samples_leaf"]):
    money_scores[','.join(str(x) for x in param)] = 2


money_scores

{'26,29,8': 2,
 '26,29,10': 2,
 '26,29,12': 2,
 '26,31,8': 2,
 '26,31,10': 2,
 '26,31,12': 2,
 '26,33,8': 2,
 '26,33,10': 2,
 '26,33,12': 2,
 '28,29,8': 2,
 '28,29,10': 2,
 '28,29,12': 2,
 '28,31,8': 2,
 '28,31,10': 2,
 '28,31,12': 2,
 '28,33,8': 2,
 '28,33,10': 2,
 '28,33,12': 2,
 '30,29,8': 2,
 '30,29,10': 2,
 '30,29,12': 2,
 '30,31,8': 2,
 '30,31,10': 2,
 '30,31,12': 2,
 '30,33,8': 2,
 '30,33,10': 2,
 '30,33,12': 2}

In [None]:
from sklearn.model_selection import StratifiedKFold
import itertools

train_set_or = pd.read_csv("./train_set.csv")
train_set = pd.read_csv("./train_set_ohe.csv")

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
combinations = {}

#  {'criterion': 'gini', 'max_depth': 28, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 31}
params = {
    "criterion": "gini",
    "max_depth": [26, 28, 30],
    "min_samples_split": [29, 31, 33],
    "min_samples_leaf": [8, 10, 12],
    "max_features": "auto",
}

for param in itertools.product(
    params["max_depth"], params["min_samples_split"], params["min_samples_leaf"]
):
    clf = DecisionTreeClassifier(
        max_depth=param[0],
        max_features="auto",
        min_samples_leaf=param[2],
        min_samples_split=param[1],
        random_state=42,
    )
    money_scores = []
    # for each fold create a dataframe
    for train_index, test_index in skf.split(X, Y):  # split() return index of each fold
        # get each fold train, test fold with index index
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = Y[train_index], Y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)

        X_train_1 = train_set_or.loc[test_index]
        X_train_1 = X_train_1.reset_index(drop=True)
        df = pd.DataFrame()
        df["buy"] = y_pred
        df["flight_unique_id"] = X_train_1["flight_unique_id"]
        df["Request_Date"] = X_train_1["Request_Date"]
        df["Price"] = X_train_1["Price_In_Eur"]
        # eval with custom func and append
        score = model_quality_evaluation(df)
        money_scores.append(score)

    combinations[",".join(str(x) for x in param)] = money_scores

# Print the output.
# print("List of possible accuracy:", money_scores)
# print("\nMaximum Money That can be obtained from this model is:", max(money_scores))
# print("\nMinimum Money:", min(money_scores))
# print("\nOverall Money:", np.mean(money_scores))
# print("\nStandard Deviation is:", np.std(money_scores))


In [55]:
first = True
maxMoney = 0
minMoney = 0
maxParam = ""
allMoney = []
for key, value in combinations.items():
  allMoney.extend(value)
  if first:
    maxMoney = max(value)
    minMoney = min(value)
    moneyParam = key
    first = False
  if max(value) > maxMoney:
    maxMoney = max(value)
    maxParam = key
  if min(value) < minMoney:
    minMoney = min(value)

# Print the output.
print("Maximum Money That can be obtained from this model is:", maxMoney)
print("\nMinimum Money:", minMoney)
print("\nOverall Money:", np.mean(allMoney))
print("\nStandard Deviation is:", np.std(allMoney))
print("\nParams for Maximum Money:", maxParam, "(max_depth, min_samples_split, min_samples_leaf)")
print("\nAll combinations:", combinations)
print("\nList of possible accuracy:", allMoney)



Maximum Money That can be obtained from this model is: -126921.45

Minimum Money: -182838.98

Overall Money: -151745.4148888889

Standard Deviation is: 10653.508223169449

Params for Maximum Money: 30,33,8 (max_depth, min_samples_split, min_samples_leaf)

All combinations: {'26,29,8': [-141908.32, -144875.44, -133809.2, -160021.22, -147499.31, -140524.27, -162148.0, -128831.55, -153823.88, -142543.14], '26,29,10': [-141077.83, -131269.85, -157569.98, -152051.14, -155063.89, -158908.51, -153746.57, -146288.2, -172579.48, -136411.4], '26,29,12': [-171095.7, -140120.58, -147523.94, -163015.84, -157059.84, -161196.93, -148619.28, -152445.71, -149698.23, -134088.68], '26,31,8': [-164278.01, -149991.78, -162421.65, -158715.63, -155817.18, -147987.52, -165817.92, -142641.2, -163755.57, -128869.33], '26,31,10': [-151686.13, -135388.15, -146168.21, -146838.06, -161462.25, -165438.0, -180108.64, -141426.5, -152404.26, -144365.3], '26,31,12': [-149087.44, -155257.45, -135373.96, -155683.03, -1525

In [8]:
params = {
    "criterion": "gini",
    "max_depth": 30,
    "min_samples_split": 33,
    "min_samples_leaf": 8,
    "max_features": "auto",
}

tree_clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=30,
    min_samples_split=33,
    min_samples_leaf=8,
    max_features="auto",
    random_state=42
)
tree_clf.fit(X_train, y_train)
y_pred_tree = prediction(X_test, tree_clf)
accuracy_score(y_test, y_pred_tree)

Predicted values:
[0 0 0 ... 0 1 0]


0.8298574571893237