## Import packages

In [1]:
# Importing the required packages
import numpy as np
import pandas as pd
import itertools
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## load Dataset and split into train and test 

In [3]:
# import dataset
# one_hot_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data Mining Gruppe A/train_set_ohe.csv")
one_hot_data = pd.read_csv("train_set_ohe.csv")

one_hot_data = one_hot_data[["Departure_FRA", "Departure_STN", "Departure_SXF", "Destination_FRA", "Destination_STN", "Destination_SXF", "Price_Dev_Cat_Falling", 
                                  "Price_Dev_Cat_Rising", "Price_Dev_Cat_Steady", "Price_In_Eur",
                                  "Price_Dev", "Price_Dev_Three_Days", "Same_Day_Request_route_Flight_price",
                                  "Request_Month", "Request_Time", "Request_Day", "Flight_Day", "Departure_hour",
                                  "Hours_to_Flight", "Request_Count", "Request_Count_Sum", "Last_Request_Bool", "Is_Holiday_UK", "Is_Holiday_GER",
                                  "Is_School_Holiday_BER", "Is_School_Holiday_FRA", "Is_School_Holiday_UK", "buy"]]
Y = np.array(one_hot_data["buy"])
X = one_hot_data.drop(
    ["buy"], axis=1
)
X.info()
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, shuffle=True, stratify=Y
)
X = np.array(X)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83624 entries, 0 to 83623
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Departure_FRA                        83624 non-null  float64
 1   Departure_STN                        83624 non-null  float64
 2   Departure_SXF                        83624 non-null  float64
 3   Destination_FRA                      83624 non-null  float64
 4   Destination_STN                      83624 non-null  float64
 5   Destination_SXF                      83624 non-null  float64
 6   Price_Dev_Cat_Falling                83624 non-null  float64
 7   Price_Dev_Cat_Rising                 83624 non-null  float64
 8   Price_Dev_Cat_Steady                 83624 non-null  float64
 9   Price_In_Eur                         83624 non-null  float64
 10  Price_Dev                            83624 non-null  float64
 11  Price_Dev_Three_Days        

## Functions

In [46]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(random_state=42)
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini


# Function to perform training with entropy.
def train_using_entropy(X_train, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=42)
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy


# Function to make predictions
def prediction(X_test, clf_object):
    # Predicton on test
    y_pred = clf_object.predict(X_test)
    return y_pred


# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))


## Predict

In [44]:
# Build models
clf_gini = train_using_gini(X_train, y_train)
clf_entropy = train_using_entropy(X_train, y_train)

# Prediction using gini
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)
scores = cross_val_score(clf_gini, X_train, y_train, cv=10, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print("Scores:", scores)
# Prediction using entropy
print("\nResults Using Entropy:")
y_pred_entropy = prediction(X_test, clf_entropy)
cal_accuracy(y_test, y_pred_entropy)
scores = cross_val_score(clf_entropy, X_train, y_train, cv=10, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print("Scores:", scores)

Results Using Gini Index:
Confusion Matrix:  [[14973  1254]
 [ 1293  3386]]
Accuracy :  87.81689467138621
Report :                precision    recall  f1-score   support

           0       0.92      0.92      0.92     16227
           1       0.73      0.72      0.73      4679

    accuracy                           0.88     20906
   macro avg       0.83      0.82      0.82     20906
weighted avg       0.88      0.88      0.88     20906

0.87 accuracy with a standard deviation of 0.00
Scores: [0.86559311 0.87866709 0.86702806 0.87404337 0.86670918 0.87324617
 0.87436224 0.87372449 0.87242864 0.87434221]

Results Using Entropy:
Confusion Matrix:  [[15004  1223]
 [ 1330  3349]]
Accuracy :  87.78819477661915
Report :                precision    recall  f1-score   support

           0       0.92      0.92      0.92     16227
           1       0.73      0.72      0.72      4679

    accuracy                           0.88     20906
   macro avg       0.83      0.82      0.82     20906
we

## Tuning

### RandomizedSearch

In [18]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 100),
    "min_samples_split": range(2, 100),
    "min_samples_leaf": range(1, 50),
    "max_features": ["auto", "sqrt", "log2"],
}
# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier(random_state=42)

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=3, random_state=42, n_iter=100)

# Fit it to the data
tree_cv.fit(X, Y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'min_samples_split': 23, 'min_samples_leaf': 42, 'max_features': 'sqrt', 'max_depth': 2, 'criterion': 'gini'}
Best score is 0.7761647374749474


In [50]:
# Cross Validation Score
randomized_tree = DecisionTreeClassifier(random_state=42, min_samples_split= 23, min_samples_leaf= 42, max_features= 'sqrt', max_depth= 2, criterion= 'gini')

scores = cross_val_score(randomized_tree, X_train, y_train, cv=10, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print("Scores:", scores)

0.78 accuracy with a standard deviation of 0.00
Scores: [0.77614796 0.77614796 0.77614796 0.77614796 0.77614796 0.77614796
 0.77614796 0.77614796 0.77611226 0.77627173]


### GridSearch

In [None]:
# Setup the parameters and distributions to sample from
params = {
  "criterion":['gini', 'entropy'],
  "max_depth":range(25,100),
  "min_samples_split":range(25,100),
  "min_samples_leaf":range(10,50),
  "max_features": ['auto', 'sqrt', 'log2']
}
# Instantiate the GridSearchCV object
grid_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3, n_jobs=-1
)
# Fit it to the data
grid_search_cv.fit(X_train, y_train)
# Print the best estimator
grid_search_cv.best_estimator_

Fitting 3 folds for each of 1350000 candidates, totalling 4050000 fits


DecisionTreeClassifier(max_depth=28, max_features='auto', min_samples_leaf=10,
                       min_samples_split=31, random_state=42)

In [None]:
# Print the tuned parameters and score
print(f'Best Params: {grid_search_cv.best_params_}')
print(f'Best Score: {grid_search_cv.best_score_}')

Best Params: {'criterion': 'gini', 'max_depth': 28, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 31}
Best Score: 0.8272744666602888


In [51]:
# Classifier with best params
grid_tree = DecisionTreeClassifier(max_depth=28, max_features="auto", min_samples_leaf=10, min_samples_split=31, random_state=42)
scores = cross_val_score(grid_tree, X_train, y_train, cv=10, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print("Scores:", scores)

0.83 accuracy with a standard deviation of 0.00
Scores: [0.82892219 0.83785077 0.82477679 0.82764668 0.83067602 0.8309949
 0.82987883 0.82366071 0.82411099 0.83351937]


## Monetäres Maß

In [7]:
# The model quality evaluation function expects a Pandas dataframe with at least the following columns:
# Request_Date          int64
# flight_unique_id     object
# Price               float64
# buy                    bool

def model_quality_evaluation(df):
    # Make a copy of the provided dataframe as to not modify the original.
    df = df.copy()

    # Convert 'Price' to whole cents and store as integers to avoid floating point errors.
    df['Price'] = df['Price'] * 100
    df['Price'] = df['Price'].astype(int)

    # Initialize a variable that stores the sum of all our balances.
    sum_balances = 0

    # Get a list of all 'flight_unique_id'.
    flight_unique_ids = df['flight_unique_id'].unique()

    # Iterate over all 'flight_unique_id'.
    for flight_unique_id in flight_unique_ids:
        # Get a subset of the data for the specified 'flight_unique_id'.
        df_subset = df[df['flight_unique_id'] == flight_unique_id]

        # Get all request dates except for the latest request date before departure.
        # At the latest request date before departure we need to buy a ticket anyway,
        # so we don't care about this specific request date.
        request_dates = df_subset[df_subset['Request_Date'] != df_subset['Request_Date'].max()]

        # Make sure request dates are sorted in descending order.
        request_dates.sort_values(by='Request_Date', ascending=False, inplace=True)

        # Get the ticket price from the latest request date before departure,
        # because we certainly have to buy a ticket at this date.
        last_buying_price = df_subset[df_subset['Request_Date'] == df_subset['Request_Date'].max()]['Price'].values[0]

        # Iterate over the remaining request dates
        for _, row in request_dates.iterrows():
            # and check wether the model wants to buy a ticket at the specific request date.
            if(row['buy'] == 1):
                # If the model decides to buy a ticket the last buying price is set to the
                # price point of this request date and the balance doesn't change.
                last_buying_price = row['Price']
            else:
                # If the models decides to not buy a ticket the balance equals the
                # the current ticket price minus the last buying price.
                current_price = row['Price']
                balance = current_price - last_buying_price

                # The balance is added to the sum of all balances.
                sum_balances = sum_balances + balance

    # Return the sum of all our previously calculated balances.
    return sum_balances / 100

### Ermitteln der optimalen Parameter für das Gütemaß

In [None]:
train_set_or = pd.read_csv("./train_set.csv")
train_set = pd.read_csv("./train_set_ohe.csv")

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
combinations = {}

# Setup the parameters and distributions to sample from
params = {
    "max_depth": [23, 28, 33],
    "min_samples_split": [26, 31, 36],
    "min_samples_leaf": [5, 10, 15],
}

# Iterate over all parameter combinations and execute money evaluation
for param in itertools.product(
    params["max_depth"], params["min_samples_split"], params["min_samples_leaf"]
):
    clf = DecisionTreeClassifier(
        max_depth=param[0],
        min_samples_split=param[1],
        min_samples_leaf=param[2],
        max_features="auto",
        random_state=42,
    )
    money_scores = []
    # for each fold create a dataframe
    for train_index, test_index in skf.split(X, Y):  # split() return index of each fold
        # get each fold train, test fold with index index
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = Y[train_index], Y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)

        X_train_1 = train_set_or.loc[test_index]
        X_train_1 = X_train_1.reset_index(drop=True)
        df = pd.DataFrame()
        df["buy"] = y_pred
        df["flight_unique_id"] = X_train_1["flight_unique_id"]
        df["Request_Date"] = X_train_1["Request_Date"]
        df["Price"] = X_train_1["Price_In_Eur"]
        # eval with custom func and append
        score = model_quality_evaluation(df)
        money_scores.append(score)
    # add list off accuracy to a dict with the combinations as a key
    combinations[",".join(str(x) for x in param)] = money_scores


#### Search for max and min values

In [27]:
first = True
maxMoney = 0 
minMoney = 0
maxParam = ""
allMoney = []

# Finds the parameter combination for the Maximum and Minimum Money 
for key, value in combinations.items():
  allMoney.extend(value)
  if first:
    maxMoney = max(value)
    minMoney = min(value)
    moneyParam = key
    first = False
  elif max(value) > maxMoney:
    maxMoney = max(value)
    maxParam = key
  elif min(value) < minMoney:
    minMoney = min(value)

# Print the output
print("Maximum Money That can be obtained from this model is:", maxMoney)
print("\nMinimum Money:", minMoney)
print("\nOverall Money:", np.mean(allMoney))
print("\nStandard Deviation is:", np.std(allMoney))
print("\nParams for Maximum Money:", maxParam, "(max_depth, min_samples_split, min_samples_leaf)")
print("\nAll combinations:", combinations)
print("\nList of possible accuracy:", allMoney)

Maximum Money That can be obtained from this model is: -173067.23

Minimum Money: -308867.94

Overall Money: -235928.8205185185

Standard Deviation is: 28314.611159008065

Params for Maximum Money: 28,26,5 (max_depth, min_samples_split, min_samples_leaf)

All combinations: {'23,26,5': [-241978.61, -180454.27, -175692.39, -210379.66, -235689.43], '23,26,10': [-257050.31, -215108.39, -234775.74, -283150.33, -244970.67], '23,26,15': [-246460.57, -254982.73, -246725.63, -228909.95, -308867.94], '23,31,5': [-193753.47, -205821.33, -218191.86, -219210.99, -204873.42], '23,31,10': [-258411.24, -219826.25, -273104.98, -237283.09, -226836.83], '23,31,15': [-246006.38, -210502.51, -276836.67, -267032.1, -263305.26], '23,36,5': [-225692.49, -222450.0, -227666.3, -246150.33, -217983.87], '23,36,10': [-263861.91, -261803.02, -226110.98, -239178.88, -283367.75], '23,36,15': [-269615.37, -205177.04, -274408.52, -229478.24, -295464.45], '28,26,5': [-201251.42, -173067.23, -200845.56, -230813.01, -2109

### Final prediction and classification report

In [37]:
# initialize classifier with best parameters for the money score
money_tree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=28,
    min_samples_split=26,
    min_samples_leaf=5,
    max_features="auto",
    random_state=42
)
# Fit to train data
money_tree.fit(X_train, y_train)
y_pred_money = prediction(X_test, money_tree)
# print confusion matrix and classification report
cal_accuracy(y_test, y_pred_money)

# Cross Validation Score
scores = cross_val_score(grid_tree, X_train, y_train, cv=10, n_jobs=-1)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print("Scores:", scores)

Confusion Matrix:  [[15055  1172]
 [ 2242  2437]]
Accuracy :  83.66975987754711
Report :                precision    recall  f1-score   support

           0       0.87      0.93      0.90     16227
           1       0.68      0.52      0.59      4679

    accuracy                           0.84     20906
   macro avg       0.77      0.72      0.74     20906
weighted avg       0.83      0.84      0.83     20906

0.83 accuracy with a standard deviation of 0.00
Scores: [0.82892219 0.83785077 0.82477679 0.82764668 0.83067602 0.8309949
 0.82987883 0.82366071 0.82411099 0.83351937]
