### Import packages

In [9]:
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


#### load Dataset and split into train and test 

In [10]:
# import dataset
# one_hot_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data Mining Gruppe A/train_set_ohe.csv")
one_hot_data = pd.read_csv("./datamining/train_set_ohe.csv")
Y = np.array(one_hot_data["buy"])

one_hot_data = one_hot_data[["Departure_FRA", "Departure_STN", "Departure_SXF", "Destination_FRA", "Destination_STN", "Destination_SXF", "Price_Dev_Cat_Falling", 
                                  "Price_Dev_Cat_Rising", "Price_Dev_Cat_Steady", "Price_In_Eur",
                                  "Price_Dev", "Price_Dev_Three_Days", "Same_Day_Request_route_Flight_price",
                                  "Request_Month", "Request_Time", "Request_Day", "Flight_Day", "Departure_hour",
                                  "Hours_to_Flight", "Request_Count", "Request_Count_Sum", "Last_Request_Bool", "Is_Holiday_UK", "Is_Holiday_GER",
                                  "Is_School_Holiday_BER", "Is_School_Holiday_FRA", "Is_School_Holiday_UK", "buy"]]
X = one_hot_data.drop(
    ["buy"], axis=1
)
X.info()
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, shuffle=True, stratify=Y
)

# verify dataset
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83624 entries, 0 to 83623
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Departure_FRA                        83624 non-null  float64
 1   Departure_STN                        83624 non-null  float64
 2   Departure_SXF                        83624 non-null  float64
 3   Destination_FRA                      83624 non-null  float64
 4   Destination_STN                      83624 non-null  float64
 5   Destination_SXF                      83624 non-null  float64
 6   Price_Dev_Cat_Falling                83624 non-null  float64
 7   Price_Dev_Cat_Rising                 83624 non-null  float64
 8   Price_Dev_Cat_Steady                 83624 non-null  float64
 9   Price_In_Eur                         83624 non-null  float64
 10  Price_Dev                            83624 non-null  float64
 11  Price_Dev_Three_Days        

Unnamed: 0,Departure_FRA,Departure_STN,Departure_SXF,Destination_FRA,Destination_STN,Destination_SXF,Price_Dev_Cat_Falling,Price_Dev_Cat_Rising,Price_Dev_Cat_Steady,Price_In_Eur,...,Departure_hour,Hours_to_Flight,Request_Count,Request_Count_Sum,Last_Request_Bool,Is_Holiday_UK,Is_Holiday_GER,Is_School_Holiday_BER,Is_School_Holiday_FRA,Is_School_Holiday_UK
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,208.07,...,19,56,1.0,4,0,0,0,1,0,0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,259.07,...,19,44,2.0,4,0,0,0,1,0,0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,259.07,...,19,32,3.0,4,0,0,0,1,0,0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,259.07,...,19,20,4.0,4,1,0,0,1,0,0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,143.86,...,21,58,1.0,4,0,0,0,1,0,0


#### Functions

In [3]:
# Function to perform training with giniIndex.
def train_using_gini(X_train, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(max_depth=9,
                                      max_features='auto',
                                      min_samples_leaf=3,
                                      min_samples_split=9,
                                      random_state=42
                                      )
    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini


# Function to perform training with entropy.
def train_using_entropy(X_train, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5
    )
    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy


# Function to make predictions
def prediction(X_test, clf_object):
    # Predicton on test with giniIndex
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred


# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))


#### Predict

In [11]:
# Build models
clf_gini = train_using_gini(X_train, y_train)
clf_entropy = train_using_entropy(X_train, y_train)

# Prediction using gini
print("Results Using Gini Index:")
y_pred_gini = prediction(X_test, clf_gini)
cal_accuracy(y_test, y_pred_gini)

# # Prediction using entropy
# print("Results Using Entropy:")
# y_pred_entropy = prediction(X_test, clf_entropy)
# cal_accuracy(y_test, y_pred_entropy)

Results Using Gini Index:
Predicted values:
[0 0 1 ... 0 1 0]
Confusion Matrix:  [[15664   563]
 [ 3494  1185]]
Accuracy :  80.59408782167799
Report :                precision    recall  f1-score   support

           0       0.82      0.97      0.89     16227
           1       0.68      0.25      0.37      4679

    accuracy                           0.81     20906
   macro avg       0.75      0.61      0.63     20906
weighted avg       0.79      0.81      0.77     20906



#### Tuning

##### RandomizedSearch

In [12]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {
    "criterion": ["gini", "entropy"],
    "max_depth": range(1, 100),
    "min_samples_split": range(1, 100),
    "min_samples_leaf": range(1, 50),
    "max_features": ["auto", "sqrt", "log2"],
}
# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(X, Y)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


Tuned Decision Tree Parameters: {'min_samples_split': 72, 'min_samples_leaf': 49, 'max_features': 'sqrt', 'max_depth': 59, 'criterion': 'gini'}
Best score is 0.6574443995167834


##### GridSearch

In [4]:
params = {
  "criterion":['gini', 'entropy'],
  "max_depth":range(25,100),
  "min_samples_split":range(25,100),
  "min_samples_leaf":range(10,50),
  "max_features": ['auto', 'sqrt', 'log2']
}
grid_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3, n_jobs=-1
)
grid_search_cv.fit(X_train, y_train)
grid_search_cv.best_estimator_
grid_search_cv.best_estimator_

Fitting 3 folds for each of 1350000 candidates, totalling 4050000 fits


DecisionTreeClassifier(max_depth=28, max_features='auto', min_samples_leaf=10,
                       min_samples_split=31, random_state=42)

In [8]:
print(f'Best Params: {grid_search_cv.best_params_}')
print(f'Best Score: {grid_search_cv.best_score_}')


Best Params: {'criterion': 'gini', 'max_depth': 28, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 31}
Best Score: 0.8272744666602888
