## Section 1. clean the train and test dataset 


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 25

In [2]:
train_data = pd.read_csv('data/train.csv', index_col=0)
test_data = pd.read_csv('data/test.csv', index_col=0)

In [3]:
train_data.shape, test_data.shape

((103904, 24), (25976, 24))

In [4]:
train_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied


In [5]:
# drop the id column as this column has nothing to do with satisfaction
train_data.drop(['id'], axis = 1, inplace = True)
test_data.drop(['id'], axis = 1, inplace = True)

In [6]:
# total satisfied and disatisfied value counts
train_data['satisfaction'].value_counts()

neutral or dissatisfied    58879
satisfied                  45025
Name: satisfaction, dtype: int64

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

In [8]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25976 entries, 0 to 25975
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             25976 non-null  object 
 1   Customer Type                      25976 non-null  object 
 2   Age                                25976 non-null  int64  
 3   Type of Travel                     25976 non-null  object 
 4   Class                              25976 non-null  object 
 5   Flight Distance                    25976 non-null  int64  
 6   Inflight wifi service              25976 non-null  int64  
 7   Departure/Arrival time convenient  25976 non-null  int64  
 8   Ease of Online booking             25976 non-null  int64  
 9   Gate location                      25976 non-null  int64  
 10  Food and drink                     25976 non-null  int64  
 11  Online boarding                    25976 non-null  int

In [9]:
# change the attributes' names to snake_case
train_data.rename(columns = {'Gender' : 'gender', 'Customer Type': 'customer_type', 'Age' : 'age', 
                           'Type of Travel': 'travel_type', 'Class': 'class', 
                           'Flight Distance': 'flight_distance', 
                           'Inflight wifi service': 'inflight_wifi',
                           'Departure/Arrival time convenient': 'departure_n_arrival_time_convenient', 
                           'Ease of Online booking' : 'easy_onlinebooking',
                           'Gate location' : 'gate_location', 'Food and drink' : 'food_n_drink', 
                           'Online boarding' : 'online_boarding', 'Seat comfort': 'seat_comfort',
                           'Inflight entertainment' : 'inflight_entertainment', 
                           'On-board service' : 'onboard_service', 
                           'Leg room service' : 'leg_room_service',
                           'Baggage handling' : 'baggage_handling',
                           'Checkin service' : 'checkin_service', 
                           'Inflight service' : 'inflight_service','Cleanliness' : 'cleanliness',
                           'Departure Delay in Minutes': 'departure_delay_minutes', 
                           'Arrival Delay in Minutes' : 'arrival_delay_minutes',
                           'satisfaction' : 'satisfaction'}, inplace= True)
test_data.rename(columns = {'Gender' : 'gender', 'Customer Type': 'customer_type', 'Age' : 'age', 
                           'Type of Travel': 'travel_type', 'Class': 'class', 
                           'Flight Distance': 'flight_distance', 
                           'Inflight wifi service': 'inflight_wifi',
                           'Departure/Arrival time convenient': 'departure_n_arrival_time_convenient', 
                           'Ease of Online booking' : 'easy_onlinebooking',
                           'Gate location' : 'gate_location', 'Food and drink' : 'food_n_drink', 
                           'Online boarding' : 'online_boarding', 'Seat comfort': 'seat_comfort',
                           'Inflight entertainment' : 'inflight_entertainment', 
                           'On-board service' : 'onboard_service', 
                           'Leg room service' : 'leg_room_service',
                           'Baggage handling' : 'baggage_handling',
                           'Checkin service' : 'checkin_service', 
                           'Inflight service' : 'inflight_service','Cleanliness' : 'cleanliness',
                           'Departure Delay in Minutes': 'departure_delay_minutes', 
                           'Arrival Delay in Minutes' : 'arrival_delay_minutes',
                           'satisfaction' : 'satisfaction'}, inplace= True)

In [10]:
train_data.head()

Unnamed: 0,gender,customer_type,age,travel_type,class,flight_distance,inflight_wifi,departure_n_arrival_time_convenient,easy_onlinebooking,gate_location,food_n_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_minutes,arrival_delay_minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,2,2,2,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,4,5,5,3,3,4,4,3,3,3,0,0.0,satisfied


In [11]:
# check if any columns in train_data have nan value
train_data.isna().sum()

gender                                   0
customer_type                            0
age                                      0
travel_type                              0
class                                    0
flight_distance                          0
inflight_wifi                            0
departure_n_arrival_time_convenient      0
easy_onlinebooking                       0
gate_location                            0
food_n_drink                             0
online_boarding                          0
seat_comfort                             0
inflight_entertainment                   0
onboard_service                          0
leg_room_service                         0
baggage_handling                         0
checkin_service                          0
inflight_service                         0
cleanliness                              0
departure_delay_minutes                  0
arrival_delay_minutes                  310
satisfaction                             0
dtype: int6

In [12]:
# notice only column 'arrival_delay_minutes' has 310 nan, so delete these entries
train_data.dropna(axis=0, inplace=True)

In [13]:
# now all entries in train dataset are valid input
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103594 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   gender                               103594 non-null  object 
 1   customer_type                        103594 non-null  object 
 2   age                                  103594 non-null  int64  
 3   travel_type                          103594 non-null  object 
 4   class                                103594 non-null  object 
 5   flight_distance                      103594 non-null  int64  
 6   inflight_wifi                        103594 non-null  int64  
 7   departure_n_arrival_time_convenient  103594 non-null  int64  
 8   easy_onlinebooking                   103594 non-null  int64  
 9   gate_location                        103594 non-null  int64  
 10  food_n_drink                         103594 non-null  int64  
 11  online_boardi

In [14]:
# check if any columns in test_data have nan value
test_data.isna().sum()

gender                                  0
customer_type                           0
age                                     0
travel_type                             0
class                                   0
flight_distance                         0
inflight_wifi                           0
departure_n_arrival_time_convenient     0
easy_onlinebooking                      0
gate_location                           0
food_n_drink                            0
online_boarding                         0
seat_comfort                            0
inflight_entertainment                  0
onboard_service                         0
leg_room_service                        0
baggage_handling                        0
checkin_service                         0
inflight_service                        0
cleanliness                             0
departure_delay_minutes                 0
arrival_delay_minutes                  83
satisfaction                            0
dtype: int64

In [15]:
# notice only column 'arrival_delay_minutes' has 83 nan, so delete these entries
test_data.dropna(axis=0, inplace=True)

In [16]:
# now all entries in test dataset are valid input
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25893 entries, 0 to 25975
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   gender                               25893 non-null  object 
 1   customer_type                        25893 non-null  object 
 2   age                                  25893 non-null  int64  
 3   travel_type                          25893 non-null  object 
 4   class                                25893 non-null  object 
 5   flight_distance                      25893 non-null  int64  
 6   inflight_wifi                        25893 non-null  int64  
 7   departure_n_arrival_time_convenient  25893 non-null  int64  
 8   easy_onlinebooking                   25893 non-null  int64  
 9   gate_location                        25893 non-null  int64  
 10  food_n_drink                         25893 non-null  int64  
 11  online_boarding             

In [17]:
# notice in the original dataset there are feature attributes that contain many distinct numerical values
# for example, departure_delay_minutes has 445 distinct values. Same for age, flight_distance...
train_data.departure_delay_minutes.value_counts()

0      58552
1       2939
2       2266
3       2006
4       1845
       ...  
465        1
933        1
357        1
420        1
447        1
Name: departure_delay_minutes, Length: 445, dtype: int64

In [18]:
# To simplfy our decision tree modeling, we convert the numerical values to categorical values 
# Now we have train_data and test_data with valid entries on all attributes
# We will now handle numerical features and convert them into categorical values

In [19]:
# Test data:
# convert value of attribute departure_delay_minutes from numerical value to categorical value
cut_labels_delay = ['not delayed', 'less than 1hr delayed', '1-3hr delayed', 'more than 3hr delayed']
cut_bins_delay = [-1, 10, 60, 180, np.inf]
train_data['departure_delay_minutes'] = pd.cut(train_data['departure_delay_minutes'], bins=cut_bins_delay, labels=cut_labels_delay)

# convert value of attribute arrival_delay_minutes from numerical value to categorical value
cut_labels_delay = ['not delayed', 'less than 1hr delayed', '1-3hr delayed', 'more than 3hr delayed']
cut_bins = [-1, 10, 60, 180, np.inf]
train_data['arrival_delay_minutes'] = pd.cut(train_data['arrival_delay_minutes'], bins=cut_bins_delay, labels=cut_labels_delay)

# convert value of attribute age from numerical value to categorical value
cut_labels_age = ['children <18', 'young adults 18-35', 'middle-aged adults 36-55', 'older adults > 56']
cut_bins_age = [-1, 18, 36, 56, np.inf]
train_data['age'] = pd.cut(train_data['age'], bins=cut_bins, labels=cut_labels_age)

# convert value of flight distance from numerical value to categorical value
cut_labels_flight_distance = ['short-haul <1500', 'medium-haul 1500-4100', 'long-haul >4100']
cut_bins_flight_distance = [-1, 1500, 4100, np.inf]
train_data['flight_distance'] = pd.cut(train_data['flight_distance'], bins=cut_bins_flight_distance, labels=cut_labels_flight_distance)

In [20]:
# inflight_wifideparture_n_arrival_time_convenient	easy_onlinebooking	gate_location	food_n_drink	online_boarding	seat_comfort	inflight_entertainment	onboard_service	leg_room_service	baggage_handling	checkin_service	inflight_service	cleanliness

In [21]:
# convert customer rating from 1-5 to categorical rating 'poor', 'fair', 'good'
rated_items = train_data.columns[6:-3]
cut_labels_rating = ['poor','fair','good']
cut_bins_rating = [-1, 1, 3, 5]

for item in rated_items:
    train_data[item] = pd.cut(train_data[item], bins=cut_bins_rating, labels=cut_labels_rating)

In [22]:
# now all the attributes in the train_data are categorical
train_data.head()

Unnamed: 0,gender,customer_type,age,travel_type,class,flight_distance,inflight_wifi,departure_n_arrival_time_convenient,easy_onlinebooking,gate_location,food_n_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_minutes,arrival_delay_minutes,satisfaction
0,Male,Loyal Customer,young adults 18-35,Personal Travel,Eco Plus,short-haul <1500,fair,good,fair,poor,good,fair,good,good,good,fair,good,good,good,good,less than 1hr delayed,less than 1hr delayed,neutral or dissatisfied
1,Male,disloyal Customer,young adults 18-35,Business travel,Business,short-haul <1500,fair,fair,fair,fair,poor,fair,poor,poor,poor,good,fair,poor,good,poor,not delayed,not delayed,neutral or dissatisfied
2,Female,Loyal Customer,young adults 18-35,Business travel,Business,short-haul <1500,fair,fair,fair,fair,good,good,good,good,good,fair,good,good,good,good,not delayed,not delayed,satisfied
3,Female,Loyal Customer,young adults 18-35,Business travel,Business,short-haul <1500,fair,good,good,good,fair,fair,fair,fair,fair,good,fair,poor,good,fair,less than 1hr delayed,not delayed,neutral or dissatisfied
4,Male,Loyal Customer,middle-aged adults 36-55,Business travel,Business,short-haul <1500,fair,fair,fair,fair,good,good,good,fair,fair,good,good,fair,fair,fair,not delayed,not delayed,satisfied


In [23]:
# Now we handles the test_data in the same way
# convert value of attribute departure_delay_minutes from numerical value to categorical value
cut_labels_delay = ['not delayed', 'less than 1hr delayed', '1-3hr delayed', 'more than 3hr delayed']
cut_bins_delay = [-1, 10, 60, 180, np.inf]
test_data['departure_delay_minutes'] = pd.cut(test_data['departure_delay_minutes'], bins=cut_bins_delay, labels=cut_labels_delay)

# convert value of attribute arrival_delay_minutes from numerical value to categorical value
cut_labels_delay = ['not delayed', 'less than 1hr delayed', '1-3hr delayed', 'more than 3hr delayed']
cut_bins = [-1, 10, 60, 180, np.inf]
test_data['arrival_delay_minutes'] = pd.cut(test_data['arrival_delay_minutes'], bins=cut_bins_delay, labels=cut_labels_delay)

# convert value of attribute age from numerical value to categorical value
cut_labels_age = ['children <18', 'young adults 18-35', 'middle-aged adults 36-55', 'older adults > 56']
cut_bins_age = [-1, 18, 36, 56, np.inf]
test_data['age'] = pd.cut(test_data['age'], bins=cut_bins, labels=cut_labels_age)

# convert value of flight distance from numerical value to categorical value
cut_labels_flight_distance = ['short-haul <1500', 'medium-haul 1500-4100', 'long-haul >4100']
cut_bins_flight_distance = [-1, 1500, 4100, np.inf]
test_data['flight_distance'] = pd.cut(test_data['flight_distance'], bins=cut_bins_flight_distance, labels=cut_labels_flight_distance)

# convert customer rating from 1-5 to categorical rating 'poor', 'fair', 'good'
rated_items = train_data.columns[6:-3]
cut_labels_rating = ['poor','fair','good']
cut_bins_rating = [-1, 1, 3, 5]

for item in rated_items:
    test_data[item] = pd.cut(test_data[item], bins=cut_bins_rating, labels=cut_labels_rating)

In [24]:
# now all the attributes in the test_data are categorical
test_data.head()

Unnamed: 0,gender,customer_type,age,travel_type,class,flight_distance,inflight_wifi,departure_n_arrival_time_convenient,easy_onlinebooking,gate_location,food_n_drink,online_boarding,seat_comfort,inflight_entertainment,onboard_service,leg_room_service,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_minutes,arrival_delay_minutes,satisfaction
0,Female,Loyal Customer,young adults 18-35,Business travel,Eco,short-haul <1500,good,good,fair,good,fair,good,fair,good,good,good,good,fair,good,good,less than 1hr delayed,less than 1hr delayed,satisfied
1,Female,Loyal Customer,young adults 18-35,Business travel,Business,medium-haul 1500-4100,poor,poor,fair,poor,good,good,good,good,good,good,good,fair,good,good,not delayed,not delayed,satisfied
2,Male,disloyal Customer,young adults 18-35,Business travel,Eco,short-haul <1500,fair,poor,fair,good,fair,fair,fair,fair,good,poor,fair,fair,fair,fair,not delayed,not delayed,neutral or dissatisfied
3,Male,Loyal Customer,young adults 18-35,Business travel,Business,medium-haul 1500-4100,poor,poor,poor,fair,fair,good,good,poor,poor,poor,poor,fair,poor,good,not delayed,not delayed,satisfied
4,Female,Loyal Customer,young adults 18-35,Business travel,Eco,short-haul <1500,fair,fair,good,fair,good,poor,fair,fair,fair,fair,fair,good,fair,good,not delayed,less than 1hr delayed,satisfied


## Section 2: This section builds helper functions to calculate Entropy value and Information Gain and builds decision tree with helper functions

In [25]:
def entropy(feature):
    # get all types of entries of the feature and calculate corresponding counts of each type  
    types, counts = np.unique(feature, return_counts=True)
    # calculate the total number of entries of the feature
    numOfEntries = sum(counts)
    # a list used to store the entropy values  
    entroList = []
    sumOfEntropy = 0
    for i in range(len(types)):
        # calculate the proportion of each value 
        proportion = counts[i] / numOfEntries
        # calculate entropy of the value according to the standard entropy equation
        # sum all entropies up to get entropy of the feature
        sumOfEntropy += np.log2(proportion) * (-proportion)

    return sumOfEntropy

In [26]:
 def informationGain(dataSet, feature, targetFeature):
    # get entropy of the target geature in the given data set
    entropyOfTarget = entropy(dataSet[targetFeature])

    # get all types of entries of the feature and calculate corresponding counts of each type 
    types, counts = np.unique(dataSet[feature], return_counts=True)
    
    # calculate the total number of entries of the feature
    numOfEntries = sum(counts)
    
    sumOfEntropy = 0
    for i in range(len(types)):
        proportion = counts[i]/numOfEntries
        entropyValue = entropy(dataSet[dataSet[feature]==types[i]][targetFeature])
        sumOfEntropy += proportion * entropyValue

    # calculate information gain
    infoGain = entropyOfTarget - sumOfEntropy

    return infoGain

In [27]:
# now we can try calculating the entropy value of the target feature 'satisfaction'
print("Entropy for the satisfaction dataset:")
entropy(train_data.satisfaction)

Entropy for the satisfaction dataset:


0.987161175643557

In [28]:
# now we calculate the Information Gain at the first level of the tree
# the table list the information gain for all feature attributes in descending order
# we can see that online boarding has the most Information Gain, thus best deliminating the class inpurity 
# therefore, online boarding should be the root node at the decision tree

information_gain_list=[]
for column in train_data.columns:
    information_gain_list.append(informationGain(train_data, column, 'satisfaction'))
information_gain_values=pd.DataFrame(information_gain_list, index=train_data.columns, columns=['IG_value'])
information_gain_values.sort_values(by='IG_value', ascending=False)

Unnamed: 0,IG_value
satisfaction,0.987161
online_boarding,0.263112
class,0.192687
travel_type,0.163958
inflight_wifi,0.137935
inflight_entertainment,0.133699
seat_comfort,0.11023
leg_room_service,0.086912
onboard_service,0.077211
flight_distance,0.065369


In [29]:
# Now we build the function decision_tree where we recursively find the attribute with the biggest Information Gain
# and save the tree structured attributes in the dictionary

In [30]:
def decisionTree(dataset, initialDataset, features, targetFeature, majorityClass=None):
    # get all unique classes and counts of the target feature 
    uniqAttrOfTargetFeature, counts = np.unique(dataset[targetFeature], return_counts=True)
    # determine majority class of target feature in the current dataset      
    majorityClass = uniqAttrOfTargetFeature[np.argmax(counts)]
    
    # if target feature only contains one class, return this class
    if len(uniqAttrOfTargetFeature) == 1:
        return uniqAttrOfTargetFeature[0]
    # if there is no feature or no data, return majority class   
    elif len(dataset) == 0 or len(features) == 0:
        return majorityClass    
    # if none of the above are true, build a new branch of the tree:
    else:
        # calculate information gains for each feature
        #choose the feature with the highest gain and use it as spliter
        infogainList = [informationGain(dataset, feature, targetFeature) for feature in features]
        chosenFeature = features[np.argmax(infogainList)]

        # create the initial tree structure 
        tree = {chosenFeature: {}}

        # remove the chosen feature from all features
        # the chosen feature will become the parent node of other features
        features = [item for item in features if item != chosenFeature]
        
        # create nodes under parent node
        parentAttributeValues = np.unique(dataset[chosenFeature])
        for value in parentAttributeValues:
            subDataset = dataset[dataset[chosenFeature] == value]

            # recursively build tree
            subtree = decisionTree(subDataset, initialDataset, features, targetFeature, majorityClass)

            # add subtree to parent tree
            tree[chosenFeature][value] = subtree

        return tree

In [31]:
feature_attribute_names = train_data.columns[:-1]

In [32]:
# Save the tree structure in the varible to be used for predicting test dataset
tree = decisionTree(train_data, train_data, feature_attribute_names,'satisfaction', None)

In [33]:
# root node
tree.keys()

dict_keys(['online_boarding'])

In [34]:
# second level node
tree['online_boarding']['poor'].keys(), tree['online_boarding']['fair'].keys(), tree['online_boarding']['good'].keys()

(dict_keys(['inflight_wifi']),
 dict_keys(['inflight_wifi']),
 dict_keys(['travel_type']))

In [35]:
tree['online_boarding']['poor']['inflight_wifi']['poor'].keys(), tree['online_boarding']['poor']['inflight_wifi']['fair'].keys(),tree['online_boarding']['poor']['inflight_wifi']['good'].keys()

(dict_keys(['class']), dict_keys(['class']), dict_keys(['travel_type']))

In [36]:
tree['online_boarding']['poor']['inflight_wifi']['poor'].keys(), tree['online_boarding']['poor']['inflight_wifi']['fair'].keys(),tree['online_boarding']['poor']['inflight_wifi']['good'].keys()

(dict_keys(['class']), dict_keys(['class']), dict_keys(['travel_type']))

In [37]:
tree['online_boarding']['poor']['inflight_wifi']['poor'].keys(), tree['online_boarding']['poor']['inflight_wifi']['fair'].keys(),tree['online_boarding']['poor']['inflight_wifi']['good'].keys()

(dict_keys(['class']), dict_keys(['class']), dict_keys(['travel_type']))

In [38]:
tree

{'online_boarding': {'fair': {'inflight_wifi': {'fair': {'travel_type': {'Business travel': {'customer_type': {'Loyal Customer': {'inflight_entertainment': {'fair': {'baggage_handling': {'fair': {'inflight_service': {'fair': {'gate_location': {'fair': {'seat_comfort': {'fair': {'checkin_service': {'fair': {'onboard_service': {'fair': {'gender': {'Female': {'arrival_delay_minutes': {'1-3hr delayed': {'easy_onlinebooking': {'fair': 'neutral or dissatisfied',
                            'good': {'class': {'Business': 'neutral or dissatisfied',
                              'Eco': 'satisfied'}}}},
                          'less than 1hr delayed': 'neutral or dissatisfied',
                          'more than 3hr delayed': 'neutral or dissatisfied',
                          'not delayed': {'food_n_drink': {'fair': {'cleanliness': {'fair': {'easy_onlinebooking': {'fair': {'age': {'children <18': 'neutral or dissatisfied',
                                  'middle-aged adults 36-55': {'leg

## Section 3. make prediction. Build predict function 

In [39]:
#prediction making function

#functinon to predict for a single entry
def makePrediction(dataDictionary, tree, default='satisfied'):
    # traverse every feature in dataDictionary - dictionary converted from dataframe
    for feature in list(dataDictionary.keys()):
      # check if tree also has the feature
      if feature in list(tree.keys()):
        try:
            res = tree[feature][dataDictionary[feature]]
        except:
            return default

        res = tree[feature][dataDictionary[feature]]

        # if result is a dictionary, pass it into makePrediction funciton to find the best reult
        if isinstance(res, dict):
            return makePrediction(dataDictionary, res)
        else:
            return res
    
def predict(testDatset):
    # convert test data set into a dictionary
    # since the structure of tree is also a dictionary 
    testDataDic = testDatset.to_dict(orient='records')
    predictions = []

    # make a prediction for all test data sets
    for testData in testDataDic:
        predictions.append(makePrediction(testData, tree, 1.0))

    return predictions

In [40]:
test_data_without_target=test_data.drop(['satisfaction'], axis=1)
test_data_label = test_data['satisfaction']

In [41]:
list(test_data_label)

['satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neut

In [42]:
actual_result = list(test_data_label)

In [43]:
predict(test_data_without_target)

['satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'satisfied',
 'neutral or dissatisfied',
 'satisfied',
 'neutral or dissati

In [44]:
predictied_result = predict(test_data_without_target)

## Section 4. evaluate prediction result

In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(actual_result, predictied_result)

array([[12943,  1585],
       [ 1368,  9997]])

In [46]:
predictied_result.count('neutral or dissatisfied'), predictied_result.count('satisfied')

(14311, 11582)

In [47]:
actual_result.count('neutral or dissatisfied'), actual_result.count('satisfied')

(14528, 11365)

In [48]:
result = pd.DataFrame(confusion_matrix(actual_result, predictied_result), columns = [['predicted','predicted'],['neutral or dissatisfied','satisfied']],
             index=[['actual','actual'], ['neutral or dissatisfied', 'satisfied']])
result

# Here is the confusion matrix for our prediction

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,neutral or dissatisfied,satisfied
actual,neutral or dissatisfied,12943,1585
actual,satisfied,1368,9997


In [49]:
true_positive = result.loc[('actual','satisfied'),('predicted','satisfied')]
false_positive = result.loc[('actual','neutral or dissatisfied'),('predicted','satisfied')]
true_negative = result.loc[('actual','neutral or dissatisfied'),('predicted','neutral or dissatisfied')]
false_negative = result.loc[('actual','satisfied'),('predicted','neutral or dissatisfied')]

In [50]:
# now we evaluate the accuracy of our prediction
print('precision score:',true_positive/(true_positive + false_positive))
print('recall score:',true_positive/(true_positive + false_negative))

precision score: 0.8631497150751165
recall score: 0.8796304443466784
