# Data Science Specialization (Spring 2025, RUC)
## Workshop: Decision Trees
## Exercise Part II

## 1. Imports

In [1]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

## 2. Data Preparation

In [2]:
train = pd.read_csv(r'airline_kaggle_train.csv')
test = pd.read_csv(r'airline_kaggle_test.csv')
train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [3]:
train.set_index('id', inplace=True)
test.set_index('id', inplace=True)
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103904 entries, 70172 to 62567
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    103904

In [4]:
train.columns

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [5]:
feature_cols = train.columns.drop(['satisfaction'])
feature_cols

Index(['Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes',
       'Arrival Delay in Minutes'],
      dtype='object')

In [6]:
train.describe()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes
count,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103904.0,103594.0
mean,39.379706,1189.448375,2.729683,3.060296,2.756901,2.976883,3.202129,3.250375,3.439396,3.358158,3.382363,3.351055,3.631833,3.30429,3.640428,3.286351,14.815618,15.178678
std,15.114964,997.147281,1.327829,1.525075,1.398929,1.277621,1.329533,1.349509,1.319088,1.332991,1.288354,1.315605,1.180903,1.265396,1.175663,1.312273,38.230901,38.698682
min,7.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,414.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,2.0,0.0,0.0
50%,40.0,843.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,0.0,0.0
75%,51.0,1743.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,12.0,13.0
max,85.0,4983.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1592.0,1584.0


In [7]:
# Show values for categorical features
for feature in ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']:
    print("'{0}' values: {1}:\r".format(feature, train[feature].unique()))

'Gender' values: ['Male' 'Female']:
'Customer Type' values: ['Loyal Customer' 'disloyal Customer']:
'Type of Travel' values: ['Personal Travel' 'Business travel']:
'Class' values: ['Eco Plus' 'Business' 'Eco']:
'satisfaction' values: ['neutral or dissatisfied' 'satisfied']:


In [8]:
# Convert Gender, Customer Type and Type of Travel to numeric by factorization
for feature in ['Gender', 'Customer Type', 'Type of Travel']:
    train[feature] = pd.factorize(train[feature])[0]
    test[feature] = pd.factorize(test[feature])[0]

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103904 entries, 70172 to 62567
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  int64  
 1   Customer Type                      103904 non-null  int64  
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  int64  
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    103904

In [9]:
# Use one-hot encoding for Class (only remaining object type feature)
train = pd.get_dummies(train, columns=['Class'])
test = pd.get_dummies(test, columns=['Class'])

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103904 entries, 70172 to 62567
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  int64  
 1   Customer Type                      103904 non-null  int64  
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  int64  
 4   Flight Distance                    103904 non-null  int64  
 5   Inflight wifi service              103904 non-null  int64  
 6   Departure/Arrival time convenient  103904 non-null  int64  
 7   Ease of Online booking             103904 non-null  int64  
 8   Gate location                      103904 non-null  int64  
 9   Food and drink                     103904 non-null  int64  
 10  Online boarding                    103904 non-null  int64  
 11  Seat comfort                       103904

In [10]:
# Drop instances with NaN (or should we set them to zero?)
train = train.dropna()
test = test.dropna()

train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103594 entries, 70172 to 62567
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103594 non-null  int64  
 1   Customer Type                      103594 non-null  int64  
 2   Age                                103594 non-null  int64  
 3   Type of Travel                     103594 non-null  int64  
 4   Flight Distance                    103594 non-null  int64  
 5   Inflight wifi service              103594 non-null  int64  
 6   Departure/Arrival time convenient  103594 non-null  int64  
 7   Ease of Online booking             103594 non-null  int64  
 8   Gate location                      103594 non-null  int64  
 9   Food and drink                     103594 non-null  int64  
 10  Online boarding                    103594 non-null  int64  
 11  Seat comfort                       103594

In [11]:
feature_cols = train.columns.drop(['satisfaction'])

X_train = train[feature_cols]
y_train = train.satisfaction

X_test = test[feature_cols]
y_test = test.satisfaction

## 3. Training a Single Decision Tree

In [12]:
dtree = DecisionTreeClassifier(criterion='entropy')
dtree = dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
print("Depth:", dtree.get_depth())
print("Leaves:", dtree.get_n_leaves())
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred, pos_label='satisfied'))
print("Precision:", metrics.precision_score(y_test, y_pred, pos_label='satisfied'))

Depth: 40
Leaves: 3570
Accuracy: 0.5669099756690997
Recall: 0.42296524417069953
Precision: 0.507978442354433


In [20]:
!graphviz --v

'graphviz' is not recognized as an internal or external command,
operable program or batch file.


In [21]:
!where dot

INFO: Could not find files for the given pattern(s).


In [13]:
from sklearn import tree
from sklearn.tree import export_graphviz
import graphviz

export_graphviz(dtree, out_file='airline_dt.dot',
                class_names=['neudis', 'satisfied'],
                feature_names=feature_cols,
                impurity=False,
                filled=True)

! dot -Tpng airline_dt.dot -o airline_dt.png

from IPython import display
display.Image("airline_dt.png")

'dot' is not recognized as an internal or external command,
operable program or batch file.


FileNotFoundError: No such file or directory: 'airline_dt.png'

FileNotFoundError: No such file or directory: 'airline_dt.png'

<IPython.core.display.Image object>

## 4. Vary Metric and Other Parameters

In [14]:
criteria = ['gini', 'entropy']
for c in criteria:
    for md in range(2, 8):
        # Model intializing
        dtree = DecisionTreeClassifier(criterion=c, max_depth=md)
        
        # Model training/fitting
        dtree.fit(X_train, y_train)
               
        # Model validation/test
        y_pred = dtree.predict(X_test)
        
        print("criterion={0}, max_depth={1}:\r".format(c, md))
        print("Accuracy: {}\r\n".format(metrics.accuracy_score(y_test, y_pred)))

criterion=gini, max_depth=2:
Accuracy: 0.5208357471131194

criterion=gini, max_depth=3:
Accuracy: 0.6183138299926622

criterion=gini, max_depth=4:
Accuracy: 0.6432240373846213

criterion=gini, max_depth=5:
Accuracy: 0.6400957787819102

criterion=gini, max_depth=6:
Accuracy: 0.6575908546711466

criterion=gini, max_depth=7:
Accuracy: 0.6387054416251496

criterion=entropy, max_depth=2:
Accuracy: 0.5150040551500406

criterion=entropy, max_depth=3:
Accuracy: 0.618082107133202

criterion=entropy, max_depth=4:
Accuracy: 0.6482060788630132

criterion=entropy, max_depth=5:
Accuracy: 0.6338392615764878

criterion=entropy, max_depth=6:
Accuracy: 0.645077820260302

criterion=entropy, max_depth=7:
Accuracy: 0.6068049279728112



## 5. Ensemble Methods

In [15]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='entropy', max_features=10, n_estimators=100, random_state=0) 
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

print("Accuracy of Random Forest: {}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy of Random Forest: 0.5689954814042405


In [16]:
from sklearn.ensemble import ExtraTreesClassifier

forest2 = ExtraTreesClassifier(criterion='gini', n_estimators=10, max_depth=None,  min_samples_split=2, random_state=0)
forest2.fit(X_train, y_train)
y_pred = forest2.predict(X_test)

print("Accuracy of Extra Trees: {}".format(metrics.accuracy_score(y_test, y_pred)))

Accuracy of Extra Trees: 0.6545784574981656


The accuracy values are not overwhelming. Determine what parameter values the classifiers take, and vary them!