In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
flightVN = pd.read_csv("flight_data_cleaned.csv")

In [2]:
flightVN.head()

Unnamed: 0,price,airline,fare_class,depart_time,depart_date,arrival_time,arrival_date,src,dest,cabin_luggage,hand_luggage,urgent
0,2826000,VietnamAirlines,economy,morning,week_day,morning,week_day,CXR,HAN,23,12,yes
1,2189000,VietJetAir,economy,morning,week_day,morning,week_day,CXR,HAN,0,7,yes
2,1648000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes
3,1854000,VietJetAir,economy,night,week_day,night,week_day,CXR,HAN,0,7,yes
4,1875000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes


# XGBoost, AdaBoost, Random Forest, Decision Tree (Machine Learning)

## 1 Airline
## 2 Taxes
## 3 Source - Destination (planned to convert into distance in km)
## 4 Departure time ( morning, afternoon, evening, night )
## 5 Arrival time ( morning, noon, afternoon, evening, night )
## 6 Flight duration (hour)
## 7 Type of flight ticket (Economy or Business)
## 8 Days on week ( week day, weekend )

In [3]:
flight_VN = flightVN.copy()

In [4]:
flight_VN.head()

Unnamed: 0,price,airline,fare_class,depart_time,depart_date,arrival_time,arrival_date,src,dest,cabin_luggage,hand_luggage,urgent
0,2826000,VietnamAirlines,economy,morning,week_day,morning,week_day,CXR,HAN,23,12,yes
1,2189000,VietJetAir,economy,morning,week_day,morning,week_day,CXR,HAN,0,7,yes
2,1648000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes
3,1854000,VietJetAir,economy,night,week_day,night,week_day,CXR,HAN,0,7,yes
4,1875000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes


### Categorize airlines

In [5]:
airline = pd.get_dummies(flight_VN['airline'],dtype='int')
for airlines in airline.columns:
    flight_VN[airlines] = airline[airlines]
flight_VN = flight_VN.drop(['airline'],axis=1)

### Categorize ticket

In [6]:
flight_VN['fare_class'] = flight_VN['fare_class'].replace('economy',0)
flight_VN['fare_class'] = flight_VN['fare_class'].replace('business',1)

  flight_VN['fare_class'] = flight_VN['fare_class'].replace('business',1)


### Categorize departure time and arrival time

In [7]:
time = pd.get_dummies(flight_VN['depart_time'],dtype='int')
for times in time.columns:
    flight_VN[f'd_{times}'] = time[times]

In [8]:
time = pd.get_dummies(flight_VN['arrival_time'],dtype='int')
for times in time.columns:
    flight_VN[f'a_{times}'] = time[times]

In [9]:
flight_VN = flight_VN.drop(['depart_time','arrival_time'],axis=1)

### Categorize depart date and arrival date

In [10]:
flight_VN['depart_date'] = flight_VN['depart_date'].replace('week_day',0)
flight_VN['depart_date'] = flight_VN['depart_date'].replace('weekend',1)
flight_VN['arrival_date'] = flight_VN['arrival_date'].replace('weekend',1)
flight_VN['arrival_date'] = flight_VN['arrival_date'].replace('week_day',0)

  flight_VN['depart_date'] = flight_VN['depart_date'].replace('weekend',1)
  flight_VN['arrival_date'] = flight_VN['arrival_date'].replace('week_day',0)


### categorize source and destination

In [11]:
source = pd.get_dummies(flight_VN['src'],dtype='int')
for sources in source.columns:
    flight_VN[f'from_{sources}'] = source[sources]

In [12]:
destination = pd.get_dummies(flight_VN['dest'],dtype='int')
for destinations in destination.columns:
    flight_VN[f'to_{destinations}'] = destination[destinations]

In [13]:
flight_VN = flight_VN.drop(['src','dest'],axis=1)

### categorize urgent feature

In [14]:
flight_VN['urgent'] = flight_VN['urgent'].replace('yes',1)
flight_VN['urgent'] = flight_VN['urgent'].replace('no',0)

  flight_VN['urgent'] = flight_VN['urgent'].replace('no',0)


In [15]:
flight_VN.head()

Unnamed: 0,price,fare_class,depart_date,arrival_date,cabin_luggage,hand_luggage,urgent,BambooAirways,VietJetAir,VietnamAirlines,...,from_CXR,from_DAD,from_HAN,from_PQC,from_SGN,to_CXR,to_DAD,to_HAN,to_PQC,to_SGN
0,2826000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,2189000,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1648000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1854000,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,1875000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [16]:
flight_VN.columns

Index(['price', 'fare_class', 'depart_date', 'arrival_date', 'cabin_luggage',
       'hand_luggage', 'urgent', 'BambooAirways', 'VietJetAir',
       'VietnamAirlines', 'd_afternoon', 'd_evening', 'd_morning', 'd_night',
       'a_afternoon', 'a_evening', 'a_morning', 'a_night', 'from_CXR',
       'from_DAD', 'from_HAN', 'from_PQC', 'from_SGN', 'to_CXR', 'to_DAD',
       'to_HAN', 'to_PQC', 'to_SGN'],
      dtype='object')

In [17]:
per25 = flight_VN['price'].quantile(0.25)
per75 = flight_VN['price'].quantile(0.75)
IQR = per75 - per25
upper = per75 + 1.5*IQR
lower = per25 - 1.5*IQR
flight_VN['price'] = np.where(flight_VN['price'] > upper,upper, flight_VN['price'])
flight_VN['price'] =  np.where(flight_VN['price'] < lower, lower, flight_VN['price'])
flight_VN.head()

Unnamed: 0,price,fare_class,depart_date,arrival_date,cabin_luggage,hand_luggage,urgent,BambooAirways,VietJetAir,VietnamAirlines,...,from_CXR,from_DAD,from_HAN,from_PQC,from_SGN,to_CXR,to_DAD,to_HAN,to_PQC,to_SGN
0,2826000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,2189000.0,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1648000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1854000.0,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,1875000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


# Model

 ### R-squared (R2), Mean Absolute Error (MAE) and Mean Squared Error (MSE)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [19]:
X_features = []
Y_features = []

for columns in flight_VN.columns:
    if columns == 'price':
        Y_features.append(columns)
    else:
        X_features.append(columns)

X = flight_VN[X_features]
Y = flight_VN[Y_features[0]]

In [20]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.33, random_state = 42)

In [21]:
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

## Decision Tree

In [22]:
from sklearn.tree import DecisionTreeRegressor
clf_param ={
    "criterion" : ['squared_error'],
    "max_depth":[i for i in range(2,100,10)],
    "max_features":[1.0], #auto
    "min_samples_split":[i for i in range(2,100,20)]
}
clf_en = GridSearchCV(DecisionTreeRegressor(),clf_param)
clf_en.fit(X_train, Y_train)

In [23]:
clf_en.best_estimator_ 

In [24]:
clf_en_prediction = clf_en.best_estimator_.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(clf_en_prediction, Y_test)))
print("R-square: " + str(r2_score(clf_en_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(clf_en_prediction, Y_test)))

Mean Absolute Error: 430638.76793455624
R-square: 0.7375391575137242
Mean squared Error: 404241022088.2922


## Random Forest

In [29]:
from sklearn.ensemble import RandomForestRegressor
n_estimators = [i for i in range(10,100,10)]
min_sample_split = [i for i in range(2,100,20)]
rfc_param ={
    "n_estimators": n_estimators,
    "criterion" : ['squared_error'],
    "max_features":[1.0], #auto,  
}

rfc = GridSearchCV(RandomForestRegressor(), rfc_param)
rfc.fit(X_train, Y_train)

In [30]:
rfc.best_estimator_

In [31]:
rf_prediction = rfc.best_estimator_.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(rf_prediction, Y_test)))
print("R-square: " + str(r2_score(rf_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(rf_prediction, Y_test)))

Mean Absolute Error: 430474.4118906996
R-square: 0.7372431352596833
Mean squared Error: 404880585342.2036


## AdaBoost

In [32]:
from sklearn.ensemble import AdaBoostRegressor
adaboost_param = {
    "n_estimators":[i for i in range(10,100,10)],
    'loss':['linear','square', 'exponential']
}

adaboost = GridSearchCV(AdaBoostRegressor(), adaboost_param)
adaboost.fit(X_train,Y_train)

In [33]:
adaboost.best_estimator_

In [34]:
adaboost_prediction = adaboost.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(adaboost_prediction, Y_test)))
print("R-square: " + str(r2_score(adaboost_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(adaboost_prediction, Y_test)))

Mean Absolute Error: 709609.418509077
R-square: 0.24201506122898475
Mean squared Error: 770805615262.3269


## XGBoost

In [35]:
from xgboost import XGBRegressor


xgboost_param = {
    "max_depth":[1,2,3,4,5,6,7,8,9,10],
    "subsample":[1.0], #use all sample
    "colsample_bytree":[1.0] #use all features
}

xgboost = GridSearchCV(XGBRegressor(),xgboost_param)
xgboost.fit(X_train, Y_train)


In [36]:
xgboost.best_estimator_

In [37]:
xgboost_prediction = xgboost.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(xgboost_prediction, Y_test)))
print("R-square: " + str(r2_score(xgboost_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(xgboost_prediction, Y_test)))

Mean Absolute Error: 511089.15943490947
R-square: 0.6476012274343814
Mean squared Error: 520494111697.3286


In [38]:
import joblib
joblib.dump(xgboost.best_estimator_, 'xgboost.pkl')
joblib.dump(adaboost.best_estimator_, 'adaboost.pkl')
joblib.dump(rfc.best_estimator_, 'random_forest.pkl')
joblib.dump(clf_en.best_estimator_, 'decision_tree.pkl')

['decision_tree.pkl']

In [39]:
import joblib
adaboost=joblib.load('adaboost.pkl')
decision_tree=joblib.load('decision_tree.pkl')
random_forest=joblib.load('random_forest.pkl')
xgboost=joblib.load('xgboost.pkl')