In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import matplotlib as plt
import numpy as np
flightVN = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/flight_fare/flight_data_cleaned.csv")

In [4]:
flightVN.head()

Unnamed: 0,price,airline,fare_class,depart_time,depart_date,arrival_time,arrival_date,src,dest,cabin_luggage,hand_luggage,urgent
0,2826000,VietnamAirlines,economy,morning,week_day,morning,week_day,CXR,HAN,23,12,yes
1,2189000,VietJetAir,economy,morning,week_day,morning,week_day,CXR,HAN,0,7,yes
2,1648000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes
3,1854000,VietJetAir,economy,night,week_day,night,week_day,CXR,HAN,0,7,yes
4,1875000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes


# XGBoost, AdaBoost, Random Forest, Decision Tree (Machine Learning)

## 1 Airline
## 2 Taxes
## 3 Source - Destination (planned to convert into distance in km)
## 4 Departure time ( morning, afternoon, evening, night )
## 5 Arrival time ( morning, noon, afternoon, evening, night )
## 6 Flight duration (hour)
## 7 Type of flight ticket (Economy or Business)
## 8 Days on week ( week day, weekend )

In [5]:
flight_VN = flightVN.copy()

In [6]:
flight_VN.head()

Unnamed: 0,price,airline,fare_class,depart_time,depart_date,arrival_time,arrival_date,src,dest,cabin_luggage,hand_luggage,urgent
0,2826000,VietnamAirlines,economy,morning,week_day,morning,week_day,CXR,HAN,23,12,yes
1,2189000,VietJetAir,economy,morning,week_day,morning,week_day,CXR,HAN,0,7,yes
2,1648000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes
3,1854000,VietJetAir,economy,night,week_day,night,week_day,CXR,HAN,0,7,yes
4,1875000,VietnamAirlines,economy,night,week_day,night,week_day,CXR,HAN,23,12,yes


### Categorize airlines

In [7]:
airline = pd.get_dummies(flight_VN['airline'],dtype='int')
for airlines in airline.columns:
    flight_VN[airlines] = airline[airlines]
flight_VN = flight_VN.drop(['airline'],axis=1)

### Categorize ticket

In [8]:
flight_VN['fare_class'] = flight_VN['fare_class'].replace('economy',0)
flight_VN['fare_class'] = flight_VN['fare_class'].replace('business',1)

### Categorize departure time and arrival time

In [9]:
time = pd.get_dummies(flight_VN['depart_time'],dtype='int')
for times in time.columns:
    flight_VN[f'd_{times}'] = time[times]

In [10]:
time = pd.get_dummies(flight_VN['arrival_time'],dtype='int')
for times in time.columns:
    flight_VN[f'a_{times}'] = time[times]

In [11]:
flight_VN = flight_VN.drop(['depart_time','arrival_time'],axis=1)

### Categorize depart date and arrival date

In [12]:
flight_VN['depart_date'] = flight_VN['depart_date'].replace('week_day',0)
flight_VN['depart_date'] = flight_VN['depart_date'].replace('weekend',1)
flight_VN['arrival_date'] = flight_VN['arrival_date'].replace('weekend',1)
flight_VN['arrival_date'] = flight_VN['arrival_date'].replace('week_day',0)

### categorize source and destination

In [13]:
source = pd.get_dummies(flight_VN['src'],dtype='int')
for sources in source.columns:
    flight_VN[f'from_{sources}'] = source[sources]

In [14]:
destination = pd.get_dummies(flight_VN['dest'],dtype='int')
for destinations in destination.columns:
    flight_VN[f'to_{destinations}'] = destination[destinations]

In [15]:
flight_VN = flight_VN.drop(['src','dest'],axis=1)

### categorize urgent feature

In [16]:
flight_VN['urgent'] = flight_VN['urgent'].replace('yes',1)
flight_VN['urgent'] = flight_VN['urgent'].replace('no',0)

In [17]:
flight_VN.head()

Unnamed: 0,price,fare_class,depart_date,arrival_date,cabin_luggage,hand_luggage,urgent,BambooAirways,VietJetAir,VietnamAirlines,...,from_CXR,from_DAD,from_HAN,from_PQC,from_SGN,to_CXR,to_DAD,to_HAN,to_PQC,to_SGN
0,2826000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,2189000,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1648000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1854000,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,1875000,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [18]:
flight_VN.columns

Index(['price', 'fare_class', 'depart_date', 'arrival_date', 'cabin_luggage',
       'hand_luggage', 'urgent', 'BambooAirways', 'VietJetAir',
       'VietnamAirlines', 'd_afternoon', 'd_evening', 'd_morning', 'd_night',
       'a_afternoon', 'a_evening', 'a_morning', 'a_night', 'from_CXR',
       'from_DAD', 'from_HAN', 'from_PQC', 'from_SGN', 'to_CXR', 'to_DAD',
       'to_HAN', 'to_PQC', 'to_SGN'],
      dtype='object')

In [19]:
per25 = flight_VN['price'].quantile(0.25)
per75 = flight_VN['price'].quantile(0.75)
IQR = per75 - per25
upper = per75 + 1.5*IQR
lower = per25 - 1.5*IQR
flight_VN['price'] = np.where(flight_VN['price'] > upper,upper, flight_VN['price'])
flight_VN['price'] =  np.where(flight_VN['price'] < lower, lower, flight_VN['price'])
flight_VN.head()

Unnamed: 0,price,fare_class,depart_date,arrival_date,cabin_luggage,hand_luggage,urgent,BambooAirways,VietJetAir,VietnamAirlines,...,from_CXR,from_DAD,from_HAN,from_PQC,from_SGN,to_CXR,to_DAD,to_HAN,to_PQC,to_SGN
0,2826000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,2189000.0,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1648000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
3,1854000.0,0,0,0,0,7,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,1875000.0,0,0,0,23,12,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0


# Model

 ### R-squared (R2), Mean Absolute Error (MAE) and Mean Squared Error (MSE), Accuracy

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [52]:
X_features = []
Y_features = []

for columns in flight_VN.columns:
    if columns == 'price':
        Y_features.append(columns)
    else:
        X_features.append(columns)

X = flight_VN[X_features]
Y = flight_VN[Y_features[0]]

In [53]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.33, random_state = 42)

In [54]:
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

## XGBoost

In [55]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgboost_param = {
    'learning_rate':[0.1]
}

xgboost = GridSearchCV(XGBRegressor(),xgboost_param)
xgboost.fit(X_train, Y_train)


In [56]:
xgboost.best_params_

{'learning_rate': 0.1}

In [57]:
xgboost_prediction = xgboost.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(xgboost_prediction, Y_test)))
print("R-square: " + str(r2_score(xgboost_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(xgboost_prediction, Y_test)))

Mean Absolute Error: 514178.5045535917
R-square: 0.6177180467133168
Mean squared Error: 515361779812.0311


## AdaBoost

In [58]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_param = {
    'n_estimators':[50],
    'learning_rate':[0.2],
                      }

adaboost = GridSearchCV(AdaBoostClassifier(), adaboost_param)
adaboost.fit(X_train,Y_train)



In [59]:
adaboost_prediction = adaboost.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(adaboost_prediction, Y_test)))
print("R-square: " + str(r2_score(adaboost_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(adaboost_prediction, Y_test)))
print("AdaBoost Accuracy:", accuracy_score(Y_test, adaboost_prediction))

Mean Absolute Error: 1068833.2954459346
R-square: -2.954167480313793
Mean squared Error: 2146100709782.5264
AdaBoost Accuracy: 0.06272258846707585


## Random Forest

In [63]:
from sklearn.ensemble import RandomForestClassifier
rfc_param = {
    "n_estimators":[400],
    "random_state":[0]
}
rfc = GridSearchCV(RandomForestClassifier(),rfc_param)
rfc.fit(X_train, Y_train)



In [64]:
rf_prediction = rfc.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(rf_prediction, Y_test)))
print("R-square: " + str(r2_score(rf_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(rf_prediction, Y_test)))
print("Accuracy:", accuracy_score(Y_test, rf_prediction))

Mean Absolute Error: 507163.02947639616
R-square: 0.6390973855700627
Mean squared Error: 737842915662.6506
Accuracy: 0.262446010456922


### Decision Tree

In [65]:
from sklearn.tree import DecisionTreeClassifier
clf_param ={
    "criterion" : ['entropy']
}
clf_en = GridSearchCV(DecisionTreeClassifier(),clf_param)
clf_en.fit(X_train, Y_train)



In [66]:
clf_en_prediction = clf_en.predict(X_test)
print("Mean Absolute Error: " + str(mean_absolute_error(clf_en_prediction, Y_test)))
print("R-square: " + str(r2_score(clf_en_prediction, Y_test)))
print("Mean squared Error: " + str(mean_squared_error(clf_en_prediction, Y_test)))
print("Accuracy:", accuracy_score(Y_test, clf_en_prediction))

Mean Absolute Error: 508803.93271198
R-square: 0.6385947318335347
Mean squared Error: 742342528642.8734
Accuracy: 0.2618587557778283
