In [144]:
import glob
import os
from datetime import datetime
from math import sqrt
from pathlib import Path
from statistics import mean

import pandas as pd
import pandas_profiling
import seaborn as sns
from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

In [145]:
# Let pandas render up to 1000 rows before truncating displayed output.
pd.options.display.max_rows = 1000

In [146]:
# Location of the processed flight-price CSV. The default is the original
# author's local path; set the FLIGHT_DATA_PATH environment variable to run
# the notebook against a copy stored somewhere else.
DATA_PATH = Path(os.environ.get(
    'FLIGHT_DATA_PATH',
    '/Users/ioneuk/Documents/flight-price-predictor/data/processed_dataset2.csv',
))
# Columns that read_csv should parse into datetime64 values.
DATE_COLUMNS = ['date_time', 'departure_date_time', 'arrival_date_time']
df = pd.read_csv(DATA_PATH, parse_dates=DATE_COLUMNS)

In [147]:
# Check that the three parse_dates columns came back as datetime64 and see
# which of the remaining columns are numeric and which are object (string).
df.dtypes

date_time                datetime64[ns]
departure_city                   object
departure_iata_code              object
arrival_city                     object
destination_iata_code            object
departure_date_time      datetime64[ns]
arrival_date_time        datetime64[ns]
flight_duration                   int64
carrier_name                     object
agent_name                       object
flight_number                     int64
price                           float64
prev_price_1                      int64
prev_price_2                      int64
prev_price_3                      int64
dtype: object

In [148]:
# (rows, columns) of the loaded frame.
df.shape

(7168, 15)

In [149]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,date_time,departure_city,departure_iata_code,arrival_city,destination_iata_code,departure_date_time,arrival_date_time,flight_duration,carrier_name,agent_name,flight_number,price,prev_price_1,prev_price_2,prev_price_3
0,2020-02-03 16:13:33.764254,Barcelona,BCN,athens,ATH,2020-03-14 07:15:00,2020-03-14 11:10:00,175,Vueling Airlines,Vueling Airlines,8100,1381.03,0,0,0
1,2020-02-03 12:58:17.541189,Bucharest,OTP,Vienna,VIE,2020-03-14 16:40:00,2020-03-14 17:20:00,100,Laudamotion,Ryanair,347,1281.57,0,0,0
2,2020-02-03 20:44:16.139427,kopenhagen,CPH,Edinburgh,EDI,2020-03-11 22:05:00,2020-03-11 23:00:00,115,Ryanair,Ryanair,2675,384.56,0,0,0
3,2020-02-03 18:20:37.319330,Brussels,CRL,Venice,TSF,2020-03-11 06:40:00,2020-03-11 08:15:00,95,Ryanair,Ryanair,6033,386.49,0,0,0
4,2020-02-03 18:20:37.319330,Brussels,CRL,Venice,TSF,2020-03-11 17:00:00,2020-03-11 18:35:00,95,Ryanair,Ryanair,6055,275.99,0,0,0


In [150]:
# Column labels available in df.
df.columns

Index(['date_time', 'departure_city', 'departure_iata_code', 'arrival_city',
       'destination_iata_code', 'departure_date_time', 'arrival_date_time',
       'flight_duration', 'carrier_name', 'agent_name', 'flight_number',
       'price', 'prev_price_1', 'prev_price_2', 'prev_price_3'],
      dtype='object')

In [151]:
# Number of train/test folds produced by the cross-validation splitter.
N_SPLITS = 3
# TimeSeriesSplit cuts the frame by row position, so the folds are only
# chronological if the rows already are.
# NOTE(review): the df.head() output shows date_time values that are not in
# ascending order - confirm the frame is sorted by the intended time column
# before relying on these folds.
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

In [152]:
# NOTE(review): repeats the earlier df.dtypes listing; df is not modified in
# between in the cells shown here, so this cell is a candidate for removal.
df.dtypes

date_time                datetime64[ns]
departure_city                   object
departure_iata_code              object
arrival_city                     object
destination_iata_code            object
departure_date_time      datetime64[ns]
arrival_date_time        datetime64[ns]
flight_duration                   int64
carrier_name                     object
agent_name                       object
flight_number                     int64
price                           float64
prev_price_1                      int64
prev_price_2                      int64
prev_price_3                      int64
dtype: object

In [153]:
# Every column label of df, kept as a pandas Index.
# NOTE(review): not referenced again in the cells visible in this section.
all_features = df.columns

In [154]:
# String-valued columns to be treated as categorical; this list is handed to
# the model parameters under the "cat_features" key.
categorical_features = [
    'departure_city',
    'arrival_city',
    'carrier_name',
]

In [155]:
# Feature columns intended as model input.
# NOTE(review): departure_hour, departure_minute, arrival_hour, arrival_minute,
# days_left_to_departure and is_holiday do not appear in the df.columns output
# of this notebook section - presumably they are derived in a later step;
# selecting df[all_columns_to_use] before that would raise a KeyError.
all_columns_to_use = [
    'departure_city',
    'arrival_city',
    'flight_duration',
    'carrier_name',
    'flight_number',
    'departure_hour',
    'departure_minute',
    'arrival_hour',
    'arrival_minute',
    'days_left_to_departure',
    'is_holiday',
    'prev_price_1',
]

In [156]:
def rmse(real, predicted):
    """Return the root mean squared error of ``predicted`` against ``real``.

    Args:
        real: Observed target values.
        predicted: Predicted values, aligned element-wise with ``real``.

    Returns:
        float: Square root of ``mean_squared_error(real, predicted)``.
    """
    squared_error = mean_squared_error(real, predicted)
    return sqrt(squared_error)

In [157]:
# Keyword settings collected for the model.
# NOTE(review): the keys look like CatBoost constructor arguments, but no
# CatBoost import or model is visible in this section - confirm where this
# dict is consumed.
parameters = dict(
    loss_function="RMSE",
    eval_metric="R2",
    iterations=1000,
    learning_rate=0.03,
    random_seed=42,
    od_wait=30,
    od_type="Iter",
    thread_count=10,
    silent=True,
    cat_features=categorical_features,
)

In [158]:
def predict_by_previous(test):
    """Baseline predictor: return the ``prev_price_1`` column as the forecast.

    Args:
        test: Rows to predict for; must support ``test['prev_price_1']``.

    Returns:
        The ``prev_price_1`` values of ``test``, unchanged.
    """
    # presumably prev_price_1 is the most recent earlier price - TODO confirm
    previous_price = test['prev_price_1']
    return previous_price

In [159]:
def fit_transform_average(test):
    """Baseline predictor: row-wise mean of the three ``prev_price_*`` columns.

    Despite the name, nothing is fitted; the result depends only on ``test``.

    Args:
        test: DataFrame containing ``prev_price_1``, ``prev_price_2`` and
            ``prev_price_3``.

    Returns:
        Series with one averaged value per row of ``test``, on the same index.
    """
    # NOTE(review): the df.head() output shows rows whose three prev_price_*
    # values are all 0; if 0 stands for "no earlier observation", those rows
    # drag the average towards zero - confirm how missing history is encoded.
    lag_columns = ['prev_price_1', 'prev_price_2', 'prev_price_3']
    lagged_prices = test.loc[:, lag_columns]
    return lagged_prices.mean(axis=1)

In [160]:
# Baseline 1: use prev_price_1 as the prediction and score it on every
# TimeSeriesSplit test fold. Nothing is fitted, so the training indices that
# the splitter yields are not used here.
cv_rmse = []
cv_mae = []
cv_r2 = []
for idx, (x_train_idx, x_test_idx) in enumerate(tscv.split(df)):
    # split() yields positional row numbers: select the fold once with .iloc
    # and take both the target and the prediction from that same slice.
    # Looking the target up with df.loc[x_test_idx, ...] only matches when df
    # carries the default 0..n-1 index.
    test = df.iloc[x_test_idx]
    actual = test['price']
    prediction = predict_by_previous(test)
    # Arguments follow the (real, predicted) order of rmse's signature.
    current_rmse = rmse(actual, prediction)
    current_mae = mean_absolute_error(actual, prediction)
    current_r2 = r2_score(actual, prediction)
    cv_rmse.append(current_rmse)
    cv_mae.append(current_mae)
    cv_r2.append(current_r2)
    print("RMSE on {}th fold: {}".format(idx, current_rmse))
    print("MAE on {}th fold: {}".format(idx, current_mae))
    print("R2 on {}th fold: {}".format(idx, current_r2))

# Unweighted average of each metric across the folds.
mean_rmse = sum(cv_rmse) / len(cv_rmse)
mean_mae = sum(cv_mae) / len(cv_mae)
mean_r2 = sum(cv_r2) / len(cv_r2)
print("mean rmse: {}".format(mean_rmse))
print("mean mae: {}".format(mean_mae))
print("mean r2: {}".format(mean_r2))

RMSE on 0th fold: 359.53144460757136
MAE on 0th fold: 202.21926339285713
R2 on 0th fold: 0.8245853860035136
RMSE on 1th fold: 364.82659924670094
MAE on 1th fold: 210.8104966517857
R2 on 1th fold: 0.8319544465008157
RMSE on 2th fold: 569.4432012123068
MAE on 2th fold: 319.7889174107143
R2 on 2th fold: 0.6516958735172547
mean rmse: 431.2670816888597
mean mae: 244.27289248511906
mean r2: 0.7694119020071947


In [161]:
# Baseline 2: use the row-wise mean of prev_price_1..3 as the prediction and
# score it on every TimeSeriesSplit test fold.
cv_rmse = []
cv_mae = []
cv_r2 = []
for idx, (x_train_idx, x_test_idx) in enumerate(tscv.split(df)):
    # split() yields positional row numbers, so both halves are taken with
    # .iloc. The baseline fits nothing; train is kept only so the names this
    # cell leaves in the notebook namespace stay the same.
    train, test = df.iloc[x_train_idx], df.iloc[x_test_idx]
    # Read the target from the same positional slice as the features. Looking
    # it up with df.loc[x_test_idx, ...] only matches when df carries the
    # default 0..n-1 index.
    actual = test['price']
    prediction = fit_transform_average(test)
    # Arguments follow the (real, predicted) order of rmse's signature.
    current_rmse = rmse(actual, prediction)
    current_mae = mean_absolute_error(actual, prediction)
    current_r2 = r2_score(actual, prediction)
    cv_rmse.append(current_rmse)
    cv_mae.append(current_mae)
    cv_r2.append(current_r2)
    print("RMSE on {}th fold: {}".format(idx, current_rmse))
    print("MAE on {}th fold: {}".format(idx, current_mae))
    print("R2 on {}th fold: {}".format(idx, current_r2))

# Unweighted average of each metric across the folds.
mean_rmse = sum(cv_rmse) / len(cv_rmse)
mean_mae = sum(cv_mae) / len(cv_mae)
mean_r2 = sum(cv_r2) / len(cv_r2)
print("mean rmse: {}".format(mean_rmse))
print("mean mae: {}".format(mean_mae))
print("mean r2: {}".format(mean_r2))

RMSE on 0th fold: 526.2187545642579
MAE on 0th fold: 325.32567336309523
R2 on 0th fold: 0.624227791387404
RMSE on 1th fold: 398.1634464651398
MAE on 1th fold: 255.7011514136905
R2 on 1th fold: 0.7998402224958447
RMSE on 2th fold: 626.4479563764963
MAE on 2th fold: 386.07187500000003
R2 on 2th fold: 0.5784706788469847
mean rmse: 516.9433858019647
mean mae: 322.36623325892856
mean r2: 0.6675128975767445
