In [5]:
import pandas as pd
import glob
from pathlib import Path
from matplotlib import pyplot
import holidays
from datetime import datetime

In [6]:
# Allow up to 1000 rows to be shown before pandas truncates a displayed frame/series.
pd.options.display.max_rows = 1000

In [7]:
# Collect every CSV below the sky-scanner directory (recursively).
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort them so the
# row order of the concatenated frame is identical on every run; a later keep-first
# drop_duplicates that does not key on `price` would otherwise retain different rows.
all_filenames = sorted(
    glob.glob('/Users/ioneuk/Documents/flight-price-predictor/data/sky-scanner/**/*.csv', recursive=True)
)

In [8]:
# Read each collected CSV and stack the frames row-wise into one table.
aggregate_path = "/Users/ioneuk/Documents/flight-price-predictor/data/aggregate.csv"
combined_csv = pd.concat(list(map(pd.read_csv, all_filenames)))
# Persist the stacked table without the index column, encoded as UTF-8 with BOM.
combined_csv.to_csv(aggregate_path, index=False, encoding='utf-8-sig')

In [9]:
# Reload the aggregated CSV, asking pandas to parse the three timestamp columns as dates.
timestamp_columns = ['date_time', 'departure_date_time', 'arrival_date_time']
df = pd.read_csv(
    '/Users/ioneuk/Documents/flight-price-predictor/data/aggregate.csv',
    parse_dates=timestamp_columns,
)

In [10]:
# Convert each timestamp column once, then derive the calendar-date and clock-part columns from it.
measured_at = pd.to_datetime(df['date_time'])
departs_at = pd.to_datetime(df['departure_date_time'])
arrives_at = pd.to_datetime(df['arrival_date_time'])

# Date columns: take the date part and convert it back to a datetime column.
df['measure_date'] = pd.to_datetime(measured_at.dt.date)

df['departure_date'] = pd.to_datetime(departs_at.dt.date)
df['departure_hour'] = departs_at.dt.hour
df['departure_minute'] = departs_at.dt.minute

df['arrival_date'] = pd.to_datetime(arrives_at.dt.date)
df['arrival_hour'] = arrives_at.dt.hour
df['arrival_minute'] = arrives_at.dt.minute

In [11]:
# Preview the first five rows, including the derived date/hour/minute columns.
df.head(n=5)

Unnamed: 0,date_time,departure_city,departure_iata_code,arrival_city,destination_iata_code,departure_date_time,arrival_date_time,flight_duration,carrier_name,agent_name,flight_number,price,measure_date,departure_date,departure_hour,departure_minute,arrival_date,arrival_hour,arrival_minute
0,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,CDG,2020-05-01 12:45:00,2020-05-01 15:15:00,150,Air France,Air France,1661,2515.17,2020-03-07,2020-05-01,12,45,2020-05-01,15,15
1,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,CDG,2020-05-19 12:45:00,2020-05-19 15:15:00,150,Air France,Air France,1661,2515.17,2020-03-07,2020-05-19,12,45,2020-05-19,15,15
2,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,BCN,2020-05-19 22:00:00,2020-05-20 00:35:00,155,Vueling Airlines,Vueling Airlines,1287,2719.4,2020-03-07,2020-05-19,22,0,2020-05-20,0,35
3,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,CDG,2020-05-19 12:45:00,2020-05-19 15:15:00,150,Air France,Air France,1661,3519.03,2020-03-07,2020-05-19,12,45,2020-05-19,15,15
4,2020-03-07 20:03:34.275655,Belgrade,BEG,Edinburgh,CDG,2020-03-29 12:45:00,2020-03-29 15:15:00,150,Air France,Air France,1661,3780.8,2020-03-07,2020-03-29,12,45,2020-03-29,15,15


In [12]:
# Show the dtype of every column (checks that the timestamp columns were parsed as datetimes).
df.dtypes

date_time                datetime64[ns]
departure_city                   object
departure_iata_code              object
arrival_city                     object
destination_iata_code            object
departure_date_time      datetime64[ns]
arrival_date_time        datetime64[ns]
flight_duration                   int64
carrier_name                     object
agent_name                       object
flight_number                     int64
price                           float64
measure_date             datetime64[ns]
departure_date           datetime64[ns]
departure_hour                    int64
departure_minute                  int64
arrival_date             datetime64[ns]
arrival_hour                      int64
arrival_minute                    int64
dtype: object

In [13]:
# Rows that agree on all of these columns count as duplicates; only the first one is kept
# (drop_duplicates default). `price` and `agent_name` are not part of the key.
dedup_keys = [
    'departure_city', 'arrival_city', 'flight_duration', 'carrier_name', 'flight_number',
    'measure_date',
    'departure_date', 'departure_hour', 'departure_minute',
    'arrival_date', 'arrival_hour', 'arrival_minute',
]
df.drop_duplicates(subset=dedup_keys, inplace=True)

In [14]:
# (rows, columns) of df after the duplicate rows were dropped.
df.shape

(478264, 19)

In [15]:
# Keep only the groups (same departure/arrival date, cities, flight number and carrier)
# that contain more than 8 rows; rows are taken from df ordered by measure_date.
flight_keys = ['departure_date', 'arrival_date', 'departure_city', 'arrival_city', 'flight_number', 'carrier_name']
new_df = (
    df.sort_values('measure_date')
      .groupby(flight_keys)
      .filter(lambda rows: len(rows) > 8)
)

In [16]:
# Order the filtered rows by measurement date, modifying new_df in place.
new_df.sort_values(by='measure_date', inplace=True)

In [17]:
# (rows, columns) of new_df after keeping only groups with more than 8 rows.
new_df.shape

(8258, 19)

In [18]:
# Lagged-price features: prev_price_k holds the price found k rows earlier (in
# measure_date order) within the same group of departure/arrival date, cities,
# flight number and carrier.
lag_keys = ['departure_date', 'arrival_date', 'departure_city', 'arrival_city', 'flight_number', 'carrier_name']

# Build the sorted, grouped price series once instead of re-sorting and re-grouping
# the whole frame for every lag; shift() results realign to new_df by index label.
prices_by_flight = new_df.sort_values('measure_date').groupby(lag_keys)['price']

for lag in (1, 2, 3):
    # Rows with fewer than `lag` earlier rows in their group get 0 as a placeholder;
    # astype(int) then drops the fractional part of the price.
    new_df['prev_price_{}'.format(lag)] = prices_by_flight.shift(lag).fillna(0).astype(int)

In [19]:
# Order new_df by measurement date once more, in place, before the helper columns are removed.
new_df.sort_values(by='measure_date', inplace=True)

In [20]:
# Remove the derived date/hour/minute columns from new_df; the original timestamp columns remain.
helper_columns = [
    'measure_date',
    'departure_date', 'departure_hour', 'departure_minute',
    'arrival_date', 'arrival_hour', 'arrival_minute',
]
new_df.drop(columns=helper_columns, inplace=True)

In [21]:
# Preview the first five rows of df.
# NOTE(review): the preceding cells added/removed columns on new_df, not df, so this
# preview does not reflect those changes -- confirm whether new_df was intended here.
df.head(n=5)

Unnamed: 0,date_time,departure_city,departure_iata_code,arrival_city,destination_iata_code,departure_date_time,arrival_date_time,flight_duration,carrier_name,agent_name,flight_number,price,measure_date,departure_date,departure_hour,departure_minute,arrival_date,arrival_hour,arrival_minute
0,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,CDG,2020-05-01 12:45:00,2020-05-01 15:15:00,150,Air France,Air France,1661,2515.17,2020-03-07,2020-05-01,12,45,2020-05-01,15,15
1,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,CDG,2020-05-19 12:45:00,2020-05-19 15:15:00,150,Air France,Air France,1661,2515.17,2020-03-07,2020-05-19,12,45,2020-05-19,15,15
2,2020-03-07 20:03:34.275655,Belgrade,BEG,kopenhagen,BCN,2020-05-19 22:00:00,2020-05-20 00:35:00,155,Vueling Airlines,Vueling Airlines,1287,2719.4,2020-03-07,2020-05-19,22,0,2020-05-20,0,35
4,2020-03-07 20:03:34.275655,Belgrade,BEG,Edinburgh,CDG,2020-03-29 12:45:00,2020-03-29 15:15:00,150,Air France,Air France,1661,3780.8,2020-03-07,2020-03-29,12,45,2020-03-29,15,15
5,2020-03-07 20:03:34.275655,Belgrade,BEG,Edinburgh,CDG,2020-03-29 06:40:00,2020-03-29 09:15:00,155,Air France,Air France,6291,6039.49,2020-03-07,2020-03-29,6,40,2020-03-29,9,15


In [22]:
# Write the processed frame to disk without the index column.
processed_path = '/Users/ioneuk/Documents/flight-price-predictor/data/processed_dataset2.csv'
new_df.to_csv(processed_path, index=False)

In [23]:
# Show the column dtypes of the processed frame (the prev_price_* columns are integers).
new_df.dtypes

date_time                datetime64[ns]
departure_city                   object
departure_iata_code              object
arrival_city                     object
destination_iata_code            object
departure_date_time      datetime64[ns]
arrival_date_time        datetime64[ns]
flight_duration                   int64
carrier_name                     object
agent_name                       object
flight_number                     int64
price                           float64
prev_price_1                      int64
prev_price_2                      int64
prev_price_3                      int64
dtype: object

In [24]:
# Display the first ten rows of the processed frame.
new_df.head(10)

Unnamed: 0,date_time,departure_city,departure_iata_code,arrival_city,destination_iata_code,departure_date_time,arrival_date_time,flight_duration,carrier_name,agent_name,flight_number,price,prev_price_1,prev_price_2,prev_price_3
294726,2020-02-03 05:26:15.387024,Kiev,IEV,Vienna,VIE,2020-03-14 13:10:00,2020-03-14 14:10:00,120,Wizz Air,Wizz Air,6127,975.05,0,0,0
298392,2020-02-03 18:15:23.872834,Sofia,SOF,Budapest,BUD,2020-03-05 13:00:00,2020-03-05 13:25:00,85,Wizz Air,Wizz Air,2472,409.49,0,0,0
295201,2020-02-03 11:01:54.768565,Berlin,SXF,Geneva,GVA,2020-03-14 17:00:00,2020-03-14 18:45:00,105,easyJet,easyJet,1596,623.1,0,0,0
295200,2020-02-03 11:01:54.768565,Berlin,SXF,Geneva,GVA,2020-03-14 06:10:00,2020-03-14 08:00:00,110,easyJet,easyJet,1592,623.1,0,0,0
295235,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-03-05 14:10:00,2020-03-05 15:40:00,90,easyJet,easyJet,5867,623.1,0,0,0
295234,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-03-05 08:25:00,2020-03-05 09:55:00,90,easyJet,easyJet,5863,623.1,0,0,0
295233,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-03-05 07:00:00,2020-03-05 08:30:00,90,easyJet,easyJet,5861,595.23,0,0,0
295232,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-03-02 08:25:00,2020-03-02 09:55:00,90,easyJet,easyJet,5863,846.03,0,0,0
295230,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-03-02 07:00:00,2020-03-02 08:30:00,90,easyJet,easyJet,5861,692.76,0,0,0
295228,2020-02-03 11:01:54.768565,Berlin,TXL,Zurich,ZRH,2020-02-28 08:25:00,2020-02-28 09:55:00,90,easyJet,easyJet,5863,1013.22,0,0,0
