# Descriptive Analysis
In this section we look at the data from a descriptive perspective. This provides a good understanding and overview of the test data.

In [1]:
import pandas as pd
from currency_converter import CurrencyConverter
import datetime as dt
from datetime import datetime, timedelta
# Timestamp of this run (local clock), used as a suffix for the exported CSV file names.
current_datetime = datetime.now().strftime("%Y_%m_%d_%H-%M-%S")

In [2]:
# Load the test and training sets; both date columns are parsed while reading.
data = pd.read_csv(
    './data/test.csv',
    parse_dates=['Flight_Date', 'Request_Date'],
)
train = pd.read_csv(
    './data/train.csv',
    parse_dates=['Flight_Date', 'Request_Date'],
)
# Bare expression so the notebook renders the test frame below the cell.
data

Unnamed: 0,Request_Date,Flight_Date,Departure_hour,flight_unique_id,route_abb,flight_number,Price,Currency
0,2019-07-28 11:00:00+00:00,2019-07-29,18,2019-07-29 FR 146,SXF-STN,FR 146,73.43,€
1,2019-07-28 11:00:00+00:00,2019-07-29,22,2019-07-29 FR 8545,SXF-STN,FR 8545,180.53,€
2,2019-07-28 11:00:00+00:00,2019-07-30,6,2019-07-30 FR 144,SXF-STN,FR 144,61.19,€
3,2019-07-28 23:00:00+00:00,2019-07-30,6,2019-07-30 FR 144,SXF-STN,FR 144,73.43,€
4,2019-07-28 11:00:00+00:00,2019-07-30,10,2019-07-30 FR 8543,SXF-STN,FR 8543,180.53,€
...,...,...,...,...,...,...,...,...
5119,2019-07-31 23:00:00+00:00,2019-09-10,10,2019-09-10 FR 8543,SXF-STN,FR 8543,35.69,€
5120,2019-08-01 11:00:00+00:00,2019-09-10,10,2019-09-10 FR 8543,SXF-STN,FR 8543,35.69,€
5121,2019-08-01 23:00:00+00:00,2019-09-10,10,2019-09-10 FR 8543,SXF-STN,FR 8543,46.83,€
5122,2019-08-02 11:00:00+00:00,2019-09-10,10,2019-09-10 FR 8543,SXF-STN,FR 8543,46.83,€


In [3]:
# Converter used by convert_to_eur to express pound prices in euro (price_in_eur).
# NOTE(review): fallback_on_missing_rate=True presumably lets dates without a
# published rate fall back to an estimated one instead of failing; confirm
# against the currency_converter documentation.
c = CurrencyConverter(
    fallback_on_missing_rate=True,
)
def convert_to_eur(row, column):
    """Return ``row[column]`` expressed in euro.

    Rows whose ``Currency`` is '£' are converted from GBP to EUR with the
    module-level converter ``c``, using the rate of the calendar day of the
    row's ``Request_Date``. Every other row is returned unchanged.

    Args:
        row: Mapping or Series providing 'Currency', 'Request_Date' and
            ``column``.
        column: Name of the price field to convert.

    Returns:
        The converted amount for '£' rows, otherwise the original value.
    """
    if row['Currency'] != '£':
        return row[column]

    requested = row['Request_Date']
    # Only the calendar day is used for the rate lookup; the time of day and
    # any timezone information are dropped.
    rate_day = datetime(requested.year, requested.month, requested.day)
    # convert() takes the source currency first and the target second, so a
    # pound amount has to be passed as 'GBP' -> 'EUR' to end up in euro.
    return c.convert(row[column], 'GBP', 'EUR', date=rate_day)

# Add euro-denominated price columns with a row-wise pass over each frame;
# the column to convert is handed to convert_to_eur through ``args``.
data['price_in_eur'] = data.apply(convert_to_eur, axis=1, args=('Price',))
train['price_in_eur'] = train.apply(convert_to_eur, axis=1, args=('Price',))
train['min_future_price_in_eur'] = train.apply(convert_to_eur, axis=1, args=('min_future_price',))

In [4]:
def modify_date_columns(data):
    """Prepare the date columns of ``data`` and add the request-to-flight gap.

    The frame is modified in place and also returned:

    * 'Departure_hour' is made numeric,
    * 'Flight_Date' is shifted forward by 'Departure_hour' hours,
    * 'Request_Date' is rebuilt from its '%Y-%m-%d %H:%M:%S' text form,
    * 'Duration_Time', the floored number of whole hours between
      'Request_Date' and 'Flight_Date', is inserted as the third column.

    Calling this twice on the same frame shifts 'Flight_Date' again and
    inserts a second 'Duration_Time' column (duplicates are allowed).
    """
    departure_hours = pd.to_numeric(data['Departure_hour'])
    data['Departure_hour'] = departure_hours

    # Round-trip through text: the format carries no UTC offset, so the
    # re-parsed values can be subtracted from the flight timestamps.
    request_text = data['Request_Date'].apply(
        lambda stamp: datetime.strftime(stamp, '%Y-%m-%d %H:%M:%S')
    )
    data['Flight_Date'] = data['Flight_Date'] + pd.to_timedelta(departure_hours, 'h')
    data['Request_Date'] = pd.to_datetime(request_text)

    # Gap between request and departure, floored to whole hours.
    gap = data['Flight_Date'] - data['Request_Date']
    gap_seconds = gap.apply(lambda delta: delta.total_seconds())
    whole_hours = gap_seconds // 3600

    data.insert(2, "Duration_Time", whole_hours, allow_duplicates=True)
    return data

# Run the date preparation on the test frame, then on the training frame.
data, train = modify_date_columns(data), modify_date_columns(train)

In [5]:
# New feature: weekday name of Request_Date, added to both frames
for frame in (data, train):
    frame['Request_Weekday'] = frame['Request_Date'].dt.day_name()

In [6]:
# New feature: AM/PM marker of Request_Date (strftime '%p'), added to both frames
for frame in (data, train):
    frame['Request_AM_PM'] = frame['Request_Date'].apply(
        lambda stamp: datetime.strftime(stamp, '%p')
    )

In [7]:
# Export the enriched test frame under a file name carrying the run timestamp.
data.to_csv(f'./data/converted_test_{current_datetime}.csv')

In [8]:
# Export the enriched training frame under a file name carrying the run timestamp.
train.to_csv(f'./data/converted_train_{current_datetime}.csv')