In [1]:
import sqlite3
import numpy as np
import pandas as pd
from datetime import datetime as dt, timedelta

In [2]:
airlines_df = pd.read_excel("../data/Airlines.xlsx", index_col="AirlineId")
airports_df = pd.read_excel("../data/Airports.xlsx", index_col="AirportId")

In [3]:
airports_df

Unnamed: 0_level_0,AirportCode,FullName,OriginCountry,OriginCity
AirportId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,PDL,Ponta Delgada Joao Paulo II,Portugal,Ponta Delgada
2,FAO,Faro Airport,Portugal,Faro
3,LIS,Portela,Portugal,Lisbon
4,OPO,Franciso De Sa Carneiro,Portugal,Porto
5,SJZ,Sao Jorge Airport,Portugal,Madeira
...,...,...,...,...
64,NCL,Newcastle International Airport,United Kingdom,Newcastle
65,LPL,Liverpool John Lennon Airport,United Kingdom,Liverpool
66,EDI,Edinburgh Airport,United Kingdom,Edinburgh
67,GLA,Glasgow Prestwick Airport,United Kingdom,Glasgow


In [4]:
INSERT_TEMPLATE = "INSERT INTO {table_name} {columns_tuple}\n\tVALUES {values_tuple}"

In [5]:
print(INSERT_TEMPLATE.format(table_name="Airports", columns_tuple=tuple(airports_df.columns), values_tuple=tuple(airports_df.iloc[0].tolist())).replace("\'", ""))

INSERT INTO Airports (AirportCode, FullName, OriginCountry, OriginCity)
	VALUES (PDL, Ponta Delgada Joao Paulo II, Portugal, Ponta Delgada)


In [6]:
airlines_df

Unnamed: 0_level_0,Name,AirlineCode
AirlineId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Ryanair,RYR
2,Lufthansa Group,DLH
3,International Airlines Group,AAN
4,Air France-KLM,SEU
5,Easyjet,EZY
6,Wizz Air,WZZ
7,American Airlines,AAL
8,Delta Air Lines,DAL
9,Southwest Airlines,SWA
10,Air Canada,ROU


In [7]:
with open(r"../scripts/sql/populate_db.sql", "w", encoding="utf8") as f:
    for index in airlines_df.index.tolist():
        #print(INSERT_TEMPLATE.format(table_name="Airline", columns_tuple=tuple(airlines_df.columns), values_tuple=tuple(airlines_df.iloc[index - 1].tolist())).replace("\'", ""))
        f.write(INSERT_TEMPLATE.format(table_name="Airline", columns_tuple=tuple(airlines_df.columns), values_tuple=tuple(airlines_df.iloc[index -1].tolist())).replace("\'", ""))
        f.write("\n")
        
    for index in airports_df.index.tolist():
        f.write(INSERT_TEMPLATE.format(table_name="Airport", columns_tuple=tuple(airports_df.columns), values_tuple=tuple(airports_df.iloc[index - 1].tolist())).replace("\'", ""))
        f.write("\n")

In [8]:
def get_airline_info(name):
    airline = airlines_df.loc[airlines_df["Name"] == name].to_dict(orient="index")
    index = list(airline.keys())[0]
    
    return index, airline[index]["AirlineCode"]

def get_airport_info(city):
    airport = airports_df.loc[airports_df["OriginCity"] == city].to_dict(orient="index")
    index = list(airport.keys())[0]
    
    return index, airport[index]["AirportCode"]

In [9]:
def get_flight_info(departure, arrival, start, duration, airline, route_index):
    airline_id, airline_code = get_airline_info(airline)
    departure_id, departure_code = get_airport_info(departure)
    arrival_id, arrival_code = get_airport_info(arrival)
    
    return airline_id, route_index, airline_code + "001" + departure_code + arrival_code, duration, np.random.randint(50, 250), 0

In [10]:
def clock_to_timedetla(time):
    return timedelta(hours=int(time[0:2]), minutes=int(time[3:5]), seconds=int(time[6:8]))

In [11]:
def get_route_info(departure, arrival, start, duration, airline, placeholder):
    airline_id, airline_code = get_airline_info(airline)
    departure_id, departure_code = get_airport_info(departure)
    arrival_id, arrival_code = get_airport_info(arrival)
    
    return departure_id, arrival_id, str(start), str(dt.strptime(start, "%Y-%m-%d %H:%M:%S") + clock_to_timedetla(duration))

In [12]:
get_flight_info("Rome", "Madrid", "2020-12-01 10:00:00", "02:20:00", "Ryanair", 1)

(1, 1, 'RYR001CIAMAD', '02:20:00', 101, 0)

In [13]:
date = dt.strptime("2020-12-01 10:00:00", "%Y-%m-%d %H:%M:%S")

In [14]:
get_route_info("Rome", "Madrid", "2020-12-01 10:00:00", "02:20:00", "Ryanair", 1)

(18, 6, '2020-12-01 10:00:00', '2020-12-01 12:20:00')

In [16]:
airports_df["OriginCity"].iloc[np.random.randint(len(airports_df["OriginCity"].index))]
airports_df["OriginCity"].iloc[np.random.randint(len(airports_df["OriginCity"].index))]

'Wroclaw'

In [17]:
start_date = "2020-01-01 8:00:00"

In [18]:
str(dt.strptime(start_date, "%Y-%m-%d %H:%M:%S") + clock_to_timedetla(f"{np.random.randint(4)}{np.random.randint(10)}:{np.random.randint(7)}0:00"))

'2020-01-02 23:50:00'

In [19]:
def random_date_offset():
    # Date offset in range 00:00:00 to 40:00:00
    # seconds are alwyas 00 and minutes have precision up to 10 min
    return f"0{np.random.randint(5)}:{np.random.randint(6)}0:00"

def random_flight_time():
    # Random flight time, with max value of 9H and min 1H
    return f"0{np.random.randint(1, 10)}:{np.random.randint(6)}0:00"

In [20]:
routes = []
flights = []

start_date = "2020-01-01 8:00:00"
date = dt.strptime(start_date, "%Y-%m-%d %H:%M:%S")

for _ in range(10_000):
    departure_airport = airports_df["OriginCity"].iloc[np.random.randint(len(airports_df["OriginCity"].index))]
    arrival_airport = airports_df["OriginCity"].iloc[np.random.randint(len(airports_df["OriginCity"].index))]
    date += clock_to_timedetla(random_date_offset())
    flight_time = random_flight_time()
    airline = airlines_df["Name"].iloc[np.random.randint(airlines_df.index.size)]
    
    flights.append(get_flight_info(departure_airport, arrival_airport, str(date), flight_time, airline, 1))
    routes.append(get_route_info(departure_airport, arrival_airport, str(date), flight_time, airline, 1))

In [21]:
pd.DataFrame.from_records(flights, columns=["AirlineId", "RouteId", "FlightCode", "Duration", "Passengers", "Intercontinental"])

Unnamed: 0,AirlineId,RouteId,FlightCode,Duration,Passengers,Intercontinental
0,6,1,WZZ001DMEAYT,06:10:00,91,0
1,11,1,LOT001PSANUE,07:30:00,228,0
2,7,1,AAL001DUBRIX,02:40:00,64,0
3,1,1,RYR001STNSTN,02:30:00,225,0
4,1,1,RYR001LPLMUC,02:50:00,80,0


In [22]:
pd.DataFrame.from_records(routes, columns=["DepartureAirportId", "ArrivalAirportId", "DepartureDateTime", "ArrivalDateTime"])

Unnamed: 0,DepartureAirportId,ArrivalAirportId,DepartureDateTime,ArrivalDateTime
0,55,33,2020-01-01 11:50:00,2020-01-01 18:00:00
1,19,44,2020-01-01 15:50:00,2020-01-01 23:20:00
2,58,53,2020-01-01 16:00:00,2020-01-01 18:40:00
3,59,59,2020-01-01 17:20:00,2020-01-01 19:50:00
4,65,40,2020-01-01 20:00:00,2020-01-01 22:50:00


In [25]:
clock_to_timedetla("12:00:00")

datetime.timedelta(seconds=43200)