<div align="center">

# <u> Flight delay forecasting </u>
### Machine Learning Project by:
Tamara Pallien & Frederic Baumeister



<img src="res/title.jpg" height=450>


 Photo by D. C. Cavalleri: https://www.pexels.com/de-de/foto/flughafen-2421196/

</div>

## Introduction:



In [1]:
# necessary imports 

""" Data Manipulation and Visualization"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
#import plotly.express as px


""" Machine Learning """
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import plot_tree
from sklearn.linear_model import LinearRegression

""" Time """
from datetime import datetime, timedelta
from time import sleep 

""" External APIs"""
import airportsdata
from meteostat import Point, Daily


""" PyScripts """
#import Flight_routes
#import Airport_efficiency

"""Global Values"""
RSEED = 42
figure(figsize=(10, 10), dpi=120)
airports = airportsdata.load('IATA')


"""other"""
import warnings
warnings.filterwarnings("ignore")


<Figure size 1200x1200 with 0 Axes>

In [31]:
""" Read dataset"""

df = pd.read_csv('data/Train.csv')
test_df = pd.read_csv('data/Test.csv')

df.head()

df = df.dtypes('float64')

df.head()

TypeError: 'Series' object is not callable

In [3]:
# Reading Airport API Data

"""To Avoid long wait times due to limits within the APIs, the API responses for the Training Data are saved in .csv files """

arr = df.DEPSTN.unique()
err_arr = []
port_dict = {}
for el in arr:
    try:
        port_dict[el] = tuple([airports[el].get(k) for k in ['lat', 'lon']])
    except KeyError:
        err_arr.append(el)

df['Dep_coor'] = df['DEPSTN'].map(port_dict)

print('Depart Coordinates')
for err in err_arr: 
    print(f'{err} missing in Airportsdata: {df.DEPSTN.value_counts()[err]} rows affected')

arr = df.ARRSTN.unique()
err_arr = []
port_dict = {}
for el in arr:
    try:
        port_dict[el] =tuple([airports[el].get(k) for k in ['lat', 'lon']])
    except KeyError:
        err_arr.append(el)

df['Arr_coor'] = df['ARRSTN'].map(port_dict)

print('Arrival Coordinates')
for err in err_arr: 
    print(f'{err} missing in Airportsdata: {df.ARRSTN.value_counts()[err]} rows affected')  


Depart Coordinates
SXF missing in Airportsdata: 332 rows affected
Arrival Coordinates
SXF missing in Airportsdata: 332 rows affected


In [30]:
# Reading Weather API Data 

dep_weather_df = df[['Dep_coor', 'DATOP', 'ID']]
records = dep_weather_df.to_records(index=False)
err_arr = []
weather_dict = {}
for i, record in enumerate(records):
    if i < 5:

        coor = Point(record[0][0],record[0][1])    
        start = datetime(2018, 1, 1)
        end = datetime(2018, 12, 31)
        data = Daily((coor, start, end))
        data = data.fetch()
        port_dict[record[0]] = data


print(err_arr)

InvalidURL: URL can't contain control characters. "/v2/daily/(<meteostat.interface.point.Point object at 0x15c6bbeb0>, Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 00:00:00')).csv.gz" (found at least ' ')

In [20]:
# Set time period

weather_dict = {}
start = datetime(2018, 1, 1)
end = datetime(2018, 12, 31)

# Create Point for Vancouver, BC
vancouver = Point(49.2497, -123.1193, 70)

# Get daily data for 2018
data = Daily(vancouver, start, start)
data = data.fetch()
weather_dict["record"] = data

start_weather = (weather_dict.get('record'))

df.rename('col_{}'.format, axis=1)



pandas.core.frame.DataFrame

In [22]:
df.head()

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target,Dep_coor,Arr_coor
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0,"(33.3675003052, -7.5899701118)","(36.8510017395, 10.2271995544)"
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0,"(45.6305999756, 8.7281103134)","(36.8510017395, 10.2271995544)"
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0,"(36.8510017395, 10.2271995544)","(41.275333, 28.752)"
3,train_id_3,2016-01-17,TU 0480,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,ATA,TU 736IOK,0.0,"(33.875, 10.7755002975)","(47.1531982422, -1.610730052)"
4,train_id_4,2016-01-17,TU 0338,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,ATA,TU 320IMU,22.0,"(36.8510017395, 10.2271995544)","(36.6910018921, 3.2154099941)"


In [34]:
# droping rows:

drop_arr = ['ID', 'DATOP', 'AC', 'FLTID', 'STATUS']

try:
    df = df.drop(labels=drop_arr, axis=1)
except KeyError:
    pass

df.head()

Unnamed: 0,DEPSTN,ARRSTN,STD,STA,target
0,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,260.0
1,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,20.0
2,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,0.0
3,DJE,NTE,2016-01-17 14:10:00,2016-01-17 17.00.00,0.0
4,TUN,ALG,2016-01-17 14:30:00,2016-01-17 15.50.00,22.0


### Data cleaning 


In [None]:
""" Converting timestamps to Datetime obj."""

df['STA'] = df['STA'].str.replace('.', ':', regex=False)
df['STA'] = pd.to_datetime(df['STA']).map(pd.Timestamp.timestamp)

df['STD'] = pd.to_datetime(df['STD']).map(pd.Timestamp.timestamp)

df.info()

In [None]:
""" create Target"""

X = df.drop('target', axis=1)
y= df['target']

"""Target info"""
print(f"We have {X.shape[0]} observations in our dataset and {X.shape[1]} features")
print(f"Our target vector has also {y.shape[0]} values")


In [None]:
df.describe()

## Preprocessing

In [None]:
cat_feats = ['DEPSTN', 'ARRSTN']

X = pd.get_dummies(X, columns=cat_feats, drop_first=True)

In [None]:
""" train/test split"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED)

## Very simple Baseline model 

In [None]:
dec_reg = DecisionTreeRegressor(max_depth=4)
dec_reg.fit(X_train, y_train)

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
y_test_dec_tree_predicted = dec_reg.predict(X_test)
y_test_lin_predicted = lin_reg.predict(X_test)

print('Decision Tree')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_test_dec_tree_predicted)))

print('\n')

print('Linear Regression')
print('_'*20)
print("Mean Squared Error: {:.2f}".format(mean_squared_error(y_test, y_test_lin_predicted)))


## nice to have:

Plots: 

- Airport with most delay 

- cluster time of year (delay)

- find routes that are late most 

## Conclusion


____
This Project was Part of the Data Science Bootcamp at NeueFische, for more information visit: 

[NeueFische](https://www.neuefische.de)