# FDS Mini Project


**WARNING: Before making any git commit to this notebook please clear all output in this notebook**

## 1. Cleaning the data

### Invalid Columns: 
- delete the unnamed first column ('Unnamed: 0'), which only duplicates the existing index
- delete the last column ('Unnamed: 21'), which contains only NaN values

### NaN values:
- check number of NaN values/location of NaN values
- leave NaN values that are required in order not to lose data (for example: a cancelled flight will always have NaN values for DEP_TIME, ARR_TIME, ARR_DEL15, DEP_DEL15 - as the flight did not happen)
- delete rows with NaN values that would hinder analysis and plotting later on (for example, flight timings that are simply missing without the flight having been cancelled)

### Times conversion (Note: 00:00 timings all represent cancelled flights)
- observation --> no flight leaves at 00:00, all *00:00 date/time values belong to flights that have been cancelled*
- converted DEP_TIME and ARR_TIME to 4-character strings of the format hhmm (converting them directly to a date/time type raised errors)
- added two extra columns: ARR_TIME_MINS and DEP_TIME_MINS representing the arrival and departure time in minutes for easier calculations

### Irrelevant columns (to this project) to be removed/ duplicated data:
- Remove both OP_CARRIER_AIRLINE_ID and OP_CARRIER
- Remove ORIGIN_AIRPORT_SEQ_ID
- Remove DEST_AIRPORT_SEQ_ID
- Remove ORIGIN_AIRPORT_ID and DEST_AIRPORT_ID





In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Importing sklearn functions
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans

In [None]:
#--------------------------------------- Load dataset ------------------------------------------#
# The gzipped CSV is looked up under ./datasets relative to the current working directory.
flight_data_path = os.path.join(os.getcwd(), 'datasets', 'flight_jan_2019.csv.gz')

# Read the file and discard the 'Unnamed: 0' and 'Unnamed: 21' columns straight away
flight_data = (
    pd.read_csv(flight_data_path, compression='gzip')
    .drop(columns=['Unnamed: 0', 'Unnamed: 21'])
)

#---------------------------------------- Check for 'NaN' values ------------------------------#

# for col in flight_data.columns: 
#    print(col, ' :',flight_data[col].isna().sum())
    
    # NA VALUES: TAIL_NUM  : 2543
    #            DEP_TIME  : 16352
    #            DEP_DEL15  : 16355
    #            ARR_TIME  : 17061
    #            ARR_DEL15  : 18022
    #            Unnamed: 21  : 583985

# Dealing with DEP_TIME and ARR_TIME NaN values
# Observation: cancelled flights have NaN values for DEP_TIME, ARR_TIME, DEP_DEL15 and ARR_DEL15.
# Those NaN values therefore make sense and are kept. To inspect the affected rows:
#                       flight_data[flight_data['DEP_TIME'].isna()]
# When plotting, rows with NaN values can be filtered out on the fly, e.g.:
#                       flight_data[flight_data['DEP_TIME'].notna()]

# Eliminate rows where DEP_TIME is registered but DEP_DEL15, ARR_TIME or ARR_DEL15 is NaN
# (timings simply missing).
# The conditions are combined into a single boolean mask evaluated against the full frame.
# Applying a second full-length mask to an already-filtered frame (df[mask1][mask2]) only works
# through implicit reindexing and makes pandas emit a warning.
dep_time_recorded = flight_data['DEP_TIME'].notna()
other_value_missing = flight_data[['DEP_DEL15', 'ARR_TIME', 'ARR_DEL15']].isna().any(axis=1)
# Each offending row label appears exactly once in this list
indices_to_eliminate = list(flight_data.index[dep_time_recorded & other_value_missing])
flight_data = flight_data.drop(indices_to_eliminate)

#--------------------------------------Modifying data types----------------------------------#
# CANCELLED/DIVERTED to integer value
flight_data['CANCELLED'] = flight_data['CANCELLED'].astype(int)
flight_data['DIVERTED'] = flight_data['DIVERTED'].astype(int)
# Modifying timings date/time format
#flight_data['DEP_TIME'] = pd.to_datetime(flight_data['DEP_TIME'], format='%H%M').dt.time

# OBSERVATION check (timings are still floats at this point): number of cancelled flights whose
# DEP_TIME equals 0.0. Both conditions are combined into one mask on the full frame.
# The value is not displayed because this is not the last expression of the cell; evaluate the
# expression in a cell of its own to see it.
((flight_data['DEP_TIME'] == 0.0) & (flight_data['CANCELLED'] == 1)).sum()

# Convert DEP_TIME and ARR_TIME to int and add new columns: DEP_TIME_MINS and ARR_TIME_MINS for easy calculations
def convert_minutes(x):
    """Convert a 4-character 'hhmm' time string into minutes since 00:00.

    Parameters
    ----------
    x : str
        Time string whose first two characters are the hour digits and whose
        third and fourth characters are the minute digits.

    Returns
    -------
    int
        hours * 60 + minutes.
    """
    # Minute digits are read first, then the hour digits
    min_tens, min_units = int(x[2]), int(x[3])
    hr_tens, hr_units = int(x[0]), int(x[1])
    return (min_tens * 10 + min_units) + (hr_tens * 10 + hr_units) * 60

def fill_in(x):
    """Left-pad a time string with zeros so that it is exactly 4 characters (hhmm).

    Parameters
    ----------
    x : str
        Time written without leading zeros, e.g. '5', '45', '930' or '1230'.

    Returns
    -------
    str
        `x` padded on the left with '0' up to 4 characters. An empty string
        yields '0000' so the result always has 4 characters. Inputs longer
        than 4 characters are mapped to '0000'.
    """
    if len(x) > 4:
        # Not representable as hhmm: fall back to the '0000' placeholder
        return '0000'
    return x.rjust(4, '0')
    
# For each timing column: NaN -> 0, cast to int, render as a zero-padded 'hhmm' string,
# then derive the matching '<column>_MINS' column from that string.
for time_col in ('DEP_TIME', 'ARR_TIME'):
    hhmm = flight_data[time_col].fillna(0).astype(int).astype(str).apply(fill_in)
    flight_data[time_col] = hhmm
    flight_data[time_col + '_MINS'] = hhmm.apply(convert_minutes)

#-------------------------------ATTEMPT AT CONVERTING TO DATE/TIME-----------------#
# NOTE(review): `fill_in` is already defined earlier in this cell; this second definition
# shadows it from here on. Consider deleting this copy.
def fill_in(x):
    """Left-pad a time string with zeros so that it is exactly 4 characters (hhmm).

    Parameters
    ----------
    x : str
        Time written without leading zeros, e.g. '5', '45', '930' or '1230'.

    Returns
    -------
    str
        `x` padded on the left with '0' up to 4 characters. An empty string
        yields '0000' so the result always has 4 characters. Inputs longer
        than 4 characters are mapped to '0000'.
    """
    if len(x) > 4:
        # Not representable as hhmm: fall back to the '0000' placeholder
        return '0000'
    return x.rjust(4, '0')
    
#def convert_time(x):
#    return datetime.datetime.strptime(x,'%H%M' )
    
#flight_data['DEP_TIME'] = flight_data['DEP_TIME'].apply(fill_in)
#flight_data['ARR_TIME'] = flight_data['ARR_TIME'].apply(fill_in)
#flight_data['DEP_TIME'] = flight_data['DEP_TIME'].apply(convert_time)
#flight_data['DEP_TIME'] = flight_data['DEP_TIME'].apply(check)
#flight_data['DEP_TIME'] = pd.to_datetime(flight_data['DEP_TIME'], format=)


#------------------------------------Eliminating extra columns------------------------------#

# Number of unique values recorded for the candidate columns:
#   OP_UNIQUE_CARRIER      : 17
#   OP_CARRIER_AIRLINE_ID  : 17
#   OP_CARRIER             : 17
#   TAIL_NUM               : 5445
#   ORIGIN_AIRPORT_ID      : 346
#   ORIGIN_AIRPORT_SEQ_ID  : 346
#   DEST_AIRPORT_ID        : 346
#   DEST_AIRPORT_SEQ_ID    : 346
columns_to_remove = [
    # Remove both OP_CARRIER_AIRLINE_ID and OP_CARRIER
    'OP_CARRIER_AIRLINE_ID',
    'OP_CARRIER',
    # Remove ORIGIN_AIRPORT_SEQ_ID
    'ORIGIN_AIRPORT_SEQ_ID',
    # Remove DEST_AIRPORT_SEQ_ID
    'DEST_AIRPORT_SEQ_ID',
    # Remove the numeric airport ids as well
    'ORIGIN_AIRPORT_ID',
    'DEST_AIRPORT_ID',
]
flight_data.drop(columns=columns_to_remove, inplace=True)

flight_data.head()


## 2. Data Analysis Preparation

* Cancelled flights are removed from the original dataset, as they are not relevant to delay prediction
* The dataset is split into training data (60%), validation data (20%) and test data (20%)



In [None]:
# Generate dictionary for all categorical data: each distinct value is assigned an integer
# code in order of first appearance.
carrier_arr = flight_data['OP_UNIQUE_CARRIER'].unique()
tail_arr = flight_data['TAIL_NUM'].unique()
# ORIGIN and DEST share one airport dictionary, so both columns are stacked before taking
# the unique values. pd.concat is used because Series.append was removed in pandas 2.0.
airport_arr = pd.concat([flight_data['ORIGIN'], flight_data['DEST']]).unique()
carrier_dict = {x: i for i, x in enumerate(carrier_arr)}
tail_dict = {x: i for i, x in enumerate(tail_arr)}
airport_dict = {x: i for i, x in enumerate(airport_arr)}


In [None]:
# Turn all categorical data into number: every categorical column is replaced by the integer
# codes from its dictionary (ORIGIN and DEST share the airport dictionary).
column_encoders = {
    'OP_UNIQUE_CARRIER': carrier_dict,
    'TAIL_NUM': tail_dict,
    'ORIGIN': airport_dict,
    'DEST': airport_dict,
}
for column_name, encoder in column_encoders.items():
    flight_data[column_name] = flight_data[column_name].map(encoder)

In [None]:
# remove columns that are irrelevant for the analysis (dropped in place)
unused_columns = ['DEP_TIME', 'DEP_TIME_BLK', 'ARR_TIME', 'DIVERTED']
flight_data.drop(unused_columns, axis='columns', inplace=True)

In [None]:
# split up data for analysis: one frame per CANCELLED value, with the flag column removed
is_cancelled = flight_data['CANCELLED'] == 1.0
is_not_cancelled = flight_data['CANCELLED'] == 0.0
cancelled_flight = flight_data.loc[is_cancelled].drop(columns=['CANCELLED'])
normal_flight = flight_data.loc[is_not_cancelled].drop(columns=['CANCELLED'])

## 3. PCA Analysis


In [None]:
# split up test, validation and train data for normal flight
# A fixed seed makes the split (and every result derived from it) reproducible across
# kernel restarts.
SPLIT_SEED = 42
TARGET_COLUMNS = ['DEP_DEL15', 'ARR_DEL15']

# 80% / 20% first, then 75% / 25% of the remainder -> 60% train, 20% validation, 20% test
train_data, test_data = train_test_split(normal_flight, train_size=0.8, random_state=SPLIT_SEED)
train_data, val_data = train_test_split(train_data, train_size=0.75, random_state=SPLIT_SEED)

# Targets are extracted before the target columns are removed from the feature frames
train_dep = train_data['DEP_DEL15']
train_arr = train_data['ARR_DEL15']
val_dep = val_data['DEP_DEL15']
val_arr = val_data['ARR_DEL15']
test_dep = test_data['DEP_DEL15']
test_arr = test_data['ARR_DEL15']

# Rebind to new frames instead of mutating the frames returned by train_test_split in place
train_data = train_data.drop(columns=TARGET_COLUMNS)
test_data = test_data.drop(columns=TARGET_COLUMNS)
val_data = val_data.drop(columns=TARGET_COLUMNS)


In [None]:
# Standardise the training features, then fit a PCA on the standardised data and
# project that same data onto the principal components.
scaler = StandardScaler()
std_data = scaler.fit_transform(train_data)
pca = PCA()
pca.fit(std_data)
pca_scores = pca.transform(std_data)
#sns.scatterplot(x=pca_scores[:,0],y=pca_scores[:,1],hue=train_dep)

In [None]:
# Print the first row of pca.components_, then show the first rows of the training features
first_component = pca.components_[0]
print(first_component)
train_data.head()

## 4. Flight Delay Prediction

In [None]:
# Calculate the accuracy of prediction against validation data given k-value
def test_accuracy(k, mode='DEP'):
    """Fit a distance-weighted KNN classifier and return its validation accuracy.

    Parameters
    ----------
    k : int
        Number of neighbours passed to KNeighborsClassifier.
    mode : str, default 'DEP'
        'DEP' uses the departure targets (train_dep / val_dep); any other value
        uses the arrival targets (train_arr / val_arr).

    Returns
    -------
    Fraction of rows in val_data whose predicted label equals the validation target.
    """
    print('Running KNN with k =',k)
    if mode == 'DEP':
        train_target, val_target = train_dep, val_dep
    else:
        train_target, val_target = train_arr, val_arr
    classifier = KNeighborsClassifier(n_neighbors=k, weights='distance')
    classifier.fit(train_data, train_target)
    n_correct = np.sum(classifier.predict(val_data) == val_target)
    return n_correct / len(val_data)

In [None]:
# Validation accuracy for the departure target at k = 1, 3, 5, 7, 9
candidate_ks = range(1, 10, 2)
[test_accuracy(k, mode='DEP') for k in candidate_ks]