In [1]:
#Merging all the dataset files into one dataset as dataset is available on a day basis and not a aggregate dataset
import os
import pandas as pd
from scipy.stats import chisquare, chi2_contingency
import numpy as np
import datetime as dt

In [2]:
#Printing some data of the first file to be merged
arrival_dataset=pd.read_csv("../Datasets/Arrivals/AirplaneData09-02-2020.csv")
arrival_df=pd.DataFrame(
    arrival_dataset[
        ["Time","Flight Number","Source","Flight Name","Aircraft","Status"]
    ]
)
arrival_df

Unnamed: 0,Time,Flight Number,Source,Flight Name,Aircraft,Status
0,12:05 AM,AF218,Paris (CDG),Air France,B77W (F-GSQD),Landed 11:55 PM
1,12:05 AM,6E5383,Hyderabad (HYD),IndiGo,A320 (VT-IEC),Landed 12:16 AM
2,12:10 AM,G82608,Jaipur (JAI),GoAir,A20N (VT-WGU),Landed 11:42 PM
3,12:10 AM,6E665,Delhi (DEL),IndiGo,A320 (VT-IDX),Landed 12:04 AM
4,12:15 AM,BA139,London (LHR),British Airways,B772 (G-YMMO),Landed 12:12 AM
...,...,...,...,...,...,...
453,11:55 PM,LH766,Munich (MUC),Lufthansa,A359 (D-AIXK),Landed 12:26 AM
454,11:55 PM,UK844,Goa (GOI),Vistara,A20N (VT-TNK),Landed 11:27 PM
455,11:55 PM,UK774,Kolkata (CCU),Vistara,B738 (VT-TGA),Landed 12:20 AM
456,11:55 PM,6E6183,Chennai (MAA),IndiGo,A320 (VT-IFO),Landed 11:48 PM


In [3]:
#Printing some data of the second file to be merged
departure_dataset=pd.read_csv("../Datasets/Departures/AirplaneData09-02-2020.csv")
departure_df=pd.DataFrame(
    departure_dataset[
        ["Time","Flight Number","Destination","Flight Name","Aircraft","Status"]
    ]
)
departure_df

Unnamed: 0,Time,Flight Number,Destination,Flight Name,Aircraft,Status
0,12:05 AM,G82508,Delhi (DEL),GoAir,A320 (VT-GOJ),Departed 12:17 AM
1,12:05 AM,I5330,Delhi (DEL),AirAsia,A320 (VT-SIN),Departed 12:12 AM
2,12:05 AM,PG734,Bangkok (BKK),Bangkok Airways (Samui Livery),A320 (HS-PGW),Departed 12:10 AM
3,12:10 AM,UA49,New York (EWR),United Airlines,B77W (N2748U),Departed 12:50 AM
4,12:15 AM,6E848,Bengaluru (BLR),IndiGo,A20N (VT-IJU),Departed 12:25 AM
...,...,...,...,...,...,...
457,11:35 PM,SQ423,Singapore (SIN),Singapore Airlines,A388 (9V-SKT),Departed 12:28 AM
458,11:40 PM,IX251,Sharjah (SHJ),Air India Express (Sitar/Tabla Livery),B738 (VT-GHC),Departed 12:40 AM
459,11:50 PM,AI342,Singapore (SIN),Air India,A20N (VT-EXJ),Departed 12:23 AM
460,11:55 PM,AI685,Amritsar (ATQ),Air India,B788 (VT-ANK),Departed 1:01 AM


In [4]:
#Merging the whole Dataset from all the files
def update_time(value):
    time = pd.to_datetime(value, format="%I:%M %p").time()
    return time


def dataMerger():
    file_path = "../Datasets/"
    i = 0

    folder_name = ["Arrivals", "Departures"]
    arrival_path = file_path + "\\" + folder_name[0]
    departure_path = file_path + "\\" + folder_name[1]

    arrival_files = [
        file
        for file in os.listdir(arrival_path)
        if os.path.isfile(os.path.join(arrival_path, file))
    ]

    departure_files = [
        file
        for file in os.listdir(departure_path)
        if os.path.isfile(os.path.join(departure_path, file))
    ]

    file = pd.DataFrame()

    """
    Merges all the files and adds dates to the dataset
    """
    for arrival, departure in zip(arrival_files, departure_files):

        date = pd.to_datetime(arrival[12:-4], format="%d-%m-%Y").date()

        arrival_file = pd.DataFrame(
            pd.read_csv(arrival_path + "\\" + arrival, encoding="utf8")[
                ["Time", "Source", "Flight Name", "Status"]
            ]
        )
        departure_file = pd.DataFrame(
            pd.read_csv(departure_path + "\\" + departure, encoding="utf8")[
                ["Time", "Destination", "Flight Name", "Status"]
            ]
        )
        arrival_file["date"] = date
        arrival_file["type"] = "A"
        departure_file["date"] = date
        departure_file["type"] = "D"

        arrival_file["Time"] = arrival_file["Time"].apply(update_time, 1)
        departure_file["Time"] = departure_file["Time"].apply(update_time, 1)
        merged = arrival_file.append(departure_file, sort=False)
        file = file.append(merged, sort=False)
        file.sort_values(["date", "Time"], axis=0, inplace=True)
        # print(file)
        # file = file.append(file)

    with open("../Datasets/FinalMergedDataset/dataset.csv", "w") as csv_file:
        file.to_csv(path_or_buf=csv_file, index=False)

dataset = pd.read_csv("../Datasets/FinalMergedDataset/dataset.csv")
df = pd.DataFrame(
    dataset[
        ["Time", "date", "Source", "Flight Name",
            "Status", "type", "Destination"]
    ]
)
df #Printing the merged Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,
1,00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,
2,00:05:00,2020-02-02,,GoAir,Departed 12:30 AM,D,Delhi (DEL)
3,00:05:00,2020-02-02,,AirAsia,Departed 12:55 AM,D,Delhi (DEL)
4,00:05:00,2020-02-02,,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,23:55:00,2020-03-01,,IndiGo,,D,Chennai (MAA)
25551,23:58:00,2020-03-01,Colombo (CMB),Air India,,A,
25552,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,
25553,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,


In [5]:
#Function to Update the Departure Source in the dataframe i.e. filling the missing values
def update_source(value):
    if value["type"] == "D":
        return "Mumbai"
    else:
        return value["Source"]

df["Source"] = df[["Source", "type"]].apply(update_source, 1) #Calling the above function and updating the Dataframe
df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,
1,00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,
2,00:05:00,2020-02-02,Mumbai,GoAir,Departed 12:30 AM,D,Delhi (DEL)
3,00:05:00,2020-02-02,Mumbai,AirAsia,Departed 12:55 AM,D,Delhi (DEL)
4,00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,23:55:00,2020-03-01,Mumbai,IndiGo,,D,Chennai (MAA)
25551,23:58:00,2020-03-01,Colombo (CMB),Air India,,A,
25552,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,
25553,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,


In [6]:
#Function to Update the Arrival Destination in the dataframe i.e. filling the missing values
def update_destination(value):
    if value["type"] == "A":
        return "Mumbai"
    else:
        return value["Destination"]
df["Destination"] = df[["Destination", "type"]].apply(update_destination, 1) #Calling the above function and updating the Dataframe
df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,Mumbai
1,00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,Mumbai
2,00:05:00,2020-02-02,Mumbai,GoAir,Departed 12:30 AM,D,Delhi (DEL)
3,00:05:00,2020-02-02,Mumbai,AirAsia,Departed 12:55 AM,D,Delhi (DEL)
4,00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,23:55:00,2020-03-01,Mumbai,IndiGo,,D,Chennai (MAA)
25551,23:58:00,2020-03-01,Colombo (CMB),Air India,,A,Mumbai
25552,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,Mumbai
25553,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,Mumbai


In [7]:
#Function to convert String of date and time to timestamp 
def update_timestamp_init(value):
    timestamp = str(value["date"]) + " " + str(value["Time"])
    timestamp = pd.to_datetime(timestamp, format="%Y-%m-%d %H:%M:%S")
    return timestamp
df["Time"] = df[["date", "Time"]].apply(update_timestamp_init, 1) #Calling the above function and updating the Dataframe
df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,Mumbai
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,Mumbai
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,Departed 12:30 AM,D,Delhi (DEL)
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,Departed 12:55 AM,D,Delhi (DEL)
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,2020-03-01 23:55:00,2020-03-01,Mumbai,IndiGo,,D,Chennai (MAA)
25551,2020-03-01 23:58:00,2020-03-01,Colombo (CMB),Air India,,A,Mumbai
25552,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,Mumbai
25553,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,Mumbai


In [8]:
#Function to add the actual time at which the Flight Landed or Departed
def update_actual_time(value):
    # print(value["Status"])
    if type(value["Status"]) == type("String"):
        status = value["Status"].split(" ")
        if len(status) == 3 and status[1] != "to":
            time = value["date"] + " " + status[1] + " " + status[2]
            scheduled_time = pd.to_datetime(
                value["Time"], format="%Y-%m-%d %H:%M:%S")
            actual_time = pd.to_datetime(time, format="%Y-%m-%d %I:%M %p")

            threshold_date = pd.to_datetime(
                value["date"]+" "+"00:00:00", format="%Y-%m-%d %H:%M:%S")

            lb_threshold_date = pd.to_datetime(
                value["date"]+" "+"20:30:00", format="%Y-%m-%d %H:%M:%S")

            ub_threshold_date = pd.to_datetime(
                value["date"]+" "+"03:00:00", format="%Y-%m-%d %H:%M:%S")

            if actual_time <= ub_threshold_date and scheduled_time >= lb_threshold_date:
                actual_time = actual_time + dt.timedelta(days=1)
            elif scheduled_time <= ub_threshold_date and actual_time >= lb_threshold_date:
                actual_time = actual_time - dt.timedelta(days=1)
            timedelta = pd.Timedelta(actual_time - scheduled_time).seconds/3600
            return actual_time
        else:
            return np.nan
    else:
        return np.nan
df["Actual_Time"] = df[["Time", "Status", "date"]].apply(update_actual_time, 1) #Calling the above function and updating the Dataframe
df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination,Actual_Time
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,Mumbai,2020-02-02 00:48:00
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,Mumbai,2020-02-02 00:23:00
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,Departed 12:30 AM,D,Delhi (DEL),2020-02-02 00:30:00
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,Departed 12:55 AM,D,Delhi (DEL),2020-02-02 00:55:00
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK),2020-02-02 00:50:00
...,...,...,...,...,...,...,...,...
25550,2020-03-01 23:55:00,2020-03-01,Mumbai,IndiGo,,D,Chennai (MAA),NaT
25551,2020-03-01 23:58:00,2020-03-01,Colombo (CMB),Air India,,A,Mumbai,NaT
25552,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,Mumbai,2020-03-01 23:49:00
25553,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,Mumbai,NaT


In [9]:
#Function to update the status of the flight
def update_status(value):
    if value != np.nan and type(value) == type("String"):
        value = value.split(" ")[0]
        return value
    else:
        return np.nan
df["Status"] = df["Status"].apply(update_status, 1) #Calling the above function and updating the Dataframe
df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination,Actual_Time
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,Landed,A,Mumbai,2020-02-02 00:48:00
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed,A,Mumbai,2020-02-02 00:23:00
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,Departed,D,Delhi (DEL),2020-02-02 00:30:00
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,Departed,D,Delhi (DEL),2020-02-02 00:55:00
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed,D,Bangkok (BKK),2020-02-02 00:50:00
...,...,...,...,...,...,...,...,...
25550,2020-03-01 23:55:00,2020-03-01,Mumbai,IndiGo,,D,Chennai (MAA),NaT
25551,2020-03-01 23:58:00,2020-03-01,Colombo (CMB),Air India,,A,Mumbai,NaT
25552,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed,A,Mumbai,2020-03-01 23:49:00
25553,2020-03-01 23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,,A,Mumbai,NaT


In [10]:
#Handling missing values filling the missing values with global constant(-9999) 
dataset = pd.read_csv("../Datasets/FinalMergedDataset/dataset.csv")
df1 = pd.DataFrame(
    dataset[
        ["Time", "date", "Source", "Flight Name",
            "Status", "type", "Destination"]
    ]
)

df1.replace(to_replace=np.nan , value= -9999, inplace=True)
df1

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,00:05:00,2020-02-02,Paris (CDG),Air France,Landed 12:48 AM,A,-9999
1,00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed 12:23 AM,A,-9999
2,00:05:00,2020-02-02,-9999,GoAir,Departed 12:30 AM,D,Delhi (DEL)
3,00:05:00,2020-02-02,-9999,AirAsia,Departed 12:55 AM,D,Delhi (DEL)
4,00:05:00,2020-02-02,-9999,Bangkok Airways (Guilin Livery),Departed 12:50 AM,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,23:55:00,2020-03-01,-9999,IndiGo,-9999,D,Chennai (MAA)
25551,23:58:00,2020-03-01,Colombo (CMB),Air India,-9999,A,-9999
25552,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed 11:49 PM,A,-9999
25553,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,-9999,A,-9999


In [11]:
#Replacing the missing value with the value that is occuring most number of times
dataset = pd.read_csv("../Datasets/FinalMergedDataset/dataset.csv")
df2 = pd.DataFrame(
    dataset[
        ["Time", "date", "Source", "Flight Name",
            "Status", "type", "Destination"]
    ]
)
def update_status(value):
    if value != np.nan and type(value) == type("String"):
        value = value.split(" ")[0]
        return value
    else:
        return np.nan
df2["Status"] = df2["Status"].apply(update_status, 1) #Calling the above function and updating the Dataframe
x=df2.Status.mode()
df2.replace(to_replace=np.nan , value= x[0], inplace=True)
df2

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination
0,00:05:00,2020-02-02,Paris (CDG),Air France,Landed,A,Landed
1,00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed,A,Landed
2,00:05:00,2020-02-02,Landed,GoAir,Departed,D,Delhi (DEL)
3,00:05:00,2020-02-02,Landed,AirAsia,Departed,D,Delhi (DEL)
4,00:05:00,2020-02-02,Landed,Bangkok Airways (Guilin Livery),Departed,D,Bangkok (BKK)
...,...,...,...,...,...,...,...
25550,23:55:00,2020-03-01,Landed,IndiGo,Landed,D,Chennai (MAA)
25551,23:58:00,2020-03-01,Colombo (CMB),Air India,Landed,A,Landed
25552,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed,A,Landed
25553,23:59:00,2020-03-01,Ahmedabad (AMD),SpiceJet,Landed,A,Landed


In [12]:
#Dropping the missing values within the Dataframe
df = df.dropna()
df

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination,Actual_Time
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,Landed,A,Mumbai,2020-02-02 00:48:00
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,Landed,A,Mumbai,2020-02-02 00:23:00
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,Departed,D,Delhi (DEL),2020-02-02 00:30:00
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,Departed,D,Delhi (DEL),2020-02-02 00:55:00
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),Departed,D,Bangkok (BKK),2020-02-02 00:50:00
...,...,...,...,...,...,...,...,...
25541,2020-03-01 23:55:00,2020-03-01,Munich (MUC),Lufthansa,Landed,A,Mumbai,2020-03-01 23:46:00
25542,2020-03-01 23:55:00,2020-03-01,Kolkata (CCU),Vistara,Landed,A,Mumbai,2020-03-02 00:02:00
25543,2020-03-01 23:55:00,2020-03-01,Chennai (MAA),IndiGo,Landed,A,Mumbai,2020-03-01 23:21:00
25547,2020-03-01 23:55:00,2020-03-01,Mumbai,Vistara,Departed,D,Singapore (SIN),2020-03-01 23:59:00


In [13]:
#Converting the Nominal Data To Numeric
map_dict = {
    "Landed": 0,
    "Departed": 1,
    "Unknown": 2,
    "Diverted": 3,
    "Canceled": 4,
    "Estimated": -1,
}
df["Status"] = df.Status.replace(map_dict) #replacing the Status with numeric data
df["Status"] = df["Status"].astype(int) #Converting it to int datatype

df #Printing the Updated Dataframe

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination,Actual_Time
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,0,A,Mumbai,2020-02-02 00:48:00
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,0,A,Mumbai,2020-02-02 00:23:00
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,1,D,Delhi (DEL),2020-02-02 00:30:00
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,1,D,Delhi (DEL),2020-02-02 00:55:00
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),1,D,Bangkok (BKK),2020-02-02 00:50:00
...,...,...,...,...,...,...,...,...
25541,2020-03-01 23:55:00,2020-03-01,Munich (MUC),Lufthansa,0,A,Mumbai,2020-03-01 23:46:00
25542,2020-03-01 23:55:00,2020-03-01,Kolkata (CCU),Vistara,0,A,Mumbai,2020-03-02 00:02:00
25543,2020-03-01 23:55:00,2020-03-01,Chennai (MAA),IndiGo,0,A,Mumbai,2020-03-01 23:21:00
25547,2020-03-01 23:55:00,2020-03-01,Mumbai,Vistara,1,D,Singapore (SIN),2020-03-01 23:59:00


In [14]:
#Function to compute the Delay of the flights and update it
def add_delay(value):
    time = pd.to_datetime(value["Time"], format="%Y-%m-%d %H:%M:%S")
    actual_time = pd.to_datetime(
        value["Actual_Time"], format="%Y-%m-%d %H:%M:%S")
    if pd.isnull(actual_time):
        return 0.0
    else:
        return pd.Timedelta(abs(time - actual_time)).seconds / 60
df["Delay"] = df[["Time", "Actual_Time"]].apply(add_delay, 1)
df

Unnamed: 0,Time,date,Source,Flight Name,Status,type,Destination,Actual_Time,Delay
0,2020-02-02 00:05:00,2020-02-02,Paris (CDG),Air France,0,A,Mumbai,2020-02-02 00:48:00,43.0
1,2020-02-02 00:05:00,2020-02-02,Hyderabad (HYD),IndiGo,0,A,Mumbai,2020-02-02 00:23:00,18.0
2,2020-02-02 00:05:00,2020-02-02,Mumbai,GoAir,1,D,Delhi (DEL),2020-02-02 00:30:00,25.0
3,2020-02-02 00:05:00,2020-02-02,Mumbai,AirAsia,1,D,Delhi (DEL),2020-02-02 00:55:00,50.0
4,2020-02-02 00:05:00,2020-02-02,Mumbai,Bangkok Airways (Guilin Livery),1,D,Bangkok (BKK),2020-02-02 00:50:00,45.0
...,...,...,...,...,...,...,...,...,...
25541,2020-03-01 23:55:00,2020-03-01,Munich (MUC),Lufthansa,0,A,Mumbai,2020-03-01 23:46:00,9.0
25542,2020-03-01 23:55:00,2020-03-01,Kolkata (CCU),Vistara,0,A,Mumbai,2020-03-02 00:02:00,7.0
25543,2020-03-01 23:55:00,2020-03-01,Chennai (MAA),IndiGo,0,A,Mumbai,2020-03-01 23:21:00,34.0
25547,2020-03-01 23:55:00,2020-03-01,Mumbai,Vistara,1,D,Singapore (SIN),2020-03-01 23:59:00,4.0


In [15]:
#Printing the clean Dataframe
cleaned_df = pd.DataFrame(
    df[
        [
            "Source",
            "Destination",
            "Flight Name",
            "type",
            "Status",
            "Time",
            "Actual_Time",
            "Delay",
        ]
    ]
)
with open("../Datasets/FinalMergedDataset/cleaned_dataset.csv", "w") as f:
        cleaned_df.to_csv(path_or_buf=f, index=False)
cleaned_df


Unnamed: 0,Source,Destination,Flight Name,type,Status,Time,Actual_Time,Delay
0,Paris (CDG),Mumbai,Air France,A,0,2020-02-02 00:05:00,2020-02-02 00:48:00,43.0
1,Hyderabad (HYD),Mumbai,IndiGo,A,0,2020-02-02 00:05:00,2020-02-02 00:23:00,18.0
2,Mumbai,Delhi (DEL),GoAir,D,1,2020-02-02 00:05:00,2020-02-02 00:30:00,25.0
3,Mumbai,Delhi (DEL),AirAsia,D,1,2020-02-02 00:05:00,2020-02-02 00:55:00,50.0
4,Mumbai,Bangkok (BKK),Bangkok Airways (Guilin Livery),D,1,2020-02-02 00:05:00,2020-02-02 00:50:00,45.0
...,...,...,...,...,...,...,...,...
25541,Munich (MUC),Mumbai,Lufthansa,A,0,2020-03-01 23:55:00,2020-03-01 23:46:00,9.0
25542,Kolkata (CCU),Mumbai,Vistara,A,0,2020-03-01 23:55:00,2020-03-02 00:02:00,7.0
25543,Chennai (MAA),Mumbai,IndiGo,A,0,2020-03-01 23:55:00,2020-03-01 23:21:00,34.0
25547,Mumbai,Singapore (SIN),Vistara,D,1,2020-03-01 23:55:00,2020-03-01 23:59:00,4.0
