# In this notebook...

## I will be downcasting datatypes for the combined filtered all_flights dataset.
## This dataset is quite large and consumes a lot of memory. 
## Downcasting datatypes (i.e. from int64 to int32 or int 8) will reduce memory requirements and increase efficiency.
## I will also be conducting another search for any mixed datatype columns to address the Dtype Warning that occurs when calling the dataset in pandas.

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Set path

path = r'D:\Data Analytics\Flight Delay Analysis\01 - Data\Wrangled Data'

In [3]:
# Import Datasets

df = pd.read_csv(os.path.join(path, 'all_flights.csv'), index_col = 0)

  df = pd.read_csv(os.path.join(path, 'all_flights.csv'), index_col = 0)


In [4]:
df.shape

(12608046, 47)

In [5]:
df.head()

Unnamed: 0,Year,Quarter,Month,Day_of_Month,Day_of_Week,Flight_Date,Marketing_Airline_Network_Code,Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,...,Distance_Group,Carrier_Delay,Weather_Delay,NAS_Delay,Security_Delay,Late_Aircraft_Delay,Duplicate,Missing_Due_To_Cancellation,Airline,Marketing_Airline_Network
5,2019,1,1,4,5,2019-01-04,UA,OO,N114SY,5661,...,3,,,,,,N,False,SkyWest Airlines Inc.,United Airlines Inc.
6,2019,1,1,4,5,2019-01-04,UA,OO,N945SW,5664,...,3,,,,,,N,False,SkyWest Airlines Inc.,United Airlines Inc.
9,2019,1,1,4,5,2019-01-04,UA,OO,N145SY,5668,...,3,,,,,,N,False,SkyWest Airlines Inc.,United Airlines Inc.
11,2019,1,1,4,5,2019-01-04,UA,OO,N693BR,5670,...,3,13.0,0.0,3.0,0.0,10.0,N,False,SkyWest Airlines Inc.,United Airlines Inc.
12,2019,1,1,4,5,2019-01-04,UA,OO,N213SY,5671,...,1,,,,,,N,False,SkyWest Airlines Inc.,United Airlines Inc.


In [6]:
df.tail()

Unnamed: 0,Year,Quarter,Month,Day_of_Month,Day_of_Week,Flight_Date,Marketing_Airline_Network_Code,Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,...,Distance_Group,Carrier_Delay,Weather_Delay,NAS_Delay,Security_Delay,Late_Aircraft_Delay,Duplicate,Missing_Due_To_Cancellation,Airline,Marketing_Airline_Network
7013492,2022,4,12,1,4,2022-12-01,UA,YX,N739YX,3414,...,1,,,,,,N,False,Republic Airlines,United Airlines Inc.
7013493,2022,4,12,1,4,2022-12-01,UA,YX,N643RW,3413,...,2,0.0,0.0,0.0,0.0,48.0,N,False,Republic Airlines,United Airlines Inc.
7013494,2022,4,12,1,4,2022-12-01,UA,YX,N729YX,3410,...,3,,,,,,N,False,Republic Airlines,United Airlines Inc.
7013495,2022,4,12,1,4,2022-12-01,UA,YX,N646RW,3409,...,1,,,,,,N,False,Republic Airlines,United Airlines Inc.
7013500,2022,4,12,1,4,2022-12-01,UA,YX,N750YX,3401,...,2,,,,,,N,False,Republic Airlines,United Airlines Inc.


In [7]:
datatypes = df.dtypes

In [8]:
print(datatypes)

Year                                 int64
Quarter                              int64
Month                                int64
Day_of_Month                         int64
Day_of_Week                          int64
Flight_Date                         object
Marketing_Airline_Network_Code      object
Operating_Airline                   object
Tail_Number                         object
Flight_Number_Operating_Airline      int64
Origin                              object
Origin_City_Name                    object
Origin_State                        object
Origin_State_Name                   object
Dest                                object
Dest_City_Name                      object
Dest_State                          object
Dest_State_Name                     object
Dep_Time                           float64
Dep_Delay                          float64
Dep_Delay_Minutes                  float64
Dep_Del_15                         float64
Departure_Delay_Groups             float64
Dep_Time_Bl

## Downcasting Datatypes

In [9]:
# use the pd.to_numeric() function to downcast numeric data:

df['Year'] = pd.to_numeric(df['Year'], downcast='integer')

In [10]:
# Check Year dtype to ensure the change

df['Year'].dtype

dtype('int16')

In [11]:
# Downcast remaining integers and floats

df['Quarter'] = pd.to_numeric(df['Quarter'], downcast='integer')

In [12]:
df['Month'] = pd.to_numeric(df['Month'], downcast='integer')

In [13]:
df['Day_of_Month'] = pd.to_numeric(df['Day_of_Month'], downcast='integer')

In [14]:
df['Day_of_Week'] = pd.to_numeric(df['Day_of_Week'], downcast='integer')

In [15]:
df['Flight_Number_Operating_Airline'] = pd.to_numeric(df['Flight_Number_Operating_Airline'], downcast='integer')

In [16]:
df['Dep_Time'] = pd.to_numeric(df['Dep_Time'], downcast='float')

In [17]:
df['Dep_Delay'] = pd.to_numeric(df['Dep_Delay'], downcast='float')

In [18]:
df['Dep_Delay_Minutes'] = pd.to_numeric(df['Dep_Delay_Minutes'], downcast='float')

In [19]:
df['Dep_Del_15'] = pd.to_numeric(df['Dep_Del_15'], downcast='float')

In [20]:
df['Departure_Delay_Groups'] = pd.to_numeric(df['Departure_Delay_Groups'], downcast='float')

In [21]:
# Inspect Dep_Time_Blk to see why it is an object

df['Dep_Time_Blk'].value_counts()

Dep_Time_Blk
0800-0859    1022248
1000-1059     908269
0900-0959     838549
1500-1559     815428
1100-1159     768284
1800-1859     760737
1900-1959     748944
1200-1259     737611
0700-0759     730142
1300-1359     729329
1700-1759     729248
1400-1459     709326
2000-2059     693076
1600-1659     674666
0600-0659     568006
2100-2159     513913
2200-2259     343971
0001-0559     187426
2300-2359     128873
Name: count, dtype: int64

### The time blocks are an object datatype because they contain strings (with '-' between each time range).

In [22]:
# Continue downcasting remaining variables

df['Arr_Time'] = pd.to_numeric(df['Arr_Time'], downcast='float')

In [23]:
df['Arr_Delay_Minutes'] = pd.to_numeric(df['Arr_Delay_Minutes'], downcast='float')

In [24]:
df['Arr_Del_15'] = pd.to_numeric(df['Arr_Del_15'], downcast='float')

In [25]:
df['Arrival_Delay_Groups'] = pd.to_numeric(df['Arrival_Delay_Groups'], downcast='float')

In [26]:
# Inspect Cancelled to see why its a float

df['Cancelled'].value_counts()

Cancelled
0.0    12262329
1.0      345717
Name: count, dtype: int64

In [27]:
# Downcast Cancelled and continue with remaining columns

df['Cancelled'] = pd.to_numeric(df['Cancelled'], downcast='float')

In [28]:
# Convert Cancellation Code to a Category

df['Cancellation_Code'] = df['Cancellation_Code'].astype('category')

In [29]:
df['Cancellation_Code'].value_counts()

Cancellation_Code
B    120333
D    112048
A     82459
C     30877
Name: count, dtype: int64

In [30]:
df['Diverted'] = pd.to_numeric(df['Diverted'], downcast='float')

In [31]:
df['Actual_Elapsed_Time'] = pd.to_numeric(df['Actual_Elapsed_Time'], downcast='float')

In [32]:
df['Air_Time'] = pd.to_numeric(df['Air_Time'], downcast='float')

In [33]:
df['Flights'] = pd.to_numeric(df['Flights'], downcast='float')

In [34]:
df['Distance'] = pd.to_numeric(df['Distance'], downcast='float')

In [35]:
df['Distance_Group'] = pd.to_numeric(df['Distance_Group'], downcast='integer')

In [36]:
df['Carrier_Delay'] = pd.to_numeric(df['Carrier_Delay'], downcast='float')

In [37]:
df['Weather_Delay'] = pd.to_numeric(df['Weather_Delay'], downcast='float')

In [38]:
df['NAS_Delay'] = pd.to_numeric(df['NAS_Delay'], downcast='float')

In [39]:
df['Security_Delay'] = pd.to_numeric(df['Security_Delay'], downcast='float')

In [40]:
df['Late_Aircraft_Delay'] = pd.to_numeric(df['Late_Aircraft_Delay'], downcast='float')

In [41]:
# Check to ensure changes made

datatypes = df.dtypes

In [42]:
print(datatypes)

Year                                  int16
Quarter                                int8
Month                                  int8
Day_of_Month                           int8
Day_of_Week                            int8
Flight_Date                          object
Marketing_Airline_Network_Code       object
Operating_Airline                    object
Tail_Number                          object
Flight_Number_Operating_Airline       int16
Origin                               object
Origin_City_Name                     object
Origin_State                         object
Origin_State_Name                    object
Dest                                 object
Dest_City_Name                       object
Dest_State                           object
Dest_State_Name                      object
Dep_Time                            float32
Dep_Delay                           float32
Dep_Delay_Minutes                   float32
Dep_Del_15                          float32
Departure_Delay_Groups          

## Search for any remaining Mixed Datatypes and correct.

In [44]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [45]:
for col in df.columns.tolist():
      weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
      if len (df[weird]) > 0:
        print (col)

Cancellation_Code


In [46]:
# Correct the Cancellation_Code Mixed Datatype
# There are no missing values

df['Cancellation_Code'] = df['Cancellation_Code'].astype(str)

In [47]:
# Export as .csv for further analysis

df.to_csv(os.path.join(path, 'all_flights.csv'))