Data Description: Google "BTS On Time Performance"

Walkthrough for dealing with Big Data / multiple CSVs in Pandas [link](https://www.dataquest.io/blog/pandas-big-data/)

In [0]:
# download zip
from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving on_time_12mos.zip to on_time_12mos.zip
User uploaded file "on_time_12mos.zip" with length 303256386 bytes


In [0]:
!ls

on_time_12mos.zip  sample_data


In [0]:
!unzip on_time_12mos.zip -d data/

Archive:  on_time_12mos.zip
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2014_10.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2014_11.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2014_12.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_1.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_2.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_3.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_4.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_5.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_6.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_7.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_8.csv  
  inflating: data/on_time_12mos/On_Time_On_Time_Performance_2015_9.csv  


In [0]:
!ls

data  on_time_12mos.zip  sample_data


In [0]:
!rm on_time_12mos.zip

In [0]:
%cd data/on_time_12mos/

/content/data/on_time_12mos


In [0]:
!ls

On_Time_On_Time_Performance_2014_10.csv  On_Time_On_Time_Performance_2015_4.csv
On_Time_On_Time_Performance_2014_11.csv  On_Time_On_Time_Performance_2015_5.csv
On_Time_On_Time_Performance_2014_12.csv  On_Time_On_Time_Performance_2015_6.csv
On_Time_On_Time_Performance_2015_1.csv	 On_Time_On_Time_Performance_2015_7.csv
On_Time_On_Time_Performance_2015_2.csv	 On_Time_On_Time_Performance_2015_8.csv
On_Time_On_Time_Performance_2015_3.csv	 On_Time_On_Time_Performance_2015_9.csv


# Imports

In [0]:
import pandas as pd
import numpy as np
import os
import glob

# Load Data Sample

In [0]:
df = pd.read_csv('On_Time_On_Time_Performance_2014_10.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,UniqueCarrier,AirlineID,Carrier,TailNum,...,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2014,4,10,23,4,2014-10-23,DL,19790,DL,N918DL,...,,,,,,,,,,
1,2014,4,10,23,4,2014-10-23,DL,19790,DL,N358NW,...,,,,,,,,,,
2,2014,4,10,23,4,2014-10-23,DL,19790,DL,N893AT,...,,,,,,,,,,
3,2014,4,10,23,4,2014-10-23,DL,19790,DL,N693DL,...,,,,,,,,,,
4,2014,4,10,23,4,2014-10-23,DL,19790,DL,N998AT,...,,,,,,,,,,


In [0]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491011 entries, 0 to 491010
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(70), int64(21), object(19)
memory usage: 831.9 MB


In [0]:
df.shape

(491011, 110)

In [0]:
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))


Average memory usage for float columns: 3.69 MB
Average memory usage for int columns: 3.58 MB
Average memory usage for object columns: 24.55 MB


# Eliminate Unneeded Columns

In [0]:
rows = df.shape[0]
delete_col_list = []

for i in df:
    if df[i].dtype == 'object':
        if df[i].isnull().sum() > 0: 
            if (df[i].isnull().sum() / rows) > 0.98:
                delete_col_list.append(i)

In [0]:
df_obj = df.select_dtypes(include=['object']).copy()
print(df_obj.describe())

        FlightDate UniqueCarrier Carrier TailNum  Origin OriginCityName  \
count       491011        491011  491011  490459  491011         491011   
unique          31            14      14    4344     309            305   
top     2014-10-23            WN      WN  N480HA     ATL    Chicago, IL   
freq         16753         99257   99257     380   31618          34108   

       OriginState OriginStateName    Dest DestCityName DestState  \
count       491011          491011  491011       491011    491011   
unique          53              53     308          304        53   
top             CA      California     ATL  Chicago, IL        CA   
freq         62370           62370   31613        34160     62358   

       DestStateName DepTimeBlk ArrTimeBlk CancellationCode Div1Airport  \
count         491011     491011     491011             5206         841   
unique            53         19         19                3         149   
top       California  0800-0859  1600-1659           

In [0]:
object_list = []

for i in df:
    if df[i].dtype == 'O':
        object_list.append(i)
        
print(object_list)

['FlightDate', 'UniqueCarrier', 'Carrier', 'TailNum', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateName', 'Dest', 'DestCityName', 'DestState', 'DestStateName', 'DepTimeBlk', 'ArrTimeBlk', 'CancellationCode', 'Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum']


In [0]:
# delete redundant columns
redundant_cols = ['UniqueCarrier', 'OriginCityName', 'OriginStateName', 'DestCityName', 'DestStateName', 'FlightDate', 
                 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'OriginStateFips', 
                 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestStateFips']

In [0]:
high_null_list = []

for i in df:
    if df[i].notnull().sum() < 10:
        high_null_list.append(i)

In [0]:
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'UniqueCarrier', 'AirlineID', 'Carrier', 'TailNum',
       ...
       'Div4TailNum', 'Div5Airport', 'Div5AirportID', 'Div5AirportSeqID',
       'Div5WheelsOn', 'Div5TotalGTime', 'Div5LongestGTime', 'Div5WheelsOff',
       'Div5TailNum', 'Unnamed: 109'],
      dtype='object', length=110)

In [0]:
other_divert_measures = ['Div1AirportID', 'Div1AirportSeqID', 'Div1WheelsOn', 'Div1TotalGTime', 
                         'Div1LongestGTime', 'Div1WheelsOff', 'DivActualElapsedTime', 'DivArrDelay', 'DivDistance']

In [0]:
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'UniqueCarrier', 'AirlineID', 'Carrier', 'TailNum',
       ...
       'Div4TailNum', 'Div5Airport', 'Div5AirportID', 'Div5AirportSeqID',
       'Div5WheelsOn', 'Div5TotalGTime', 'Div5LongestGTime', 'Div5WheelsOff',
       'Div5TailNum', 'Unnamed: 109'],
      dtype='object', length=110)

In [0]:
unpredictive_measures = ['FlightNum', 'DepTime', 'DepDelay', 'DepDelayMinutes', 'DepDel15', 
                         'DepartureDelayGroups', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes', 
                         'ArrDel15', 'ArrivalDelayGroups', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 
                         'FirstDepTime', 'TotalAddGTime', 'LongestAddGTime', 'AirTime', 'ActualElapsedTime']

In [0]:
delete_col_list = delete_col_list + high_null_list + redundant_cols + other_divert_measures + unpredictive_measures
print(delete_col_list)

['CancellationCode', 'Div1Airport', 'Div1TailNum', 'Div2Airport', 'Div2TailNum', 'Div2Airport', 'Div2AirportID', 'Div2AirportSeqID', 'Div2WheelsOn', 'Div2TotalGTime', 'Div2LongestGTime', 'Div2WheelsOff', 'Div2TailNum', 'Div3Airport', 'Div3AirportID', 'Div3AirportSeqID', 'Div3WheelsOn', 'Div3TotalGTime', 'Div3LongestGTime', 'Div3WheelsOff', 'Div3TailNum', 'Div4Airport', 'Div4AirportID', 'Div4AirportSeqID', 'Div4WheelsOn', 'Div4TotalGTime', 'Div4LongestGTime', 'Div4WheelsOff', 'Div4TailNum', 'Div5Airport', 'Div5AirportID', 'Div5AirportSeqID', 'Div5WheelsOn', 'Div5TotalGTime', 'Div5LongestGTime', 'Div5WheelsOff', 'Div5TailNum', 'Unnamed: 109', 'UniqueCarrier', 'OriginCityName', 'OriginStateName', 'DestCityName', 'DestStateName', 'FlightDate', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'OriginStateFips', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestStateFips', 'Div1AirportID', 'Div1AirportSeqID', 'Div1WheelsOn', 'Div1TotalGTime', 'Div1LongestGTime', 'Di

In [0]:
df = df.drop(delete_col_list, axis=1)

In [0]:
df.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491011 entries, 0 to 491010
Columns: 31 entries, Year to DivReachedDest
dtypes: float64(11), int64(12), object(8)
memory usage: 316.5 MB


# Some Dictionary Functions

In [0]:
# function to return first 10 key-value pairs in a dictionary
def dict_top_ten(dict):
    return {k: dict[k] for k in list(dict)[:10]}

# function to return last 10 key-value paris in a dictionary
def dict_last_ten(dict):
    return {k: dict[k] for k in list(dict)[-10:]}

# convert defaultdict into a dict
def defaultdict_to_dict(dict):
    return {k: dict[k] for k in list(dict)[:]}

# Object Value Counts

In [0]:
def print_object_vcs_and_nulls(df, num_values_print=10):
    for i in df:
        if df[i].dtype == 'O':
            num_nulls = df[i].isnull().sum()
            total_values = df[i].shape[0]
            null_per = str(np.round(((num_nulls / total_values) * 100),3)) + "%"
            unique_values = df[i].nunique()
            
            print(df[i].value_counts()[:num_values_print])  
            print("Number of Null Values: " + str(num_nulls))
            print("Percentage of Nulls = " + str(null_per) + "%")
            print("Number of Unique Values: " + str(unique_values))
            print("\n")

In [0]:
print_object_vcs_and_nulls(df)

WN    99257
DL    70676
EV    56533
OO    51019
AA    45113
UA    43335
US    35634
MQ    32010
B6    20113
AS    13398
Name: Carrier, dtype: int64
Number of Null Values: 0
Percentage of Nulls = 0.0%%
Number of Unique Values: 14


N480HA    380
N479HA    367
N476HA    365
N477HA    353
N484HA    348
N493HA    344
N485HA    339
N487HA    331
N483HA    322
N478HA    311
Name: TailNum, dtype: int64
Number of Null Values: 552
Percentage of Nulls = 0.112%%
Number of Unique Values: 4344


ATL    31618
ORD    26257
DFW    23519
DEN    19416
LAX    18199
IAH    14742
SFO    14499
PHX    13347
LAS    11779
CLT     9755
Name: Origin, dtype: int64
Number of Null Values: 0
Percentage of Nulls = 0.0%%
Number of Unique Values: 309


CA    62370
TX    61000
IL    35559
GA    33089
FL    32096
NY    22448
CO    20966
AZ    15210
NC    14606
VA    14580
Name: OriginState, dtype: int64
Number of Null Values: 0
Percentage of Nulls = 0.0%%
Number of Unique Values: 53


ATL    31613
ORD    26310
DFW    235

In [0]:
object_list = []

for i in df:
    if df[i].dtype  == 'object':
        object_list.append(i)
        
object_list

['Carrier',
 'TailNum',
 'Origin',
 'OriginState',
 'Dest',
 'DestState',
 'DepTimeBlk',
 'ArrTimeBlk']

# Map TailNum; Fill in Nulls

In [0]:
print(df['TailNum'].isnull().sum())
print(df['TailNum'].nunique())

552
4344


In [0]:
df['TailNum'].fillna('Unknown', inplace=True)

In [0]:
tailnum_list = df['TailNum'].value_counts().index.tolist()
tailnum_list[:5]

['Unknown', 'N480HA', 'N479HA', 'N476HA', 'N477HA']

In [0]:
tailnum_dict = dict()
for x in range(0, (len(tailnum_list))):
    tailnum_dict[tailnum_list[x]] = x
    
dict_top_ten(tailnum_dict)

{'N476HA': 3,
 'N477HA': 4,
 'N479HA': 2,
 'N480HA': 1,
 'N483HA': 9,
 'N484HA': 5,
 'N485HA': 7,
 'N487HA': 8,
 'N493HA': 6,
 'Unknown': 0}

In [0]:
df['TailNum'] = df['TailNum'].map(tailnum_dict).astype(int)

# Map Carriers

In [0]:
df['Carrier'].value_counts()

WN    99257
DL    70676
EV    56533
OO    51019
AA    45113
UA    43335
US    35634
MQ    32010
B6    20113
AS    13398
F9     8211
HA     6329
VX     4722
FL     4661
Name: Carrier, dtype: int64

In [0]:
carrier_list = df['Carrier'].value_counts().index.tolist()
carrier_list

['WN',
 'DL',
 'EV',
 'OO',
 'AA',
 'UA',
 'US',
 'MQ',
 'B6',
 'AS',
 'F9',
 'HA',
 'VX',
 'FL']

In [0]:
carrier_dict = dict()
for x in range(0, (len(carrier_list))):
    carrier_dict[carrier_list[x]] = x
    
carrier_dict

{'AA': 4,
 'AS': 9,
 'B6': 8,
 'DL': 1,
 'EV': 2,
 'F9': 10,
 'FL': 13,
 'HA': 11,
 'MQ': 7,
 'OO': 3,
 'UA': 5,
 'US': 6,
 'VX': 12,
 'WN': 0}

In [0]:
df['Carrier'] = df['Carrier'].map(carrier_dict).astype(int)

# Map States

In [0]:
state_list = df['OriginState'].value_counts().index.tolist()
state_list[:5]

['CA', 'TX', 'IL', 'GA', 'FL']

In [0]:
state_dict = dict()
for x in range(0, (len(state_list))):
    state_dict[state_list[x]] = x
    
dict_top_ten(state_dict)

{'AZ': 7,
 'CA': 0,
 'CO': 6,
 'FL': 4,
 'GA': 3,
 'IL': 2,
 'NC': 8,
 'NY': 5,
 'TX': 1,
 'VA': 9}

In [0]:
state_dict

{'AK': 30,
 'AL': 31,
 'AR': 33,
 'AZ': 7,
 'CA': 0,
 'CO': 6,
 'CT': 36,
 'DE': 51,
 'FL': 4,
 'GA': 3,
 'HI': 19,
 'IA': 37,
 'ID': 38,
 'IL': 2,
 'IN': 28,
 'KS': 40,
 'KY': 26,
 'LA': 23,
 'MA': 13,
 'MD': 20,
 'ME': 47,
 'MI': 11,
 'MN': 14,
 'MO': 16,
 'MS': 42,
 'MT': 41,
 'NC': 8,
 'ND': 39,
 'NE': 34,
 'NH': 46,
 'NJ': 18,
 'NM': 32,
 'NV': 10,
 'NY': 5,
 'OH': 22,
 'OK': 27,
 'OR': 24,
 'PA': 17,
 'PR': 35,
 'RI': 43,
 'SC': 29,
 'SD': 44,
 'TN': 21,
 'TT': 52,
 'TX': 1,
 'UT': 15,
 'VA': 9,
 'VI': 49,
 'VT': 48,
 'WA': 12,
 'WI': 25,
 'WV': 50,
 'WY': 45}

In [0]:
df['OriginState'] = df['OriginState'].map(state_dict).astype(int)

In [0]:
df['DestState'] = df['DestState'].map(state_dict).astype(int)

# Map Time Blocks

In [0]:
time_list = sorted(df['DepTimeBlk'].value_counts().index.tolist())
time_list

['0001-0559',
 '0600-0659',
 '0700-0759',
 '0800-0859',
 '0900-0959',
 '1000-1059',
 '1100-1159',
 '1200-1259',
 '1300-1359',
 '1400-1459',
 '1500-1559',
 '1600-1659',
 '1700-1759',
 '1800-1859',
 '1900-1959',
 '2000-2059',
 '2100-2159',
 '2200-2259',
 '2300-2359']

In [0]:
time_dict = dict()
for x in range(0, (len(time_list))):
    time_dict[time_list[x]] = x
    
dict_top_ten(time_dict)

{'0001-0559': 0,
 '0600-0659': 1,
 '0700-0759': 2,
 '0800-0859': 3,
 '0900-0959': 4,
 '1000-1059': 5,
 '1100-1159': 6,
 '1200-1259': 7,
 '1300-1359': 8,
 '1400-1459': 9}

In [0]:
df['DepTimeBlk'] = df['DepTimeBlk'].map(time_dict).astype(int)
df['ArrTimeBlk'] = df['ArrTimeBlk'].map(time_dict).astype(int)

# Map Destination and Origin

In [0]:
airport_list = df['Origin'].value_counts().index.tolist()
airport_list.append('Other')
airport_list = airport_list[::-1]
print(airport_list[:5])
print(airport_list[-10:])

['Other', 'GFK', 'EGE', 'ADK', 'PPG']
['CLT', 'LAS', 'PHX', 'SFO', 'IAH', 'LAX', 'DEN', 'DFW', 'ORD', 'ATL']


In [0]:
airport_dict = dict()
for x in range(0, (len(airport_list))):
    airport_dict[airport_list[x]] = x
    
dict_top_ten(airport_dict)

{'ACK': 7,
 'ADK': 3,
 'BKG': 8,
 'EGE': 2,
 'GFK': 1,
 'MQT': 9,
 'MTJ': 6,
 'MVY': 5,
 'Other': 0,
 'PPG': 4}

In [0]:
dict_last_ten(airport_dict)

{'ATL': 309,
 'CLT': 300,
 'DEN': 306,
 'DFW': 307,
 'IAH': 304,
 'LAS': 301,
 'LAX': 305,
 'ORD': 308,
 'PHX': 302,
 'SFO': 303}

In [0]:
df['Dest'] = df['Dest'].map(airport_dict)
df['Origin'] = df['Origin'].map(airport_dict)

In [0]:
df['Dest'] = df['Dest'].fillna(0).astype(int)
df['Origin'] = df['Origin'].fillna(0).astype(int)

In [0]:
df.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491011 entries, 0 to 491010
Columns: 31 entries, Year to DivReachedDest
dtypes: float64(11), int64(20)
memory usage: 116.1 MB


# Impute Nulls for Floats

In [0]:
def print_float_nulls(df):
    for i in df:
        if df[i].dtype == 'float':
            num_nulls = df[i].isnull().sum()
            total_values = df[i].shape[0]
            non_nulls = total_values - num_nulls
            null_per = str(np.round(((num_nulls / total_values) * 100),3)) + "%"
            unique_values = df[i].nunique()
            
            print("========" + i + "========")
            print("Number of Null Values: " + str(num_nulls))
            print("Number of Non-Null Values: " + str(non_nulls))
            print("Percentage of Nulls = " + str(null_per) + "%")
            print("Number of Unique Values: " + str(unique_values))
            print("\n")

In [0]:
print_float_nulls(df)

Number of Null Values: 0
Number of Non-Null Values: 491011
Percentage of Nulls = 0.0%%
Number of Unique Values: 2


Number of Null Values: 0
Number of Non-Null Values: 491011
Percentage of Nulls = 0.0%%
Number of Unique Values: 2


Number of Null Values: 3
Number of Non-Null Values: 491008
Percentage of Nulls = 0.001%%
Number of Unique Values: 424


Number of Null Values: 0
Number of Non-Null Values: 491011
Percentage of Nulls = 0.0%%
Number of Unique Values: 1


Number of Null Values: 0
Number of Non-Null Values: 491011
Percentage of Nulls = 0.0%%
Number of Unique Values: 1193


Number of Null Values: 398726
Number of Non-Null Values: 92285
Percentage of Nulls = 81.205%%
Number of Unique Values: 493


Number of Null Values: 398726
Number of Non-Null Values: 92285
Percentage of Nulls = 81.205%%
Number of Unique Values: 246


Number of Null Values: 398726
Number of Non-Null Values: 92285
Percentage of Nulls = 81.205%%
Number of Unique Values: 329


Number of Null Values: 398726
Number o

In [0]:
delay_list = [ 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

In [0]:
for i in df:
    if i in delay_list:
        df[i] = df[i].fillna(0.0).astype(int)

In [0]:
df.columns

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'AirlineID',
       'Carrier', 'TailNum', 'Origin', 'OriginState', 'OriginWac', 'Dest',
       'DestState', 'DestWac', 'CRSDepTime', 'DepTimeBlk', 'CRSArrTime',
       'ArrTimeBlk', 'Cancelled', 'Diverted', 'CRSElapsedTime', 'Flights',
       'Distance', 'DistanceGroup', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
       'SecurityDelay', 'LateAircraftDelay', 'DivAirportLandings',
       'DivReachedDest'],
      dtype='object')

In [0]:
df['CRSDepTime'][3]

2015

In [0]:
df['CRSArrTime'][3]

2127

In [0]:
df['CRSElapsedTime'][3]

132.0

In [0]:
df['CRSElapsedTime'] = df['CRSElapsedTime'].fillna(df['CRSElapsedTime'].mean()).astype(int)

In [0]:
df.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491011 entries, 0 to 491010
Columns: 31 entries, Year to DivReachedDest
dtypes: float64(5), int64(26)
memory usage: 116.1 MB


# Downcast Numeric Features

In [0]:
int_types = ["uint8", "uint16", "uint32", "int8", "int16", "int32"]
for it in int_types:
    print(np.iinfo(it))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for uint16
---------------------------------------------------------------
min = 0
max = 65535
---------------------------------------------------------------

Machine parameters for uint32
---------------------------------------------------------------
min = 0
max = 4294967295
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -

In [0]:
def col_min_max(df):
    for i in df:
        if df[i].dtype == 'int64':
            if df[i].min() >= 0:
                if df[i].max() > 4294967295:
                    print(i, 'uint64')
                elif df[i].max() > 65535:
                    print(i, 'uint32')
                elif df[i].max() > 255:
                    print(i, 'uint16')
                else: 
                    print(i, 'uint8')
      
col_min_max(df)

Year uint16
Quarter uint8
Month uint8
DayofMonth uint8
DayOfWeek uint8
AirlineID uint16
Carrier uint8
TailNum uint16
Origin uint16
OriginState uint8
OriginWac uint8
Dest uint16
DestState uint8
DestWac uint8
CRSDepTime uint16
DepTimeBlk uint8
CRSArrTime uint16
ArrTimeBlk uint8
CRSElapsedTime uint16
DistanceGroup uint8
CarrierDelay uint16
WeatherDelay uint16
NASDelay uint16
SecurityDelay uint8
LateAircraftDelay uint16
DivAirportLandings uint8


In [0]:
float_list = []

for i in df:
    if df[i].dtype == 'float64':
        float_list.append(i)
        
float_list[:5]

['Cancelled', 'Diverted', 'Flights', 'Distance', 'DivReachedDest']

In [0]:
# check for nulls
float_df = df[float_list]

In [0]:
df[float_list] = df[float_list].fillna(0.0).astype(int)

In [0]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

df_int = df.select_dtypes(include=['int'])
converted_int = df_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(df_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([df_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

116.13 MB
20.60 MB


Unnamed: 0,before,after
uint8,,18.0
uint16,,13.0
int64,31.0,


In [0]:
df_float = df.select_dtypes(include=['float'])
converted_float = df_float.apply(pd.to_numeric,downcast='float')

print(mem_usage(df_float))
print(mem_usage(converted_float))

compare_floats = pd.concat([df_float.dtypes,converted_float.dtypes],axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)

0.00 MB
0.00 MB


Unnamed: 0,before,after


In [0]:
# df = df.copy()

df[converted_int.columns] = converted_int
df[converted_float.columns] = converted_float

print(mem_usage(df))
# print(mem_usage(optimized_df))

20.60 MB


In [0]:
# https://www.dataquest.io/blog/pandas-big-data/

In [0]:
df.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491011 entries, 0 to 491010
Columns: 31 entries, Year to DivReachedDest
dtypes: uint16(13), uint8(18)
memory usage: 20.6 MB


In [0]:
int_types = ['uint16', 'uint8']
float_types = ['float32']

In [0]:
df_uint8 = df.select_dtypes(include=['uint8'])
df_uint16 = df.select_dtypes(include=['uint16'])
df_float = df.select_dtypes(include=float_types)
df_object = df.select_dtypes(include=['object'])

In [0]:
print("U-Int8: ", mem_usage(df_uint8))
print("U-Int16: ", mem_usage(df_uint16))
print("Floats: ", mem_usage(df_float))
print("Objects: ", mem_usage(df_object))
print("Total: ", mem_usage(df))

U-Int8:  8.43 MB
U-Int16:  12.17 MB
Floats:  0.00 MB
Objects:  0.00 MB
Total:  20.60 MB
