## Data Cleaning/Pre-processing

- Sort by DAY_OF_MONTH
- Delete rows where CANCELLED is 1
- Delete rows where ORIGIN is 'LAS'
- Delete rows where DEST is not 'LAS'
- Remove the CANCELLED column and the trailing empty 'Unnamed: N' column ('Unnamed: 20' in this file)
- Fill 'NaN's with 0
- Rename columns to lowercase letters
- Export the cleaned data to CSV

In [25]:
import pandas as pd
import numpy as np

In [26]:
# Input file for this run: LAS delays, January 2019
csv_name = 'LAS_delays_January2019.csv'
# Build the path first so the read call stays short and the location is easy to spot
raw_csv_path = 'Datasets/Unprocessed/' + csv_name
my_data = pd.read_csv(raw_csv_path)

In [27]:
# Preview the first rows of the raw data as loaded from the CSV
my_data.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2019,1,5,6,WN,N8626B,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,24.0,24.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,
1,2019,1,5,6,WN,N7818L,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,8.0,8.0,0.0,0.0,,,,,,
2,2019,1,5,6,WN,N746SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-7.0,0.0,0.0,0.0,,,,,,
3,2019,1,5,6,WN,N790SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-11.0,0.0,0.0,0.0,,,,,,
4,2019,1,5,6,WN,N7745A,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,5.0,5.0,0.0,0.0,,,,,,


In [28]:
# Summarize column dtypes and non-null counts of the raw data
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29058 entries, 0 to 29057
Data columns (total 21 columns):
YEAR                   29058 non-null int64
MONTH                  29058 non-null int64
DAY_OF_MONTH           29058 non-null int64
DAY_OF_WEEK            29058 non-null int64
OP_UNIQUE_CARRIER      29058 non-null object
TAIL_NUM               28982 non-null object
ORIGIN                 29058 non-null object
ORIGIN_CITY_NAME       29058 non-null object
DEST                   29058 non-null object
DEST_CITY_NAME         29058 non-null object
ARR_TIME               28717 non-null float64
ARR_DELAY              28697 non-null float64
ARR_DELAY_NEW          28697 non-null float64
ARR_DEL15              28697 non-null float64
CANCELLED              29058 non-null float64
CARRIER_DELAY          4715 non-null float64
WEATHER_DELAY          4715 non-null float64
NAS_DELAY              4715 non-null float64
SECURITY_DELAY         4715 non-null float64
LATE_AIRCRAFT_DELAY    4715 non-nul

In [29]:
# Checkpoint: work on an independent deep copy so my_data stays untouched
df = my_data.copy(deep=True)

In [30]:
# Preview the working copy; it should match the raw data preview
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2019,1,5,6,WN,N8626B,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,24.0,24.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,
1,2019,1,5,6,WN,N7818L,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,8.0,8.0,0.0,0.0,,,,,,
2,2019,1,5,6,WN,N746SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-7.0,0.0,0.0,0.0,,,,,,
3,2019,1,5,6,WN,N790SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-11.0,0.0,0.0,0.0,,,,,,
4,2019,1,5,6,WN,N7745A,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,5.0,5.0,0.0,0.0,,,,,,


In [31]:
# Order the rows by day of the month, lowest day first
df = df.sort_values(by='DAY_OF_MONTH', ascending=True)

In [32]:
# Check first 5 rows of the sorted data (should start at the lowest DAY_OF_MONTH)
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
18825,2019,1,1,2,WN,N237WN,LAS,"Las Vegas, NV",SJC,"San Jose, CA",...,73.0,73.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,
3438,2019,1,1,2,AA,N667AW,RNO,"Reno, NV",PHX,"Phoenix, AZ",...,0.0,0.0,0.0,0.0,,,,,,
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",...,-31.0,0.0,0.0,0.0,,,,,,
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",...,4.0,4.0,0.0,0.0,,,,,,
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",...,-30.0,0.0,0.0,0.0,,,,,,


In [33]:
# Check last 5 rows of the sorted data (should end at the highest DAY_OF_MONTH)
df.tail()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
10024,2019,1,31,4,OO,N240SY,LAS,"Las Vegas, NV",SNA,"Santa Ana, CA",...,1.0,1.0,0.0,0.0,,,,,,
18100,2019,1,31,4,AA,N130AN,LAX,"Los Angeles, CA",LAS,"Las Vegas, NV",...,396.0,396.0,1.0,0.0,20.0,0.0,4.0,0.0,372.0,
10023,2019,1,31,4,OO,N241SY,LAS,"Las Vegas, NV",LAX,"Los Angeles, CA",...,1.0,1.0,0.0,0.0,,,,,,
10029,2019,1,31,4,OO,N241SY,SNA,"Santa Ana, CA",LAS,"Las Vegas, NV",...,12.0,12.0,0.0,0.0,,,,,,
6601,2019,1,31,4,DL,N833DN,LAS,"Las Vegas, NV",DTW,"Detroit, MI",...,-13.0,0.0,0.0,0.0,,,,,,


In [34]:
# Keep only completed arrivals into LAS:
#   - drop cancelled flights (CANCELLED == 1)
#   - drop departures (ORIGIN == 'LAS')
#   - drop anything that does not land at LAS
not_cancelled = df['CANCELLED'] != 1
not_departure = df['ORIGIN'] != 'LAS'
lands_at_las = df['DEST'] == 'LAS'
df = df[not_cancelled & not_departure & lands_at_las]

In [35]:
# Delete columns 'CANCELLED' and the trailing 'Unnamed: __' placeholder.
# The placeholder is only dropped when the last column really is an
# 'Unnamed' column; dropping blindly by position would silently delete a
# real data column if a source file has no trailing empty column.
columns_to_drop = ['CANCELLED']
last_column = df.columns[-1]
if str(last_column).startswith('Unnamed'):
    columns_to_drop.append(last_column)
df = df.drop(columns=columns_to_drop)

In [36]:
# Check that 'CANCELLED' and the trailing 'Unnamed: __' column are gone
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",1911.0,-31.0,0.0,0.0,,,,,
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",1319.0,4.0,4.0,0.0,,,,,
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",803.0,-30.0,0.0,0.0,,,,,
8140,2019,1,1,2,DL,N308DN,DTW,"Detroit, MI",LAS,"Las Vegas, NV",2226.0,10.0,10.0,0.0,,,,,
18920,2019,1,1,2,WN,N7751A,PDX,"Portland, OR",LAS,"Las Vegas, NV",1903.0,-12.0,0.0,0.0,,,,,


In [37]:
# Inspect non-null counts to see which columns still contain NaN's
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13064 entries, 23647 to 10029
Data columns (total 19 columns):
YEAR                   13064 non-null int64
MONTH                  13064 non-null int64
DAY_OF_MONTH           13064 non-null int64
DAY_OF_WEEK            13064 non-null int64
OP_UNIQUE_CARRIER      13064 non-null object
TAIL_NUM               13064 non-null object
ORIGIN                 13064 non-null object
ORIGIN_CITY_NAME       13064 non-null object
DEST                   13064 non-null object
DEST_CITY_NAME         13064 non-null object
ARR_TIME               13063 non-null float64
ARR_DELAY              13059 non-null float64
ARR_DELAY_NEW          13059 non-null float64
ARR_DEL15              13059 non-null float64
CARRIER_DELAY          2136 non-null float64
WEATHER_DELAY          2136 non-null float64
NAS_DELAY              2136 non-null float64
SECURITY_DELAY         2136 non-null float64
LATE_AIRCRAFT_DELAY    2136 non-null float64
dtypes: float64(9), int64(4), ob

In [38]:
# Replace NaN's with 0's in every column.
# Rebind the result instead of using inplace=True: it gives no speed benefit
# and mutating the frame in place makes cell re-runs harder to reason about.
# NOTE(review): this also zero-fills the few missing ARR_TIME / ARR_DELAY /
# ARR_DELAY_NEW / ARR_DEL15 values, so those rows read as "arrived at 0,
# no delay" - confirm that is intended.
df = df.fillna(0)

In [39]:
# Check data: every column should now report the full non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13064 entries, 23647 to 10029
Data columns (total 19 columns):
YEAR                   13064 non-null int64
MONTH                  13064 non-null int64
DAY_OF_MONTH           13064 non-null int64
DAY_OF_WEEK            13064 non-null int64
OP_UNIQUE_CARRIER      13064 non-null object
TAIL_NUM               13064 non-null object
ORIGIN                 13064 non-null object
ORIGIN_CITY_NAME       13064 non-null object
DEST                   13064 non-null object
DEST_CITY_NAME         13064 non-null object
ARR_TIME               13064 non-null float64
ARR_DELAY              13064 non-null float64
ARR_DELAY_NEW          13064 non-null float64
ARR_DEL15              13064 non-null float64
CARRIER_DELAY          13064 non-null float64
WEATHER_DELAY          13064 non-null float64
NAS_DELAY              13064 non-null float64
SECURITY_DELAY         13064 non-null float64
LATE_AIRCRAFT_DELAY    13064 non-null float64
dtypes: float64(9), int64(4

In [40]:
# Check data if 0's are applied.
# Show a bounded sample rather than the whole frame so the notebook output
# stays small and readable.
df.head(10)

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",1911.0,-31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",1319.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",803.0,-30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8140,2019,1,1,2,DL,N308DN,DTW,"Detroit, MI",LAS,"Las Vegas, NV",2226.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
18920,2019,1,1,2,WN,N7751A,PDX,"Portland, OR",LAS,"Las Vegas, NV",1903.0,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8145,2019,1,1,2,DL,N829DN,JFK,"New York, NY",LAS,"Las Vegas, NV",1430.0,-8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8147,2019,1,1,2,DL,N320NB,SEA,"Seattle, WA",LAS,"Las Vegas, NV",1332.0,-15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8149,2019,1,1,2,DL,N813DN,SLC,"Salt Lake City, UT",LAS,"Las Vegas, NV",1707.0,-26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3609,2019,1,1,2,AA,N982NN,JFK,"New York, NY",LAS,"Las Vegas, NV",947.0,-20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18912,2019,1,1,2,WN,N295WN,OMA,"Omaha, NE",LAS,"Las Vegas, NV",721.0,-24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Check datatypes (the time/delay columns are still float64 at this point)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13064 entries, 23647 to 10029
Data columns (total 19 columns):
YEAR                   13064 non-null int64
MONTH                  13064 non-null int64
DAY_OF_MONTH           13064 non-null int64
DAY_OF_WEEK            13064 non-null int64
OP_UNIQUE_CARRIER      13064 non-null object
TAIL_NUM               13064 non-null object
ORIGIN                 13064 non-null object
ORIGIN_CITY_NAME       13064 non-null object
DEST                   13064 non-null object
DEST_CITY_NAME         13064 non-null object
ARR_TIME               13064 non-null float64
ARR_DELAY              13064 non-null float64
ARR_DELAY_NEW          13064 non-null float64
ARR_DEL15              13064 non-null float64
CARRIER_DELAY          13064 non-null float64
WEATHER_DELAY          13064 non-null float64
NAS_DELAY              13064 non-null float64
SECURITY_DELAY         13064 non-null float64
LATE_AIRCRAFT_DELAY    13064 non-null float64
dtypes: float64(9), int64(4

In [42]:
# Columns that hold whole-number values but were read as float64
cols = ['ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
# Cast with a single vectorized astype instead of applymap(np.int64), which
# calls Python once per cell and is deprecated in recent pandas releases.
# Like the original, this raises if any of these columns still contains NaN.
df = df.astype({col: np.int64 for col in cols})

In [43]:
# Confirm the time/delay columns now report int64
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13064 entries, 23647 to 10029
Data columns (total 19 columns):
YEAR                   13064 non-null int64
MONTH                  13064 non-null int64
DAY_OF_MONTH           13064 non-null int64
DAY_OF_WEEK            13064 non-null int64
OP_UNIQUE_CARRIER      13064 non-null object
TAIL_NUM               13064 non-null object
ORIGIN                 13064 non-null object
ORIGIN_CITY_NAME       13064 non-null object
DEST                   13064 non-null object
DEST_CITY_NAME         13064 non-null object
ARR_TIME               13064 non-null int64
ARR_DELAY              13064 non-null int64
ARR_DELAY_NEW          13064 non-null int64
ARR_DEL15              13064 non-null int64
CARRIER_DELAY          13064 non-null int64
WEATHER_DELAY          13064 non-null int64
NAS_DELAY              13064 non-null int64
SECURITY_DELAY         13064 non-null int64
LATE_AIRCRAFT_DELAY    13064 non-null int64
dtypes: int64(13), object(6)
memory usage: 2.

In [44]:
# Checkpoint: rebind df to a fresh deep copy of itself
# (the previous state is not kept under a separate name)
df = df.copy(deep=True)

In [45]:
# Look at the uppercase column names before renaming them
df.columns

Index(['YEAR', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER',
       'TAIL_NUM', 'ORIGIN', 'ORIGIN_CITY_NAME', 'DEST', 'DEST_CITY_NAME',
       'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'],
      dtype='object')

In [46]:
# Lowercase every column name, then display the result
df = df.rename(columns=str.lower)
df.columns

Index(['year', 'month', 'day_of_month', 'day_of_week', 'op_unique_carrier',
       'tail_num', 'origin', 'origin_city_name', 'dest', 'dest_city_name',
       'arr_time', 'arr_delay', 'arr_delay_new', 'arr_del15', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'],
      dtype='object')

In [47]:
# Write the cleaned frame next to the other preprocessed files,
# reusing the input file name and leaving out the row index
preprocessed_csv_path = 'Datasets/Preprocessed/' + csv_name
df.to_csv(preprocessed_csv_path, index=False)