## Data Cleaning/Pre-processing

- Sort by DAY_OF_MONTH
- Delete CANCELLED WHERE CANCELLED == 1 rows
- Remove CANCELLED and Unnamed: 19 columns
- Fill 'NaN's with 0

In [19]:
import pandas as pd
import zipfile as zf

In [25]:
with zf.ZipFile('Datasets/Unprocessed', 'r') as zip_ref:
    zip_ref.extractall('Datasets/Unprocessed')

PermissionError: [Errno 13] Permission denied: 'Datasets/Unprocessed'

In [2]:
my_data = pd.read_csv('Datasets/Unprocessed/LAS_delays_Jan2019.csv')

In [3]:
my_data.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2019,1,5,6,WN,N8626B,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,24.0,24.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,
1,2019,1,5,6,WN,N7818L,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,8.0,8.0,0.0,0.0,,,,,,
2,2019,1,5,6,WN,N746SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-7.0,0.0,0.0,0.0,,,,,,
3,2019,1,5,6,WN,N790SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-11.0,0.0,0.0,0.0,,,,,,
4,2019,1,5,6,WN,N7745A,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,5.0,5.0,0.0,0.0,,,,,,


In [4]:
my_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29058 entries, 0 to 29057
Data columns (total 21 columns):
YEAR                   29058 non-null int64
MONTH                  29058 non-null int64
DAY_OF_MONTH           29058 non-null int64
DAY_OF_WEEK            29058 non-null int64
OP_UNIQUE_CARRIER      29058 non-null object
TAIL_NUM               28982 non-null object
ORIGIN                 29058 non-null object
ORIGIN_CITY_NAME       29058 non-null object
DEST                   29058 non-null object
DEST_CITY_NAME         29058 non-null object
ARR_TIME               28717 non-null float64
ARR_DELAY              28697 non-null float64
ARR_DELAY_NEW          28697 non-null float64
ARR_DEL15              28697 non-null float64
CANCELLED              29058 non-null float64
CARRIER_DELAY          4715 non-null float64
WEATHER_DELAY          4715 non-null float64
NAS_DELAY              4715 non-null float64
SECURITY_DELAY         4715 non-null float64
LATE_AIRCRAFT_DELAY    4715 non-nul

In [5]:
# Create copy, checkpoint 
df = my_data.copy()

In [6]:
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
0,2019,1,5,6,WN,N8626B,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,24.0,24.0,1.0,0.0,0.0,0.0,24.0,0.0,0.0,
1,2019,1,5,6,WN,N7818L,OMA,"Omaha, NE",LAS,"Las Vegas, NV",...,8.0,8.0,0.0,0.0,,,,,,
2,2019,1,5,6,WN,N746SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-7.0,0.0,0.0,0.0,,,,,,
3,2019,1,5,6,WN,N790SW,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,-11.0,0.0,0.0,0.0,,,,,,
4,2019,1,5,6,WN,N7745A,ONT,"Ontario, CA",LAS,"Las Vegas, NV",...,5.0,5.0,0.0,0.0,,,,,,


In [7]:
# Sort by DAY_OF_MONTH
df = df.sort_values('DAY_OF_MONTH')

In [8]:
# Check first 5 of data
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
18825,2019,1,1,2,WN,N237WN,LAS,"Las Vegas, NV",SJC,"San Jose, CA",...,73.0,73.0,1.0,0.0,0.0,0.0,0.0,0.0,73.0,
3438,2019,1,1,2,AA,N667AW,RNO,"Reno, NV",PHX,"Phoenix, AZ",...,0.0,0.0,0.0,0.0,,,,,,
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",...,-31.0,0.0,0.0,0.0,,,,,,
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",...,4.0,4.0,0.0,0.0,,,,,,
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",...,-30.0,0.0,0.0,0.0,,,,,,


In [9]:
# Check last 5 of data
df.tail()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,...,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CANCELLED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 20
10024,2019,1,31,4,OO,N240SY,LAS,"Las Vegas, NV",SNA,"Santa Ana, CA",...,1.0,1.0,0.0,0.0,,,,,,
18100,2019,1,31,4,AA,N130AN,LAX,"Los Angeles, CA",LAS,"Las Vegas, NV",...,396.0,396.0,1.0,0.0,20.0,0.0,4.0,0.0,372.0,
10023,2019,1,31,4,OO,N241SY,LAS,"Las Vegas, NV",LAX,"Los Angeles, CA",...,1.0,1.0,0.0,0.0,,,,,,
10029,2019,1,31,4,OO,N241SY,SNA,"Santa Ana, CA",LAS,"Las Vegas, NV",...,12.0,12.0,0.0,0.0,,,,,,
6601,2019,1,31,4,DL,N833DN,LAS,"Las Vegas, NV",DTW,"Detroit, MI",...,-13.0,0.0,0.0,0.0,,,,,,


In [10]:
# Keep if CANCELLED is not 1 
df = df[df['CANCELLED'] != 1]

In [11]:
# Delete columns 'CANCELLED' and 'Unnamed: __'
df = df.drop(['CANCELLED'], axis=1)
df = df.drop(df.columns[-1], axis=1)

In [12]:
# Check if columns are deleted
df.head()

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
18825,2019,1,1,2,WN,N237WN,LAS,"Las Vegas, NV",SJC,"San Jose, CA",2213.0,73.0,73.0,1.0,0.0,0.0,0.0,0.0,73.0
3438,2019,1,1,2,AA,N667AW,RNO,"Reno, NV",PHX,"Phoenix, AZ",1527.0,0.0,0.0,0.0,,,,,
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",1911.0,-31.0,0.0,0.0,,,,,
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",1319.0,4.0,4.0,0.0,,,,,
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",803.0,-30.0,0.0,0.0,,,,,


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28720 entries, 18825 to 6601
Data columns (total 19 columns):
YEAR                   28720 non-null int64
MONTH                  28720 non-null int64
DAY_OF_MONTH           28720 non-null int64
DAY_OF_WEEK            28720 non-null int64
OP_UNIQUE_CARRIER      28720 non-null object
TAIL_NUM               28720 non-null object
ORIGIN                 28720 non-null object
ORIGIN_CITY_NAME       28720 non-null object
DEST                   28720 non-null object
DEST_CITY_NAME         28720 non-null object
ARR_TIME               28717 non-null float64
ARR_DELAY              28697 non-null float64
ARR_DELAY_NEW          28697 non-null float64
ARR_DEL15              28697 non-null float64
CARRIER_DELAY          4715 non-null float64
WEATHER_DELAY          4715 non-null float64
NAS_DELAY              4715 non-null float64
SECURITY_DELAY         4715 non-null float64
LATE_AIRCRAFT_DELAY    4715 non-null float64
dtypes: float64(9), int64(4), obj

In [14]:
# Replace NaN's with 0's 
df.fillna(0, inplace=True)

In [15]:
# Check data 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28720 entries, 18825 to 6601
Data columns (total 19 columns):
YEAR                   28720 non-null int64
MONTH                  28720 non-null int64
DAY_OF_MONTH           28720 non-null int64
DAY_OF_WEEK            28720 non-null int64
OP_UNIQUE_CARRIER      28720 non-null object
TAIL_NUM               28720 non-null object
ORIGIN                 28720 non-null object
ORIGIN_CITY_NAME       28720 non-null object
DEST                   28720 non-null object
DEST_CITY_NAME         28720 non-null object
ARR_TIME               28720 non-null float64
ARR_DELAY              28720 non-null float64
ARR_DELAY_NEW          28720 non-null float64
ARR_DEL15              28720 non-null float64
CARRIER_DELAY          28720 non-null float64
WEATHER_DELAY          28720 non-null float64
NAS_DELAY              28720 non-null float64
SECURITY_DELAY         28720 non-null float64
LATE_AIRCRAFT_DELAY    28720 non-null float64
dtypes: float64(9), int64(4)

In [16]:
# Check data if 0's are applied
df

Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
18825,2019,1,1,2,WN,N237WN,LAS,"Las Vegas, NV",SJC,"San Jose, CA",2213.0,73.0,73.0,1.0,0.0,0.0,0.0,0.0,73.0
3438,2019,1,1,2,AA,N667AW,RNO,"Reno, NV",PHX,"Phoenix, AZ",1527.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23647,2019,1,1,2,NK,N647NK,SEA,"Seattle, WA",LAS,"Las Vegas, NV",1911.0,-31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18921,2019,1,1,2,WN,N913WN,PDX,"Portland, OR",LAS,"Las Vegas, NV",1319.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
8137,2019,1,1,2,DL,N897DN,MSP,"Minneapolis, MN",LAS,"Las Vegas, NV",803.0,-30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8138,2019,1,1,2,DL,N361NB,RNO,"Reno, NV",SLC,"Salt Lake City, UT",1314.0,33.0,33.0,1.0,7.0,0.0,26.0,0.0,0.0
8139,2019,1,1,2,DL,N361NB,SLC,"Salt Lake City, UT",RNO,"Reno, NV",845.0,-47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8140,2019,1,1,2,DL,N308DN,DTW,"Detroit, MI",LAS,"Las Vegas, NV",2226.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
8141,2019,1,1,2,DL,N861DN,LAS,"Las Vegas, NV",SLC,"Salt Lake City, UT",1429.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
8142,2019,1,1,2,DL,N679DA,ATL,"Atlanta, GA",RNO,"Reno, NV",2008.0,-27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Export to csv
df.to_csv('Datasets/Preprocessed/LAS_delays_Jan2019.csv')