## Data Cleaning/Pre-processing

- Sort by DAY_OF_MONTH
- Delete rows where CANCELLED is 1
- Delete rows where ORIGIN is 'LAS'
- Delete rows where DEST is not 'LAS'
- Remove CANCELLED and Unnamed: 19 columns
- Fill NaN values with 0
- Convert the arrival and delay columns (listed in `cols` below) from float64 to int64
- Rename columns to lowercase letters
- Export the cleaned data to CSV

In [21]:
import pandas as pd
import numpy as np
import csv

In [28]:
# File path shortcuts.
# Raw strings keep the Windows-style backslashes literal; without the r-prefix
# '\D' and '\P' are invalid escape sequences (a warning today, an error in
# future Python versions). The resulting values are unchanged.
# csv_name keeps its leading backslash because it is joined to directory
# names by plain string concatenation.
csv_name = r'\Detailed_Statistics_Arrivals.csv'
filename = r'Datasets\Unprocessed' + csv_name
filename

# Destination for the copy of the raw file with its leading lines removed
filename_cleaned = r'Datasets\Processing' + csv_name

In [29]:
# Copy the raw download to `filename_cleaned`, leaving out its first lines.
# Reference:
# https://www.kite.com/python/answers/how-to-delete-a-line-from-a-file-in-python

# Number of lines discarded from the top of the raw file
# (presumably the report preamble above the CSV header - confirm against the file).
LINES_TO_SKIP = 7

# Read-only access is enough for the input; `with` closes the file even if
# reading fails.
with open(filename, 'r') as file:
    lines = file.readlines()

# Slicing instead of deleting one line at a time also copes with a file
# shorter than LINES_TO_SKIP: the result is then simply empty.
lines = lines[LINES_TO_SKIP:]

# 'w' creates the output file or truncates an existing one
with open(filename_cleaned, 'w') as new_file:
    new_file.writelines(lines)

In [30]:
# January 2019, LAS example:
# load the header-stripped file from the Processing folder into a DataFrame
filename = rf'Datasets\Processing{csv_name}'
my_data = pd.read_csv(filename)

FileNotFoundError: [Errno 2] File b'Datasets\\Processing\\Detailed_Statistics_Arrivals.csv' does not exist: b'Datasets\\Processing\\Detailed_Statistics_Arrivals.csv'

In [None]:
# Preview the first rows of the freshly loaded data
my_data.head()

In [None]:
# Summarise columns, non-null counts and dtypes of the loaded data
my_data.info()

In [None]:
# Checkpoint: work on an independent copy so `my_data` stays untouched
df = my_data.copy(deep=True)

In [None]:
# Preview the first rows of the working copy
df.head()

In [None]:
# Order the rows by day of the month (ascending)
df = df.sort_values(by='DAY_OF_MONTH', ascending=True)

In [None]:
# Check the first 5 rows after sorting
df.head()

In [None]:
# Check the last 5 rows after sorting
df.tail()

In [None]:
# Keep only completed arrivals into LAS:
#   - drop cancelled flights
#   - drop flights that departed from LAS
#   - drop flights whose destination is not LAS
not_cancelled = df['CANCELLED'] != 1
not_from_las = df['ORIGIN'] != 'LAS'
into_las = df['DEST'] == 'LAS'
df = df[not_cancelled & not_from_las & into_las]

In [None]:
# Delete columns 'CANCELLED' and the trailing 'Unnamed: __' column
df = df.drop(columns=['CANCELLED'])

# Only drop the last column when it really is the unnamed one;
# dropping purely by position would remove a real data column from a file
# that has no trailing 'Unnamed' column.
trailing_col = df.columns[-1]
if str(trailing_col).startswith('Unnamed'):
    df = df.drop(columns=[trailing_col])

In [None]:
# Check that the columns were deleted
df.head()

In [None]:
# Inspect remaining columns, non-null counts and dtypes
df.info()

In [None]:
# Fill every missing value (NaN) with 0
df = df.fillna(value=0)

In [None]:
# Check the non-null counts after filling missing values
df.info()

In [None]:
# Display the frame to check that the 0's were applied
df

In [None]:
# Check the column datatypes before converting them
df.info()

In [None]:
# Cast the arrival and delay columns to 64-bit integers.
# astype converts whole columns at once, unlike the element-by-element
# applymap (which is also deprecated in recent pandas releases).
# NOTE: missing values must already be filled - NaN cannot be cast to an integer.
cols = ['ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW', 'ARR_DEL15', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']
df[cols] = df[cols].astype(np.int64)

In [None]:
# Confirm the converted columns now report an integer dtype
df.info()

In [None]:
# Checkpoint: rebind df to a fresh, independent copy of itself
df = df.copy(deep=True)

In [None]:
# Look at the current (uppercase) column labels
df.columns

In [None]:
# Normalise every column label to lowercase, then display the result
lowercase_labels = df.columns.str.lower()
df.columns = lowercase_labels
df.columns

In [None]:
# Export the cleaned frame to csv (row index omitted)
import os

# csv_name carries a leading path separator because it is concatenated onto
# directory names elsewhere. Strip it so os.path.join does not treat it as an
# absolute path and the result has no mixed '/\' separators.
output_path = os.path.join('Datasets', 'Preprocessed', csv_name.lstrip('\\/'))
df.to_csv(output_path, index=False)