In [None]:
# First load the modules we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pandas Data Structures
1. Series
2. DataFrame



# Step 1: Get the Data

In [None]:
# Read the flights CSV (path is relative to the notebook's working directory)
# into the DataFrame `flights`, which the cells below inspect and clean.
flights = pd.read_csv("Data/flights.csv")

# Step 2: Examine the Data - Looking for issues that require fixing

In [None]:
# Get the number of rows and columns: shape is a (rows, columns) tuple
flights.shape

In [None]:
# Look at both ends of the table, starting with the top:
# head returns the first n rows (5 is also its default).
flights.head(n=5)

In [None]:
# Bottom of the table: tail returns the last n rows (5 is also its default).
flights.tail(n=5)

In [None]:
# List the column names (returned as a pandas Index)
flights.columns

In [None]:
# Inspect the data type of each column to make sure it is what we expect
flights.dtypes

In [None]:
# Using info() we can see how many non-null entries each column has
flights.info()

In [None]:
# Check for duplicate records: duplicated() flags every row that repeats an
# earlier row, and summing the boolean flags counts them.
flights.duplicated().sum()

**Count the number of NaN values with either `isna()` or `isnull()`**

In [None]:
# Count the NaN values per column. isna() returns booleans, True (1) or
# False (0), so summing each column gives its number of missing entries.
flights.isna().sum()

In [None]:
# isnull() is an alias of isna() in pandas, so this returns the same counts
flights.isnull().sum()

In [None]:
# AIR_TIME matters for the analysis, so examine the first 10 rows where it is NaN.
flights[flights["AIR_TIME"].isna()].head(n=10)

In [None]:
# describe() gives general summary statistics (count, mean, std, min,
# quartiles, max) for the numeric columns
flights.describe()

In [None]:
# describe() also works for categorical data: restricted to object-dtype
# columns it reports count, unique, top and freq instead.
flights.describe(include='object')

In [None]:
# Relative frequency of each AIRLINE value (normalize=True returns proportions
# that sum to 1 instead of raw counts)
flights["AIRLINE"].value_counts(normalize=True)

In [None]:
# Bar chart of the relative frequency of each ORG_AIR value.
# Title and axis labels are set explicitly so the figure stands on its own,
# and the trailing semicolon suppresses the matplotlib object repr.
ax = flights["ORG_AIR"].value_counts(normalize=True).plot(kind='bar')
ax.set_title("Share of flights by ORG_AIR")
ax.set_xlabel("ORG_AIR")
ax.set_ylabel("Proportion of flights");

In [None]:
# Bar chart of the relative frequency of each AIRLINE value.
# Title and axis labels are set explicitly so the figure stands on its own,
# and the trailing semicolon suppresses the matplotlib object repr.
ax = flights["AIRLINE"].value_counts(normalize=True).plot(kind='bar')
ax.set_title("Share of flights by AIRLINE")
ax.set_xlabel("AIRLINE")
ax.set_ylabel("Proportion of flights");

In [None]:
# Relative frequency of each CANCELLED value (proportions rather than counts)
flights["CANCELLED"].value_counts(normalize=True)

In [None]:
# Pie chart of the relative frequency of each CANCELLED value.
# autopct prints the percentage on each slice; the default y-label is cleared
# and a title added so the figure is readable without the code.
ax = flights["CANCELLED"].value_counts(normalize=True).plot(kind='pie', autopct='%1.1f%%')
ax.set_title("Share of flights by CANCELLED value")
ax.set_ylabel("");

In [None]:
# Relative frequency of each DIVERTED value (proportions rather than counts)
flights["DIVERTED"].value_counts(normalize=True)

In [None]:
# Pie chart of the relative frequency of each DIVERTED value.
# autopct prints the percentage on each slice; the default y-label is cleared
# and a title added so the figure is readable without the code.
ax = flights["DIVERTED"].value_counts(normalize=True).plot(kind='pie', autopct='%1.1f%%')
ax.set_title("Share of flights by DIVERTED value")
ax.set_ylabel("");

# Step 3: Data Wrangling and Preparation

In [None]:
# Option 1: drop every row that contains at least one NaN (dropna's defaults,
# spelled out), keeping the original `flights` untouched, then use info() to
# check the non-null counts of what is left.
flights_dropped = flights.dropna(axis=0, how="any")
flights_dropped.info()

### Using `fillna()`

In [None]:
# First 5 rows whose AIR_TIME is missing; their row labels are useful for
# checking the fill strategies tried in the next cells.
flights.loc[flights["AIR_TIME"].isna()].head(n=5)

In [None]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# bfill() replaces fillna(method='backfill'), whose `method` argument is
# deprecated in recent pandas releases; the result is the same.
flights_fill = flights.bfill()

In [None]:
# Inspect selected row positions of the back-filled frame, all columns
# (positions presumably picked around rows that had NaNs - verify against the preview above).
flights_fill.iloc[[24, 25, 51, 52, 73, 74]]

In [None]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() replaces fillna(method='pad'), whose `method` argument is
# deprecated in recent pandas releases; the result is the same.
flights_fill2 = flights.ffill()
# Show selected row positions (all columns) to see where the filled values came from.
flights_fill2.iloc[[23, 24, 25, 50, 51, 52, 72, 73, 74], :]

In [None]:
# Overall mean of AIR_TIME (mean() skips NaN values by default)
flights["AIR_TIME"].mean()

In [None]:
# Fill missing AIR_TIME values with the overall column mean. The result is a
# new Series; `flights` itself is not modified.
air_time_avg = flights["AIR_TIME"].mean()
air_time = flights["AIR_TIME"].fillna(value=air_time_avg)
# Look the rows up by index label; with the default integer index produced by
# read_csv these labels coincide with row positions.
air_time.loc[[23, 24, 25, 50, 51, 52, 72, 73, 74]]

In [None]:
# Let's examine one specific combination of AIRLINE, ORG_AIR and DEST_AIR
flights.loc[
    flights["AIRLINE"].eq("MQ")
    & flights["ORG_AIR"].eq("DFW")
    & flights["DEST_AIR"].eq("BTR")
]

In [None]:
# A better estimate than the global mean: fill each missing AIR_TIME with the
# mean AIR_TIME of rows sharing the same (ORG_AIR, DEST_AIR) pair.
# transform("mean") returns a Series aligned row-by-row with `flights`.
at_avg = flights["AIR_TIME"].fillna(
    flights.groupby(["ORG_AIR", "DEST_AIR"])["AIR_TIME"].transform("mean")
)
at_avg.loc[[24, 51, 73, 78]]

In [None]:
# Non-null counts before AIR_TIME is filled on `flights` itself
# (baseline for the info() call after the fill)
flights.info()

In [None]:
# Fill missing AIR_TIME with the mean AIR_TIME of the same (ORG_AIR, DEST_AIR) pair.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on flights["AIR_TIME"]: the in-place form acts on an
# intermediate object, raises a chained-assignment warning in recent pandas, and
# leaves `flights` unchanged when Copy-on-Write is enabled.
# Pairs with no observed AIR_TIME at all have a NaN mean, so those rows stay NaN.
flights["AIR_TIME"] = flights["AIR_TIME"].fillna(
    flights.groupby(["ORG_AIR", "DEST_AIR"])["AIR_TIME"].transform("mean")
)

In [None]:
# Non-null counts after the fill: the AIR_TIME count is expected to have gone up
# compared with the info() output before the fill
flights.info()

# Step 4: Conduct Formal Data Analysis: What Are the Questions We Want to Answer?

In [None]:
# Mean departure delay, arrival delay, cancellation flag and air time for each
# (AIRLINE, ORG_AIR) combination.
# Several columns must be selected with a list (double brackets); the bare
# "A", "B" form is tuple indexing, which pandas 2.x rejects with an error.
flights.groupby(["AIRLINE", "ORG_AIR"])[["DEP_DELAY", "ARR_DELAY", "CANCELLED", "AIR_TIME"]].mean()

In [None]:
# Keep the per-(AIRLINE, ORG_AIR) means in a variable for further aggregation.
# Several columns must be selected with a list (double brackets); the bare
# "A", "B" form is tuple indexing, which pandas 2.x rejects with an error.
flights_by_airline = (
    flights
    .groupby(["AIRLINE", "ORG_AIR"])[["DEP_DELAY", "ARR_DELAY", "CANCELLED", "AIR_TIME"]]
    .mean()
)
flights_by_airline.head()

In [None]:
# Grouping by two keys gives a two-level MultiIndex: level 0 = AIRLINE, level 1 = ORG_AIR
flights_by_airline.index

In [None]:
# The columns are the measures that were averaged in the groupby
flights_by_airline.columns

In [None]:
# Average the per-airline means within each ORG_AIR (index level 1, referred to by name).
# Note this is a mean of group means, not a mean over the individual flights.
flights_by_airline.groupby(level="ORG_AIR").mean()

In [None]:
# Average the per-airport means within each AIRLINE (index level 0, referred to by name).
# Note this is a mean of group means, not a mean over the individual flights.
flights_by_airline.groupby(level="AIRLINE").mean()

In [None]:
# The index of flights_by_airline has only two levels (0 = AIRLINE, 1 = ORG_AIR),
# so asking for level=2 fails. Catch the error and show its message so the
# limitation is visible without breaking "Restart & Run All".
try:
    flights_by_airline.groupby(level=2).mean()
except (IndexError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")