In [2]:
import numpy as np
import pandas as pd
import requests

#### Requirements
```
pip3 install numpy pandas requests lxml beautifulsoup4 html5lib scikit-learn
```

In [133]:
# Fetching data from BTS database

import zipfile
import io

def get_data_csv(start_year="2015", end_year="2018"):
    """Download on-time performance rows for the NYC airports from BTS TranStats.

    Posts a query form to the TranStats download endpoint, unpacks the first
    member of the returned zip archive and parses it as CSV.

    Parameters
    ----------
    start_year, end_year : str or int
        Inclusive bounds substituted into the ``YEAR BETWEEN ... AND ...``
        clause. Defaults reproduce the previously hard-coded 2015-2018 range.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with the four HHMM time columns kept as strings and
        ``CANCELLED`` / ``DIVERTED`` read as booleans.

    Raises
    ------
    ValueError
        If either year cannot be converted to an integer.
    requests.HTTPError
        If the server answers with an error status.
    zipfile.BadZipFile
        If the response body is not a zip archive.
    """
    # int() rejects anything non-numeric before it is spliced into the SQL text.
    first_year, last_year = int(start_year), int(end_year)
    # NOTE(review): the Cookie below is a hard-coded session id — presumably it
    # expires; confirm whether the endpoint still accepts it.
    req_header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "ASPSESSIONIDCQCDRTCQ=FKGMOCEDMCDHINELAEGCEIKN",
        "Host": "www.transtats.bts.gov",
        "Origin": "https://www.transtats.bts.gov",
        "Pragma": "no-cache",
        "Referer": "https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    }
    data_url = "https://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=236&Has_Group=3&Is_Zipped=0"
    pair_form_data = {
    "UserTableName":"On_Time_Performance",
    "DBShortName":"On_Time",
    "RawDataTable":"T_ONTIME",
    "sqlstr":\
            "SELECT YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_GROUP,DISTANCE,CANCELLED,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY \
            FROM T_ONTIME \
            WHERE (\
                DEST IN ('EWR','JFK','LGA') OR\
                ORIGIN IN ('EWR','JFK','LGA')\
            )\
            AND \
            Month BETWEEN 1 AND 12 AND \
            YEAR BETWEEN {} AND {}".format(first_year, last_year),
    "varlist":"YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_GROUP,DISTANCE,CANCELLED,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY",
    }
    r = requests.post(data_url,
                      data=pair_form_data,
                      headers=req_header,
                     )
    # Fail here with the HTTP status instead of later with a confusing
    # BadZipFile raised on an HTML error page.
    r.raise_for_status()
    # The context manager closes the archive once its first member is read.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        csv_bytes = z.read(z.infolist()[0])
    # NOTE(review): the query asks for 25 columns but only the first 24 are
    # kept, which presumably leaves out LATE_AIRCRAFT_DELAY — confirm intended.
    return pd.read_csv(io.BytesIO(csv_bytes),
                       dtype={
                          "CRS_DEP_TIME":str,
                          "DEP_TIME":str,
                          "CRS_ARR_TIME":str,
                          "ARR_TIME":str,
                          "CANCELLED":bool,
                          "DIVERTED":bool},
                      usecols=list(range(24))
                      )

In [12]:
# scraping hourly weather data from wunderground.com

import lxml
from multiprocessing.dummy import Pool
from bs4 import BeautifulSoup

# Airport codes for the New York group; weather is scraped and localized for each of these.
NYC_AIRPORTS = ["EWR","JFK","LGA"]
# Airport codes for the Miami group; used for weather and to select flights by destination.
MIA_AIRPORTS = ["FLL","MIA"]

def wind_speed_conv(x):
    """Convert a wind-speed table cell to a float.

    Returns 0.0 for the literal "Calm", ``float(x)`` when that parses, and NaN
    for anything else.
    """
    try:
        return 0.0 if x == "Calm" else float(x)
    except (TypeError, ValueError):
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0,
        # which made this fallback itself raise AttributeError.
        return np.nan
# Compass labels mapped to their step number going clockwise from North;
# wind_dir_conv multiplies the step by 22.5 to get degrees.
DIR_ANGLE={
            "North": 0,"NNE": 1,"NE": 2,"ENE": 3,"East": 4,"ESE": 5,"SE": 6,"SSE": 7,
            "South": 8,"SSW": 9,"SW":10,"WSW":11,"West":12,"WNW":13,"NW":14,"NNW":15,
          }
def wind_dir_conv(x):
    """Convert a compass label to degrees, or NaN if the label is not in DIR_ANGLE."""
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return 22.5*DIR_ANGLE[x] if x in DIR_ANGLE else np.nan
def humidity_conv(x):
    """Convert a percentage string such as "45%" to a fraction (0.45).

    Returns NaN when the cell is not a string or does not parse as a number.
    """
    try:
        return float(x.strip("%"))/100
    except (AttributeError, TypeError, ValueError):
        # AttributeError covers non-string cells that have no .strip().
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
def precip_conv(x):
    """Convert a precipitation cell to a float, treating unparseable cells as 0.0.

    "N/A" and any other value ``float()`` rejects both map to 0.0.
    """
    try:
        return 0.0 if x == "N/A" else float(x)
    except (TypeError, ValueError):
        # Only conversion failures fall back to 0.0; a bare except would also
        # swallow KeyboardInterrupt and hide real bugs.
        return 0.0
def event_conv(x):
    """Return the events cell text with every tab character removed."""
    return x.replace("\t", "")


def table_parser(pair):
    """Fetch one daily-history page and return its observation table.

    Parameters
    ----------
    pair : tuple
        ``(url, date, airport)``: the page to fetch, the date string that is
        prepended to each row's time cell, and the airport code stored in the
        ``Airport`` column of the result.

    Returns
    -------
    pandas.DataFrame
        The page's ``obsTable`` indexed by ``Time``, with unit spans removed,
        the Windchill / Dew Point / Gust Speed columns dropped when present,
        and an ``Airport`` column added.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page contains no element with id ``obsTable``.
    """
    url, date, airport = pair
    response = requests.get(url)
    # Surface HTTP failures directly rather than parsing an error page.
    response.raise_for_status()

    def time_conv(x):
        # Table cells hold only the time of day; prefix the page's date.
        return date + " " + x

    bs = BeautifulSoup(response.content, "html5lib")
    table_html = bs.find(id="obsTable")
    if table_html is None:
        # Previously this surfaced as an AttributeError on None with no URL.
        raise ValueError("no obsTable element found at {}".format(url))
    # Strip unit labels so numeric cells parse as numbers.
    for unit_tag in table_html.find_all("span",class_="wx-unit"):
        unit_tag.decompose()
    converters = {
        "Time (EST)":time_conv,
        "Time (EDT)":time_conv,
        "Wind Speed":wind_speed_conv,
        "Wind Dir":wind_dir_conv,
        "Precip":precip_conv,
        "Humidity":humidity_conv,
        "Events":event_conv
    }
    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas, while file-like input works everywhere.
    table_df = pd.read_html(io.StringIO(str(table_html)), converters=converters)[0]
    table_df = table_df.drop(["Windchill", "Dew Point", "Gust Speed"],axis=1, errors="ignore")
    # The time column header depends on whether the page is in EST or EDT.
    table_df = table_df.rename(columns={"Time (EST)":"Time", "Time (EDT)":"Time", "Temp.":"Temp"})
    table_df["Time"] = pd.to_datetime(table_df["Time"])
    table_df = table_df.set_index("Time")
    table_df["Airport"] = airport
    return table_df

def weather_scraper(dates):
    """Scrape observations for every airport and date, and save them to weather.csv.

    Run once; it is slow because it fetches one page per (airport, date).

    Parameters
    ----------
    dates : iterable
        Objects with a ``strftime`` method, one per day to fetch.

    Returns
    -------
    pandas.DataFrame
        All per-page tables concatenated; the same frame is written to
        ``weather.csv``.
    """
    base_url = "https://www.wunderground.com/history/airport/K{}/{}/DailyHistory.html"

    pairs = []
    for airport in NYC_AIRPORTS + MIA_AIRPORTS:
        for date in dates:
            date_str = date.strftime("%Y/%m/%d")
            url = base_url.format(airport, date_str)
            pairs.append((url, date_str, airport))

    pool = Pool(8)
    try:
        weather_df_list = pool.map(table_parser, pairs)
    finally:
        # Release the workers even when a page fails to download or parse.
        pool.close()
        pool.join()

    result = pd.concat(weather_df_list)
    result.to_csv("weather.csv")
    return result

In [None]:
# Reload the scraped observations: "-" cells are read as missing and the
# parsed Time column becomes the index.
weather = pd.read_csv(
    "weather.csv",
    index_col="Time",
    parse_dates=["Time"],
    na_values=["-"],
    low_memory=False,
)

In [313]:
# Make the weather index timezone-aware, one airport at a time.
# ambiguous='infer' resolves repeated wall-clock times from the order of the
# rows, so each airport's series is localized separately.
localized_frames = []
for airport_code in NYC_AIRPORTS + MIA_AIRPORTS:
    airport_weather = weather[weather["Airport"] == airport_code]
    eastern_index = airport_weather.index.tz_localize('US/Eastern', ambiguous='infer')
    airport_weather = airport_weather.set_index(eastern_index)
    # Keep the first row for each timestamp.
    localized_frames.append(airport_weather[~airport_weather.index.duplicated()])

new_weather = pd.concat(localized_frames).sort_index()

In [134]:
# Download the flight records, save them to data.csv, then reload from disk
# with the same column dtypes the download used.
raw_data = get_data_csv()
raw_data.to_csv("data.csv", index=False)

# HHMM time columns stay strings; the two flag columns are read as booleans.
flight_csv_dtypes = dict.fromkeys(
    ["CRS_DEP_TIME", "DEP_TIME", "CRS_ARR_TIME", "ARR_TIME"], str
)
flight_csv_dtypes.update(dict.fromkeys(["CANCELLED", "DIVERTED"], bool))
raw_data = pd.read_csv("data.csv", dtype=flight_csv_dtypes)


In [485]:
# Missing delays are treated as zero minutes.
raw_data[["DEP_DELAY","ARR_DELAY"]] = raw_data[["DEP_DELAY","ARR_DELAY"]].fillna(0)
# Keep only flights that were neither diverted nor cancelled.
data = raw_data[~(raw_data["DIVERTED"] | raw_data["CANCELLED"])].drop(["CANCELLED","DIVERTED"], axis=1)

# Build CRS_DEP_DATETIME / CRS_ARR_DATETIME from the flight date plus HHMM time.
time_cols = ["CRS_DEP_TIME", "CRS_ARR_TIME"]
for col in time_cols:
    # "2400" is not a valid %H%M value, so it is rewritten as "0000".
    data.loc[data[col]=="2400",col] = "0000"
    data[col.rsplit("_",1)[0]+"_DATETIME"] = \
    pd.to_datetime(data["FL_DATE"]+" "+data[col], format="%Y-%m-%d %H%M")

# A scheduled arrival earlier than the departure is moved to the next day.
data.loc[(data['CRS_ARR_DATETIME'] < data['CRS_DEP_DATETIME']), 'CRS_ARR_DATETIME'] += pd.Timedelta(days=1)

# nonexistent="NaT" replaces the removed errors="coerce" argument (deprecated
# in pandas 0.24, gone in 1.0): wall-clock times skipped by the spring DST
# jump become NaT and are filtered out at the end of this cell.
# NOTE(review): every time is localized as US/Eastern, including endpoints
# that are not NYC/Miami airports — confirm this is acceptable.
for datetime_col in ("CRS_DEP_DATETIME", "CRS_ARR_DATETIME"):
    data[datetime_col] = pd.DatetimeIndex(data[datetime_col]).tz_localize(
        "US/Eastern", ambiguous=False, nonexistent="NaT")

# Actual times are the scheduled times shifted by the delay in minutes.
data["DEP_DATETIME"] = data["CRS_DEP_DATETIME"] + pd.to_timedelta(data["DEP_DELAY"], unit="m")
data["ARR_DATETIME"] = data["CRS_ARR_DATETIME"] + pd.to_timedelta(data["ARR_DELAY"], unit="m")
data = data[~(data[["DEP_DATETIME","ARR_DATETIME"]].isnull().any(axis=1))]

In [537]:
# Flights whose destination is a Miami-group airport, indexed by flight date.
# The raw HHMM columns are dropped; the *_DATETIME columns remain.
mia_data = data.drop(columns=time_cols + ["DEP_TIME", "ARR_TIME"])
mia_data = mia_data.loc[mia_data["DEST"].isin(MIA_AIRPORTS)]
mia_data = (
    mia_data
    .assign(date=pd.to_datetime(mia_data["FL_DATE"]))
    .set_index("date")
    .sort_index()
)

In [538]:
# weather_scraper(mia_data.index.unique())

In [539]:
# Attach the latest weather row at or before scheduled departure, matched on
# the origin airport.
temp_df = pd.merge_asof(
    mia_data.sort_values("CRS_DEP_DATETIME"),
    new_weather,
    left_on="CRS_DEP_DATETIME",
    right_index=True,
    left_by=["ORIGIN"],
    right_by=["Airport"],
)
# Same lookup at scheduled arrival, matched on the destination airport.
# Overlapping weather columns get the _dep / _arr suffixes.
weather_flight_data = pd.merge_asof(
    temp_df.sort_values("CRS_ARR_DATETIME"),
    new_weather,
    left_on="CRS_ARR_DATETIME",
    right_index=True,
    left_by=["DEST"],
    right_by=["Airport"],
    suffixes=["_dep","_arr"],
)

In [540]:
# For each flight, find the same aircraft's most recent arrival into the
# origin airport strictly before the scheduled departure.
previous_arrivals = data[["ARR_DATETIME","TAIL_NUM","DEST","ARR_DELAY"]].sort_values("ARR_DATETIME")
weather_flight_data = pd.merge_asof(
    weather_flight_data.sort_values("CRS_DEP_DATETIME"),
    previous_arrivals,
    left_on="CRS_DEP_DATETIME",
    right_on="ARR_DATETIME",
    left_by=["TAIL_NUM","ORIGIN"],
    right_by=["TAIL_NUM","DEST"],
    suffixes=("","_PREV"),
    allow_exact_matches=False,
)
# 1 when that arrival was less than a day before departure, otherwise 0
# (including when no previous arrival was found).
time_since_prev_arrival = weather_flight_data["CRS_DEP_DATETIME"] - weather_flight_data["ARR_DATETIME_PREV"]
weather_flight_data["PREV_ARR_LT_ONE_DAY"] = (time_since_prev_arrival < pd.Timedelta(days=1)).astype(int)
weather_flight_data = weather_flight_data.drop(["ARR_DATETIME_PREV", "DEST_PREV"], axis=1)
# Flights with no previous arrival get a previous delay of 0.
weather_flight_data["ARR_DELAY_PREV"] = weather_flight_data["ARR_DELAY_PREV"].fillna(0)

(86325, 46)

In [None]:
from sklearn import preprocessing
#['Month','Carrier','origin','Dest','distance'] 
# Build the model inputs: month, carrier, origin, destination and distance
# as features, DEP_DELAY_GROUP as the target.
# NOTE(review): the query in get_data_csv() requests ARR_DELAY_GROUP but not
# DEP_DELAY_GROUP — confirm `data` really carries this column before running.
model_columns = ['MONTH','CARRIER','ORIGIN','DEST','DISTANCE','DEP_DELAY_GROUP']
total_table = data[model_columns].dropna(axis=0, how='any')
# Keep only rows whose delay group is positive.
total_table = total_table[total_table.DEP_DELAY_GROUP >0]

y_table = total_table['DEP_DELAY_GROUP'].reset_index(drop=True)
feature_table = total_table.drop(['DEP_DELAY_GROUP'],axis=1).reset_index(drop=True)

# MONTH is cast to category so get_dummies one-hot encodes it along with the
# string columns.
feature_table['MONTH'] = feature_table['MONTH'].astype('category')
feature_table = pd.get_dummies(feature_table)

# Rescale DISTANCE to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
distance_values = feature_table[['DISTANCE']].values.astype(float)
x_scaled = min_max_scaler.fit_transform(distance_values)
feature_table['DISTANCE'] = pd.DataFrame(x_scaled)
feature_table.head()


In [None]:
# Preview the first rows of the target series.
y_table.head()

In [None]:
from sklearn.svm import LinearSVC
# Fit a linear SVM on the full feature matrix, printing the matrix first.
feature_matrix = feature_table.values
print (feature_matrix)

clf = LinearSVC(random_state=0)
clf.fit(feature_matrix, y_table.values)

In [None]:
# Predict on the training features and compare how many predictions versus
# true labels differ from class 1.
prediction = clf.predict(feature_table.values)

# Print each prediction that is not class 1, then count them.
non_one_predictions = [label for label in prediction if label != 1]
for label in non_one_predictions:
    print (label)
count_predict = len(non_one_predictions)

# Same count over the true labels.
count = sum(1 for label in y_table.values if label != 1)

print ('count = ',count)
print ("count_predict =",count_predict)
print (y_table.shape)
print (prediction)