In [72]:
import numpy as np
import pandas as pd


#### Requirements
```
pip3 install pandas lxml
```

In [61]:
# Fetching data from BTS database

import requests
import zipfile
import io

def get_data_csv():
    years_interval = ["2015","2018"]
    req_header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "ASPSESSIONIDCQCDRTCQ=FKGMOCEDMCDHINELAEGCEIKN",
        "Host": "www.transtats.bts.gov",
        "Origin": "https://www.transtats.bts.gov",
        "Pragma": "no-cache",
        "Referer": "https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    }
    data_url = "https://www.transtats.bts.gov/DownLoad_Table.asp?Table_ID=236&Has_Group=3&Is_Zipped=0"
    pair_form_data = {
    "UserTableName":"On_Time_Performance",
    "DBShortName":"On_Time",
    "RawDataTable":"T_ONTIME",
    "sqlstr":"SELECT YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,\
            DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_GROUP,DISTANCE,CANCELLED,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,\
            SECURITY_DELAY,LATE_AIRCRAFT_DELAY \
            FROM T_ONTIME \
            WHERE (\
                DEST IN ('EWR','JFK','LGA') OR\
                ORIGIN IN ('EWR','JFK','LGA')\
            )\
            AND \
            Month BETWEEN 1 AND 12 AND \
            YEAR BETWEEN {} AND {}".format(years_interval[0],years_interval[1]),
    "varlist":"YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,TAIL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_GROUP,DISTANCE,CANCELLED,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY",
    }
    r = requests.post(data_url,
                      data=pair_form_data,
                      headers=req_header,
                     )
    z = zipfile.ZipFile(io.BytesIO(r.content))
    return pd.read_csv(io.BytesIO(z.read(z.infolist()[0])),
                       dtype={
                          "CRS_DEP_TIME":str,
                          "DEP_TIME":str,
                          "CRS_ARR_TIME":str,
                          "ARR_TIME":str,
                          "CANCELLED":bool,
                          "DIVERTED":bool},
                      parse_dates=["FL_DATE"])

In [121]:
# scraping hourly weather data from wunderground.com

import lxml
from bs4 import BeautifulSoup

def wind_speed_conv(x):
    try:
        ret = 0.0 if x == "Calm" else float(x)
    except:
        ret = np.NaN
    return ret
DIR_ANGLE={
            "North": 0,"NNE": 1,"NE": 2,"ENE": 3,"East": 4,"ESE": 5,"SE": 6,"SSE": 7,
            "South": 8,"SSW": 9,"SW":10,"WSW":11,"West":12,"WNW":13,"NW":14,"NNW":15,
          }
def wind_dir_conv(x):
    return 0 if x not in DIR_ANGLE else 22.5*DIR_ANGLE[x]
def humidity_conv(x):
    try:
        ret = float(x.strip("%"))/100
    except:
        ret = np.NaN
    return ret
def precip_conv(x):
    try:
        ret = 0.0 if x == "N/A" else float(x)
    except:
        ret = 0.0
    return ret
def event_conv(x):
    return "".join(x.split("\t"))


def table_parser(content, date, airport):
    def time_conv(x):
        return date + " " + x
    bs = BeautifulSoup(content, "html5lib")
    table_html = bs.find(id="obsTable")
    for unit_tag in table_html.find_all("span",class_="wx-unit"):
        unit_tag.decompose()
    table_df = pd.read_html(str(table_html), converters= \
                            {
                                "Time (EST)":time_conv,
                                "Time (EDT)":time_conv,
                                "Wind Speed":wind_speed_conv,
                                "Wind Dir":wind_dir_conv,
                                "Precip":precip_conv,
                                "Humidity":humidity_conv,
                                "Events":event_conv
                            })[0]
    table_df = table_df.drop(["Windchill", "Dew Point", "Gust Speed"],axis=1, errors="ignore")
    table_df.rename(columns={"Time (EST)":"Time", "Time (EDT)":"Time", "Temp.":"Temp"}, inplace=True)
    table_df["Time"] = pd.to_datetime(table_df["Time"])
    table_df.set_index("Time", inplace=True)
    table_df["Airport"] = airport
    return table_df

def weather_scraper()
    # Run once. Slow to run.
    weather_dates = data.index.unique()
    base_url = "https://www.wunderground.com/history/airport/K{}/{}/DailyHistory.html"

    weather_df_list = []
    for airport in NYC_AIRPORTS+MIA_AIRPORTS:
        for date in weather_dates:
            date_str = date.strftime("%Y/%m/%d")
            url = base_url.format(airport, date_str)
            table_df = table_parser(requests.get(url).content, date_str, airport)
            weather_df_list.append(table_df)
    result = pd.concat(weather_df_list)
    result.to_csv("weather.csv")

In [88]:
# raw_data = get_data_csv()
# raw_data.to_csv("data.csv")

raw_data = pd.read_csv("data.csv",dtype={
                          "CRS_DEP_TIME":str,
                          "DEP_TIME":str,
                          "CRS_ARR_TIME":str,
                          "ARR_TIME":str,
                          "CANCELLED":bool,
                          "DIVERTED":bool},
                      parse_dates=["FL_DATE"])

In [74]:
weather = pd.read_csv("weather.csv", low_memory=False, parse_dates=["Time"], na_values=["-"], index_col="Time")

In [89]:
MIA_AIRPORTS = ["MCO", "FLL"]

data = raw_data[~(raw_data["DIVERTED"] | raw_data["CANCELLED"])].drop(["CANCELLED","DIVERTED"], axis=1)
data = data[(data["DEST"].isin(MIA_AIRPORTS) | data["ORIGIN"].isin(MIA_AIRPORTS))]
data["date"] = data["FL_DATE"]
data = data.set_index("date").sort_index()

data = data[data['CRS_DEP_TIME'].notna()]
data = data[data['CRS_ARR_TIME'].notna()]
data = data[data['ARR_TIME'].notna()]
data = data[data['DEP_TIME'].notna()]
#,'DEP_TIME','CRS_ARR_TIME','ARR_TIME'].notna()]]
#data.dropna(axis=0, how='any')
data

AttributeError: 'Series' object has no attribute 'notna'

In [92]:
print (data[:5])
data.loc[data.CRS_DEP_TIME =='2400','CRS_DEP_TIME'] = '0000'
data.loc[data.DEP_TIME =='2400','DEP_TIME'] = '0000'
data.loc[data.CRS_ARR_TIME =='2400','CRS_ARR_TIME'] = '0000'
data.loc[data.ARR_TIME =='2400','ARR_TIME'] = '0000'
data.head()

            YEAR  QUARTER  MONTH  DAY_OF_MONTH  DAY_OF_WEEK    FL_DATE  \
date                                                                     
2015-01-01  2015        1      1             1            4 2015-01-01   
2015-01-01  2015        1      1             1            4 2015-01-01   
2015-01-01  2015        1      1             1            4 2015-01-01   
2015-01-01  2015        1      1             1            4 2015-01-01   
2015-01-01  2015        1      1             1            4 2015-01-01   

           CARRIER TAIL_NUM ORIGIN DEST          ...           ARR_TIME  \
date                                             ...                      
2015-01-01      AA   N3AXAA    JFK  MCO          ...               1453   
2015-01-01      B6   N804JB    EWR  MCO          ...               1638   
2015-01-01      B6   N905JB    JFK  FLL          ...               1010   
2015-01-01      B6   N508JB    LGA  MCO          ...               0901   
2015-01-01      B6   N629JB    

Unnamed: 0_level_0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,CARRIER,TAIL_NUM,ORIGIN,DEST,...,ARR_TIME,ARR_DELAY,ARR_DELAY_GROUP,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,CRS_DEP_TIME_DATETIME
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,2015,1,1,1,4,2015-01-01,AA,N3AXAA,JFK,MCO,...,1453,13.0,0.0,944.0,,,,,,2015-01-01 11:55:00
2015-01-01,2015,1,1,1,4,2015-01-01,B6,N804JB,EWR,MCO,...,1638,-13.0,-1.0,937.0,,,,,,2015-01-01 14:00:00
2015-01-01,2015,1,1,1,4,2015-01-01,B6,N905JB,JFK,FLL,...,1010,-27.0,-2.0,1069.0,,,,,,2015-01-01 07:18:00
2015-01-01,2015,1,1,1,4,2015-01-01,B6,N508JB,LGA,MCO,...,901,-30.0,-2.0,950.0,,,,,,2015-01-01 06:30:00
2015-01-01,2015,1,1,1,4,2015-01-01,B6,N629JB,JFK,MCO,...,2246,-7.0,-1.0,944.0,,,,,,2015-01-01 20:00:00


In [93]:
#data.head()
from datetime import datetime,timedelta
war_start = '2011-01-03 12:00'
datetime.strptime(war_start, '%Y-%m-%d %H:%M')


def to_time(fl_date,time_string):
    year_time  = fl_date.strftime('%Y-%m-%d')
    hour_time = ' '+time_string[:2]+':'+time_string[2:]
    time = year_time+hour_time
    return datetime.strptime(time, '%Y-%m-%d %H:%M')


In [None]:
data['CRS_DEP_TIME_DATETIME'] = data.apply(lambda row: to_time(row['FL_DATE'], row['CRS_DEP_TIME']), axis=1)
#print (data.ix[197660])
data['DEP_TIME_DATETIME'] = data.apply(lambda row: to_time(row['FL_DATE'], row['DEP_TIME']), axis=1)
#print (data.ix[1584706])
data['CRS_ARR_TIME_DATETIME'] = data.apply(lambda row: to_time(row['FL_DATE'], row['CRS_ARR_TIME']), axis=1)
data['ARR_TIME_DATETIME'] = data.apply(lambda row: to_time(row['FL_DATE'], row['ARR_TIME']), axis=1)

In [60]:
data['ARR_TIME_DATETIME'] = np.where(data['ARR_TIME_DATETIME'] < data['DEP_TIME_DATETIME'],data['ARR_TIME_DATETIME']+timedelta(days=1) ,data['ARR_TIME_DATETIME'])
data['CRS_ARR_TIME_DATETIME'] = np.where(data['CRS_ARR_TIME_DATETIME'] < data['CRS_DEP_TIME_DATETIME'],data['CRS_ARR_TIME_DATETIME']+timedelta(days=1) ,data['CRS_ARR_TIME_DATETIME'])

In [46]:
from sklearn import preprocessing
#['Month','Carrier','origin','Dest','distance'] 
total_table = data[['MONTH','CARRIER','ORIGIN','DEST','DISTANCE','DEP_DELAY_GROUP']]
#y_table = data['DEP_DELAY_GROUP'].dropna(axis=0, how='any')
total_table = total_table.dropna(axis=0, how='any')
total_table = total_table[total_table.DEP_DELAY_GROUP >0]
y_table = total_table['DEP_DELAY_GROUP']
y_table = y_table.reset_index(drop=True)
feature_table = total_table.drop(['DEP_DELAY_GROUP'],axis=1)
feature_table = feature_table.reset_index(drop=True)


feature_table['MONTH']= feature_table['MONTH'].astype('category')
feature_table = pd.get_dummies(feature_table)
feature_table.head()
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(feature_table[['DISTANCE']].values.astype(float))
#feature_table['DISTANCE']= 
feature_table['DISTANCE']= pd.DataFrame(x_scaled)
feature_table.head()


Unnamed: 0,DISTANCE,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,...,ORIGIN_EWR,ORIGIN_FLL,ORIGIN_JFK,ORIGIN_LGA,ORIGIN_MCO,DEST_EWR,DEST_FLL,DEST_JFK,DEST_LGA,DEST_MCO
0,0.94964,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0.94964,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,0.94964,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0.94964,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0.94964,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [77]:
#y_table = data['DEP_DELAY_GROUP']
#y_table = pd.get_dummies(y_table)
y_table.head()
#print (y_table.shape)

0    11.0
1     6.0
2     1.0
3     6.0
4     1.0
Name: DEP_DELAY_GROUP, dtype: float64

In [78]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)
print (feature_table.values)

clf.fit(feature_table.values, y_table.values)

[[ 0.94964029  1.          0.         ...,  0.          0.          0.        ]
 [ 0.94964029  1.          0.         ...,  1.          0.          0.        ]
 [ 0.94964029  1.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          1.         ...,  0.          0.          0.        ]
 [ 0.92086331  0.          1.         ...,  0.          0.          0.        ]
 [ 0.92086331  0.          1.         ...,  0.          0.          0.        ]]


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

In [83]:
prediction = clf.predict(feature_table.values)
count_predict =0
for i in prediction:
    if i != 1:
        print (i)
        count_predict+=1

count = 0
for i in y_table.values:
    if i !=1:
        count+=1
        #print (i)
print ('count = ',count)
print ("count_predict =",count_predict)
print (y_table.shape)
print (prediction)

2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
2.0
count =  35139
count_predict = 24
(49910,)
[ 1.  1.  1. ...,  1.  1.  1.]
