# 02.Selenium

In this notebook, I decided to pull more data of aircrafts from the website by using Selenium. The original dataset contains the tail number of the each flights recorded, which are unique identifier. Flightradar24.com has aircrafts information of manufacturer, model and ages of the each aircrafts. I decided to use those new data as my variables in my model. 

### Scrap aircraft information by tailno, unique identifier, to use a variable
##### Which one has higher chances of delays?
- By company: Boeing v. Airbus 
- By aircraft models: B747 / A380 / B777 / A350 / B787 : certain aircraft models have higher chances?
- By ages: Older v. Newer aircraft

In [1]:
# Libraries and packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import datetime
import requests
from bs4 import BeautifulSoup

import time
from selenium import webdriver
import re

In [2]:
# Read dataset
holidays   = pd.read_csv('./assets/holidays.csv', low_memory=False) 
washington = pd.read_csv('./assets/washington.csv', low_memory=False)

print("Holidays dataframe dimensions(shape)  : ", holidays.shape)
print("Washington dataframe dimensions(shape): ", washington.shape)

Holidays dataframe dimensions(shape)  :  (7084008, 31)
Washington dataframe dimensions(shape):  (2204853, 31)


In [33]:
# currently all the departure/arrival time were set only 4 digits interger
# between 0001 to 2400. Therefore, convert it into datetime function

def time_converter(hr):
    if pd.isnull(hr):
        return np.nan
    else:
        if hr == 2400: hr = 0
        # set 4 digits 
        hr = "{0:04d}".format(int(hr)) 
        hour = datetime.time(int(hr[0:2]), int(hr[2:4]))
        return hour

# change columns name
mapping ={
    'Year': 'year',
    'Month': 'month',
    'DayofMonth': 'day',
    'DayOfWeek': 'dayofweek',
    'Carrier': 'carrier',
    'TailNum': 'tail_no',
    'FlightNum': 'flight_no',
    'Origin': 'origin',
    'Dest': 'dest',
    'CRSDepTime': 'sched_dep',
    'DepTime': 'dep_time',
    'DepDelay': 'dep_delay',
    'TaxiOut': 'taxi_out',
    'WheelsOff': 'wheels_off',
    'WheelsOn': 'wheels_on',
    'TaxiIn': 'taxi_in',
    'CRSArrTime': 'sched_arr',
    'ArrTime': 'arr_time',
    'ArrDelay': 'arr_delay',
    'Cancelled': 'cancelled',
    'CancellationCode': 'cancel_code',
    'Diverted': 'diverted',
    'CRSElapsedTime': 'sched_elapsed',
    'ActualElapsedTime': 'elapsed_time',
    'AirTime': 'air_time',
    'Distance': 'distance',
    'CarrierDelay': 'carrier_delay',
    'WeatherDelay': 'weather_delay',
    'NASDelay': 'system_delay',
    'SecurityDelay': 'security_delay',
    'LateAircraftDelay': 'late_aircraft_delay'
}

day_map = {
    '1': 'Mon', '2': 'Tue', '3': 'Wed', '4': 'Thu',
    '5': 'Fri', '6': 'Sat', '7': 'Sun'
}

def preprocessing(df):
    # easier to work with lowercase columns
    df = df.rename(columns=mapping)
    
    # convert it to datetime
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    
    # apply time converter to dep/arr time
    times = ['sched_dep', 'dep_time', 'sched_arr', 'arr_time']
    for x in times:
        df[x] = df[x].apply(time_converter)
    
    # apply dayofweek
    df['dayofweek'] = df['dayofweek'].astype(str).map(day_map)
    
    # fill tail_no
    df['tail_no'] = df['tail_no'].fillna('na')
    return df

In [34]:
holidays   = preprocessing(holidays)
washington = preprocessing(washington)

# Selenium code below won't be working, since I diabled to share the log-in credential.

In [None]:
stop_dont_run_the_code_afterward

##### Selenium

In [15]:
tails=list(set(holidays.tail_no))

# set detour redirect address, in case the website doesn't have the page
detour = 'https://www.flightradar24.com/data/aircraft/'

In [21]:
len(tails)

6459

In [None]:
# In this public notebook, the login credential won't be specify
EMAIL = 'email'
PASSWORD = 'password'

In [27]:
# courtesy of Ben Shaver
driver = webdriver.Chrome(executable_path="../chromedriver")
url = 'https://www.flightradar24.com/premium/signup'
driver.get(url)

login_btn = driver.find_element_by_id('btnSignin')

login_btn.click()

email_field = driver.find_element_by_id('txtEmail')

email_field.send_keys(EMAIL)

password_field = driver.find_element_by_id('txtPassword')

password_field.send_keys(PASSWORD)

login_btn = driver.find_element_by_id('fr24-logIn')
login_btn.click()

##### To check my code is working properly, I initially set the range values below as smaller size and increased inclemently.

In [23]:
foo = {'tailno':[], 'aircraft':[], 'type_code':[], 'shipped':[], 'age(year)':[]}

for i in range(1500,6459):
    url = 'https://www.flightradar24.com/data/aircraft/' + tails[i]
    driver.get(url)
    
    # if the current url redirected to detour url, ask to put None value and proceed
    if driver.current_url == detour:
        foo['tailno'].append(tails[i])
        foo['aircraft'].append(None)
        foo['type_code'].append(None)
        foo['age(year)'].append(None)
        foo['shipped'].append(None)
        time.sleep(3)
    else:
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        foo['tailno'].append(tails[i])
        elems = soup.find_all('span', {'class':'details'})
        try:
            foo['aircraft'].append(elems[0].text.strip())
        except:
            foo['aircraft'].append(None)
        try:
            foo['type_code'].append(elems[3].text.strip())
        except:
            foo['type_code'].append(None)
        try:
            foo['age(year)'].append(elems[8].text.strip())
        except:
            foo['age(year)'].append(None)

        age_found = False
        for elem in soup.find_all('label'):
            if 'AGE (' in elem.text:
                foo['shipped'].append(elem.text.strip())
                age_found = True
        if not age_found:
            foo['shipped'].append(None)
        # sleep every iteration
        time.sleep(2)
    if i % 100 == 1:
        print("{}th url".format(i), url)


aircrafts=pd.read_csv('./assets/aircrafts.csv', low_memory=False)
bar=pd.DataFrame(foo)
pd.concat([aircrafts,bar], ignore_index=True).to_csv('./assets/aircrafts.csv', index=False)

1501th url https://www.flightradar24.com/data/aircraft/N13914
1601th url https://www.flightradar24.com/data/aircraft/N955AT
1701th url https://www.flightradar24.com/data/aircraft/N8507C
1801th url https://www.flightradar24.com/data/aircraft/N837MQ
1901th url https://www.flightradar24.com/data/aircraft/N915AA
2001th url https://www.flightradar24.com/data/aircraft/N713TW
2101th url https://www.flightradar24.com/data/aircraft/N454AA
2201th url https://www.flightradar24.com/data/aircraft/N14933
2301th url https://www.flightradar24.com/data/aircraft/N906DA
2401th url https://www.flightradar24.com/data/aircraft/N952WN
2501th url https://www.flightradar24.com/data/aircraft/N612NK
2601th url https://www.flightradar24.com/data/aircraft/N306DN
2701th url https://www.flightradar24.com/data/aircraft/N587NW
2801th url https://www.flightradar24.com/data/aircraft/N202HA
2901th url https://www.flightradar24.com/data/aircraft/N658AW
3001th url https://www.flightradar24.com/data/aircraft/N121UA
3101th u

In [29]:
driver.close()

In [30]:
aircrafts=pd.read_csv('./assets/aircrafts.csv', low_memory=False)
aircrafts.shape

(6459, 5)

In [31]:
aircrafts.tail(20)

Unnamed: 0,age(year),aircraft,shipped,tailno,type_code
6439,-,-,,N513MQ,-
6440,14 years,Boeing 737-7H4,AGE (Jul 2004),N472WN,B737
6441,5 years,Airbus A321-211,AGE (2013),N198UW,A321
6442,1 year,Boeing 737-8H4,AGE (Dec 2016),N8515X,B738
6443,30 years,Boeing 757-232,AGE (1988),N640DL,B752
6444,16 years,Boeing 717-2BD,AGE (Jan 2002),N974AT,B712
6445,13 years,Boeing 737-7H4,AGE (Aug 2004),N479WN,B737
6446,,,,N3ATAA,
6447,14 years,Embraer ERJ-145XR,AGE (Aug 2003),N17146,E45X
6448,20 years,McDonnell Douglas MD-90-30,AGE (Oct 1997),N943DN,MD90
