In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from bs4 import BeautifulSoup
import requests
import time


### Transform Delay DataFrame

In [None]:
#importing files 
delay_2008 = "Resources/2008.csv"
delay_df = pd.read_csv(delay_2008)
delay_df.tail()

In [None]:
#Filtering Data with final destination SFO
delay_df = delay_df.loc[delay_df['Dest'] == 'SFO']

In [None]:
#Filtering Data with delays
delay_df = delay_df.loc[delay_df['ArrDelay'] > 0]

In [None]:
#Creating column to set as 'id'
delay_df['ID'] = range(0, len(delay_df))
delay_df.head()

In [None]:
list(delay_df)

In [None]:
# Create a filtered dataframe from specific columns
delay_cols = ["ID", "FlightNum","TailNum", "UniqueCarrier", "Year", "Month", "DayOfWeek", 'Origin',
 'Dest']
delay_transformed = delay_df[delay_cols].copy()

In [None]:
# Rename the column headers
delay_transformed = delay_transformed.rename(columns={"ID": "id",
                                                      "FlightNum": "flight_number",
                                                      "TailNum": "tail_number",
                                                          "UniqueCarrier": "unique_carrier",
                                                          "Year": "year",
                                                     "Month": "month",
                                                     "DayOfWeek": "week_day",
                                                     "Origin": "origin",
                                                     "Dest": "destination"})

# Clean the data by dropping duplicates and setting the index
delay_transformed.drop_duplicates("id", inplace=True)
delay_transformed.set_index("id", inplace=True)

delay_transformed.head()

### Transform Landing DataFrame

In [None]:
#importing files 
landings_file = "Resources/air-traffic-landings-statistics.csv"
landings_df = pd.read_csv(landings_file)
landings_df.head()

In [None]:
# Changing column type to split the values
landings_df['Activity Period'] = landings_df['Activity Period'].astype(str)
landings_df['Year'] = landings_df['Activity Period'].str[0:4]
landings_df['Month'] = landings_df['Activity Period'].str[4:6]

In [None]:
# Changing type back into int
landings_df['Year'] = landings_df['Year'].astype(int)
landings_df['Month'] = landings_df['Month'].astype(int)

In [None]:
#Filtering data
landings_df = landings_df.loc[landings_df['Year'] == 2008]

In [None]:
#Dropping duplicates
landings_df.drop_duplicates(["Year", "Month", "Operating Airline IATA Code", "GEO Region","Landing Aircraft Type", "Aircraft Model", "Aircraft Version"], keep= 'last').head()

In [None]:
#Dropping splited column
landings_df.drop(["Activity Period"], axis=1).head()

In [None]:
#Creating column to set as 'id'
landings_df['ID'] = range(0, 0+len(landings_df))

In [None]:
# Create a filtered dataframe from specific columns
landings_cols = ["ID","Operating Airline IATA Code","Year", "Month", "GEO Region", 
                 "Landing Aircraft Type", "Aircraft Manufacturer", "Aircraft Model"]
landings_transformed = landings_df[landings_cols].copy()

In [None]:
# Rename the column headers
landings_transformed = landings_transformed.rename(columns={"ID":"id",
                                                            "Operating Airline IATA Code": "unique_carrier",
                                                          "Year": "year",
                                                          "Month": "month",
                                                     "GEO Region": "geo_region",
                                                     "Landing Aircraft Type": "aircraft_type",
                                                           "Aircraft Manufacturer": "aircraft_manufacturer",
                                                     "Aircraft Model": "aircraft_Model"})

#landings_transformed.drop_duplicates(["unique_carrier","geo_region"], keep= 'last')
landings_transformed.drop_duplicates("id", inplace=True)
landings_transformed.set_index("id", inplace=True)
landings_transformed.head()

### Transform Passenger DataFrame

In [None]:
#importing files 
passenger_file = "Resources/air-traffic-passenger-statistics.csv"
passenger_df = pd.read_csv(passenger_file)
passenger_df.head()

In [None]:
# Changing column type to split the values
passenger_df['Activity Period'] = passenger_df['Activity Period'].astype(str)
passenger_df['Year'] = passenger_df['Activity Period'].str[0:4]
passenger_df['Month'] = passenger_df['Activity Period'].str[4:6]

In [None]:
# Changing type back into int
passenger_df['Year'] = passenger_df['Year'].astype(int)
passenger_df['Month'] = passenger_df['Month'].astype(int)

In [None]:
#Filtering data
passenger_df = passenger_df.loc[passenger_df['Year'] == 2008]
passenger_df.head()

In [None]:
#Dropping duplicates
passenger_df.drop_duplicates(["Year","Operating Airline IATA Code", "GEO Region",
                              "Terminal","Boarding Area", "Passenger Count"], keep= 'last').head()

In [None]:
#Creating column to set as 'id'
passenger_df['ID'] = range(0, 0+len(passenger_df))

In [None]:
#Dropping splited column
passenger_df.drop(["Activity Period"], axis=1).head()

In [None]:
# Narrow the passenger data down to the columns kept for the passengers table
passenger_cols = [
    "ID", "Operating Airline IATA Code", "Year", "Month",
    "GEO Region", "Terminal", "Boarding Area", "Passenger Count",
]
passenger_transformed = passenger_df.loc[:, passenger_cols].copy()

# Source header -> snake_case column name
passenger_renames = {
    "ID": "id",
    "Operating Airline IATA Code": "unique_carrier",
    "Year": "year",
    "Month": "month",
    "GEO Region": "geo_region",
    "Terminal": "terminal",
    "Boarding Area": "boarding_area",
    "Passenger Count": "passengers_number",
}

# Rename, de-duplicate on the key, then promote the key to the index
passenger_transformed = (
    passenger_transformed
    .rename(columns=passenger_renames)
    .drop_duplicates(subset="id")
    .set_index("id")
)
passenger_transformed.head()

In [None]:
# Load step (currently disabled): the commented-out cells below would write the
# three transformed frames to the MySQL tables 'delays', 'landings' and
# 'passengers' through a SQLAlchemy engine.
# NOTE(review): the connection string embeds a username and password; read them
# from environment variables (or prompt for them) before re-enabling this.
#connection_string = "root:password@localhost/sfo_db"
#engine = create_engine(f'mysql://{connection_string}')

In [None]:
# Confirm tables
# NOTE(review): Engine.table_names() is deprecated in newer SQLAlchemy releases;
# sqlalchemy.inspect(engine).get_table_names() is the replacement - confirm
# against the installed version.
#engine.table_names()

In [None]:
# NOTE(review): if_exists='append' adds the rows again on every run, so
# re-running these cells would duplicate data in the tables.
#delay_transformed.to_sql(name='delays', con=engine, if_exists='append', index=True)


In [None]:
#landings_transformed.to_sql(name='landings', con=engine, if_exists='append', index=True)

In [None]:
#passenger_transformed.to_sql(name='passengers', con=engine, if_exists='append', index=True)

In [None]:
#Creating an array of unique values
unique_tailnum = delay_df["TailNum"].unique()

In [None]:
uniquelist = unique_tailnum + " is Not Assigned/Reserved"

In [None]:
uniquelist[2]

In [None]:
#Creating a list
tail_number = unique_tailnum.tolist()

In [None]:
#Creating list of urls for each unique Tail Number  
new_urls = ["https://registry.faa.gov/aircraftinquiry/NNum_Results.aspx?nNumberTxt=" + x for x in tail_number]

In [None]:
new_urls

In [None]:
name_list = []
manufacturer_list = []
model_list = []
year_list = []
tail_list = []

In [None]:
# Scrape aircraft details from the FAA registry page of each tail number.
def _first_text(soup, candidates):
    """Return the text of the first element matched by ``candidates``.

    ``candidates`` is a sequence of ``(tag_name, element_id)`` pairs tried in
    order; every lookup also requires the ``Results_DataText`` class.
    Returns ``None`` when none of the candidates is present in ``soup``.
    """
    for tag_name, element_id in candidates:
        node = soup.find(tag_name, class_='Results_DataText', attrs={'id': element_id})
        if node is not None:
            return node.text
    return None


def _parse_record(soup, not_assigned):
    """Extract ``(tail, model, manufacturer, year, name)`` from one result page.

    Each field is looked up under its primary element id first and under the
    ``content_drptrDeRegAircraft_*`` / fallback id second; a field that is
    found under neither is ``None``. ``tail`` is also ``None`` when the page
    label is one of the strings in ``not_assigned``.
    """
    tail_flag = soup.find("span", class_='Inquiry_InputLabel')
    tail = tail_flag.text if tail_flag is not None else None
    if tail in not_assigned:
        tail = None

    model = _first_text(soup, [
        ("span", 'content_drptrDeRegAircraft_lbDeRegModel_0'),
        ("td", 'content_Label7'),
    ])
    manufacturer = _first_text(soup, [
        ("span", 'content_lbMfrName'),
        ("span", 'content_drptrDeRegAircraft_lbDeRegMfrName_0'),
    ])
    year = _first_text(soup, [
        ("span", 'content_Label17'),
        ("span", 'content_drptrDeRegAircraft_lbDeRegYearMfr_0'),
    ])
    name = _first_text(soup, [
        ("span", 'content_lbOwnerName'),
        ("span", 'content_drptrDeRegAircraft_lbDROwnerName_0'),
    ])
    return tail, model, manufacturer, year, name


def scrape(urls=None, limit=3):
    """Fetch registry pages and fill the module-level result lists.

    Parameters
    ----------
    urls : sequence of str, optional
        Pages to fetch. Defaults to the module-level ``new_urls``.
    limit : int or None, optional
        Fetch only the first ``limit`` URLs (default 3, as before); ``None``
        fetches all of them.

    Side effects
    ------------
    Clears and then refills ``tail_list``, ``model_list``,
    ``manufacturer_list``, ``year_list`` and ``name_list``. Exactly one entry
    is appended to every list per URL, in URL order, so index ``i`` of each
    list describes the same aircraft; fields that could not be read (including
    pages whose request failed) are stored as ``None``. Clearing first means
    re-running the cell does not duplicate entries.
    """
    targets = new_urls if urls is None else urls
    if limit is not None:
        targets = targets[:limit]

    # Page labels that mean "no such aircraft"; a set gives cheap membership tests.
    not_assigned = set(uniquelist)

    # Order matches the tuple returned by _parse_record.
    columns = (tail_list, model_list, manufacturer_list, year_list, name_list)
    for column in columns:
        column.clear()

    counter = 0
    for one_url in targets:
        try:
            # A timeout keeps one unresponsive page from hanging the notebook.
            response = requests.get(one_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            record = _parse_record(soup, not_assigned)
            counter = counter + 1
            print(str(counter)+" completed")
        except requests.RequestException as error:
            # Network/HTTP failures are what can actually go wrong here;
            # keep the lists aligned by recording an empty row.
            record = (None,) * len(columns)
            print(f"Error. Skipping... ({one_url}: {error})")

        for column, value in zip(columns, record):
            column.append(value)

        # Pause between requests so the registry site is not hammered.
        time.sleep(1)

In [None]:
# Run the scraper; it appends its results to the module-level *_list variables.
scrape()

In [None]:
# Inspect the values collected for each field.
print(tail_list)

In [None]:
print(model_list)

In [None]:
print(manufacturer_list)

In [None]:
print(name_list)

In [None]:
print(year_list)

In [None]:
tail_dict ={"model_list":model_list}

In [None]:
tail_df = pd.DataFrame(tail_dict)

In [None]:
tail_df.drop_duplicates("model_list", inplace=True)

In [None]:
tail_df

In [None]:
aircraft_dict ={"tail_number":tail_number,
                 "model": model_list,
                 "Manufacturer": manufacturer_list,
                    "year": year_list
                  "airline_name": name_list}

In [None]:
aircraft_df = pd.DataFrame(aircraft_dict)

In [None]:
#Create column id 
aircraft_df["id"] = range(0, len(aircraft_df))

In [None]:
aircraft_df

In [None]:
#Drop duplicates 
aircraft_df.drop_duplicates("tail_number", inplace=True)

In [None]:
aircraft_df.to_csv("Resources/aircraft_df.csv", index = False)

In [None]:
# Sanity checks: compare how many values were collected for each field with
# the number of unique tail numbers.
len(model_list)

In [None]:
len(tail_number)

In [None]:
len(manufacturer_list)

In [None]:
len(name_list)