In [3]:
import getpass
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect


### Transform Delay DataFrame

In [4]:
# Read the 2008 flight-delay CSV from the Resources folder and preview
# its last five rows
delay_2008 = "Resources/2008.csv"
delay_df = pd.read_csv(filepath_or_buffer=delay_2008)
delay_df.tail(5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
7009723,2008,12,13,6,1002.0,959,1204.0,1150,DL,1636,...,6.0,45.0,0,,0,,,,,
7009724,2008,12,13,6,834.0,835,1021.0,1023,DL,1637,...,5.0,23.0,0,,0,,,,,
7009725,2008,12,13,6,655.0,700,856.0,856,DL,1638,...,24.0,12.0,0,,0,,,,,
7009726,2008,12,13,6,1251.0,1240,1446.0,1437,DL,1639,...,13.0,13.0,0,,0,,,,,
7009727,2008,12,13,6,1110.0,1103,1413.0,1418,DL,1641,...,8.0,11.0,0,,0,,,,,


In [5]:
# Source column -> output column for the delay data.
# The dict order also fixes the column order of the selection below.
delay_column_map = {
    "FlightNum": "id",
    "UniqueCarrier": "unique_carrier",
    "Year": "year",
    "Month": "month",
    "DayOfWeek": "week_day",
}
delay_cols = list(delay_column_map)

# Select -> rename -> keep the first row seen for each flight number ->
# use the flight number as the index
delay_transformed = (
    delay_df[delay_cols]
    .rename(columns=delay_column_map)
    .drop_duplicates("id")
    .set_index("id")
)

delay_transformed.head()

Unnamed: 0_level_0,unique_carrier,year,month,week_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
335,WN,2008,1,4
3231,WN,2008,1,4
448,WN,2008,1,4
1746,WN,2008,1,4
3920,WN,2008,1,4


### Transform Landing DataFrame

In [43]:
# Read the landings statistics CSV from the Resources folder and preview
# its first five rows
landings_file = "Resources/air-traffic-landings-statistics.csv"
landings_df = pd.read_csv(filepath_or_buffer=landings_file)
landings_df.head(5)

Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Landing Aircraft Type,Aircraft Body Type,Aircraft Manufacturer,Aircraft Model,Aircraft Version,Landing Count,Total Landed Weight
0,200204,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Passenger,Narrow Body,Boeing,757,200,83,16434000
1,200204,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Passenger,Narrow Body,Boeing,757,300,3,672000
2,200204,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Passenger,Wide Body,Lockheed,L1011,0,27,9666000
3,200204,Aeroflot Russian International Airlines,,Aeroflot Russian International Airlines,,International,Europe,Passenger,Wide Body,Boeing,777,0,9,4139946
4,200204,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Narrow Body,Boeing,737,200,5,525000


In [44]:
# Cast Activity Period to text so it can be sliced by position:
# characters 0-3 become Year, characters 4-5 become Month
landings_period = landings_df['Activity Period'].astype(str)
landings_df['Activity Period'] = landings_period
landings_df['Year'] = landings_period.str[:4]
landings_df['Month'] = landings_period.str[4:6]

In [45]:
# The sliced Year/Month values are strings; convert both back to integers
for landings_part in ('Year', 'Month'):
    landings_df[landings_part] = landings_df[landings_part].astype(int)

In [46]:
# Keep only the 2008 rows.
# .copy() makes the result an independent frame, so adding columns to it
# later (the 'ID' column) does not raise SettingWithCopyWarning.
landings_df = landings_df.loc[landings_df['Year'] == 2008].copy()

In [47]:
# Preview the rows that remain after de-duplicating on carrier / region /
# aircraft type / model (keeping the last occurrence).
# NOTE: the result is only displayed, not assigned, so landings_df itself
# still contains every 2008 row.
landings_df.drop_duplicates(["Year","Operating Airline IATA Code", "GEO Region","Landing Aircraft Type","Aircraft Model"], keep='last').head()

Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Landing Aircraft Type,Aircraft Body Type,Aircraft Manufacturer,Aircraft Model,Aircraft Version,Landing Count,Total Landed Weight,Year,Month
4210,200801,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Wide Body,Boeing,767,3Y0,1,199000,2008,1
4216,200801,Air France,AF,Air France,AF,International,Europe,Passenger,Wide Body,Airbus,A330,300,1,401200,2008,1
4347,200802,ATA Airlines,TZ,ATA Airlines,TZ,Domestic,US,Passenger,Narrow Body,Boeing,737,800,1,146300,2008,2
4637,200804,Astar Air Cargo,ER,Astar Air Cargo,ER,Domestic,US,Freighter,Wide Body,McDonnell Douglas,DC-8,73,2,550000,2008,4
4761,200805,American Airlines,AA,American Airlines,AA,Domestic,US,Passenger,Narrow Body,Boeing,737,,1,144000,2008,5
4768,200805,Ameriflight,A8,Ameriflight,A8,Domestic,US,Freighter,Regional Jet,BAE-Avro,BAE-200,,2,25000,2008,5
4913,200806,EVA Airways,BR,EVA Airways,BR,International,Asia,Passenger,Wide Body,Boeing,747,400,30,18900000,2008,6
4919,200806,Federal Express,FX,Federal Express,FX,Domestic,US,Freighter,Wide Body,Airbus,A300,,1,308650,2008,6
5095,200807,SkyWest Airlines,OO,United Airlines - Pre 07/01/2013,UA,Domestic,US,Passenger,Turbo Prop,Embraer,ERJ,120,2154,55573200,2008,7
5156,200808,Alaska Airlines,AS,Alaska Airlines,AS,International,Canada,Passenger,Narrow Body,Boeing,737,700,24,3100800,2008,8


In [70]:
# Preview the frame without the Activity Period column that was split
# into Year/Month (display only; landings_df is not modified)
landings_df.drop(columns=["Activity Period"]).head(5)

Unnamed: 0,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Landing Aircraft Type,Aircraft Body Type,Aircraft Manufacturer,Aircraft Model,Aircraft Version,Landing Count,Total Landed Weight,Year,Month,ID
4206,ABX Air,GB,ABX Air,GB,Domestic,US,Freighter,Wide Body,Boeing,767,,46,13018000,2008,1,0
4207,ABX Air,GB,ABX Air,GB,Domestic,US,Freighter,Narrow Body,McDonnell Douglas,DC-9,41,22,2244000,2008,1,1
4208,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Passenger,Wide Body,Airbus,A330,200,17,6770000,2008,1,2
4209,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Wide Body,Boeing,767,233,1,278000,2008,1,3
4210,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Wide Body,Boeing,767,3Y0,1,199000,2008,1,4
4211,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Narrow Body,Airbus,A319,114,90,12105000,2008,1,5
4212,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Narrow Body,Airbus,A320,211,30,4260000,2008,1,6
4213,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Narrow Body,Airbus,A321,211,13,2229500,2008,1,7
4214,Air Canada,AC,Air Canada,AC,International,Canada,Passenger,Regional Jet,Embraer,ERJ,190,101,9574800,2008,1,8
4215,Air China,CA,Air China,CA,International,Asia,Passenger,Wide Body,Boeing,747,400,31,19530000,2008,1,9


In [71]:
# Add a sequential, zero-based 'ID' column to serve as the row identifier
landings_df['ID'] = range(len(landings_df))

In [55]:
# Source column -> output column for the landings data.
# The dict order also fixes the column order of the selection below.
landings_column_map = {
    "ID": "id",
    "Operating Airline IATA Code": "unique_carrier",
    "Year": "year",
    "Month": "month",
    "GEO Region": "geo_region",
    "Landing Aircraft Type": "aircraft_type",
    "Aircraft Manufacturer": "aircraft_manufacturer",
    "Aircraft Model": "aircraft_Model",
}
landings_cols = list(landings_column_map)

# Select -> rename -> de-duplicate on id -> use id as the index
landings_transformed = (
    landings_df[landings_cols]
    .rename(columns=landings_column_map)
    .drop_duplicates("id")
    .set_index("id")
)
landings_transformed.tail()

Unnamed: 0_level_0,unique_carrier,year,month,geo_region,aircraft_type,aircraft_manufacturer,aircraft_Model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1606,UA,2008,12,US,Passenger,Airbus,A320
1607,UA,2008,12,Mexico,Passenger,Airbus,A320
1608,VX,2008,12,US,Passenger,Airbus,A319
1609,VX,2008,12,US,Passenger,Airbus,A320
1610,VS,2008,12,Europe,Passenger,Boeing,747


### Transform Passenger DataFrame

In [1]:
# Read the passenger statistics CSV from the Resources folder and preview
# its first five rows
passenger_file = "Resources/air-traffic-passenger-statistics.csv"
passenger_df = pd.read_csv(filepath_or_buffer=passenger_file)
passenger_df.head(5)

NameError: name 'pd' is not defined

In [57]:
# Cast Activity Period to text so it can be sliced by position:
# characters 0-3 become Year, characters 4-5 become Month
passenger_period = passenger_df['Activity Period'].astype(str)
passenger_df['Activity Period'] = passenger_period
passenger_df['Year'] = passenger_period.str[:4]
passenger_df['Month'] = passenger_period.str[4:6]

In [58]:
# The sliced Year/Month values are strings; convert both back to integers
for passenger_part in ('Year', 'Month'):
    passenger_df[passenger_part] = passenger_df[passenger_part].astype(int)

In [63]:
# Keep only the 2008 rows.
# .copy() makes the result an independent frame, so adding columns to it
# later (the 'ID' column) does not raise SettingWithCopyWarning.
passenger_df = passenger_df.loc[passenger_df['Year'] == 2008].copy()
passenger_df.head()

Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Price Category Code,Terminal,Boarding Area,Passenger Count,Year,Month
3473,200801,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Deplaned,Other,International,A,2858,2008,1
3474,200801,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Enplaned,Other,International,A,2716,2008,1
3475,200801,Air Canada,AC,Air Canada,AC,International,Canada,Deplaned,Other,Terminal 3,E,19756,2008,1
3476,200801,Air Canada,AC,Air Canada,AC,International,Canada,Enplaned,Other,Terminal 3,E,18129,2008,1
3477,200801,Air China,CA,Air China,CA,International,Asia,Deplaned,Other,International,G,7603,2008,1


In [64]:
# Preview the rows that remain after de-duplicating on carrier / region /
# terminal / boarding area / passenger count (keeping the last occurrence).
# NOTE: the result is only displayed, not assigned, so passenger_df itself
# still contains every 2008 row.
passenger_df.drop_duplicates(["Year","Operating Airline IATA Code", "GEO Region",
                              "Terminal","Boarding Area", "Passenger Count"], keep='last').head()

Unnamed: 0,Activity Period,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Price Category Code,Terminal,Boarding Area,Passenger Count,Year,Month
3473,200801,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Deplaned,Other,International,A,2858,2008,1
3474,200801,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Enplaned,Other,International,A,2716,2008,1
3475,200801,Air Canada,AC,Air Canada,AC,International,Canada,Deplaned,Other,Terminal 3,E,19756,2008,1
3476,200801,Air Canada,AC,Air Canada,AC,International,Canada,Enplaned,Other,Terminal 3,E,18129,2008,1
3477,200801,Air China,CA,Air China,CA,International,Asia,Deplaned,Other,International,G,7603,2008,1
3478,200801,Air China,CA,Air China,CA,International,Asia,Enplaned,Other,International,G,6027,2008,1
3479,200801,Air France,AF,Air France,AF,International,Europe,Deplaned,Other,International,A,6827,2008,1
3480,200801,Air France,AF,Air France,AF,International,Europe,Enplaned,Other,International,A,6190,2008,1
3481,200801,Air New Zealand,NZ,Air New Zealand,NZ,International,Australia / Oceania,Deplaned,Other,International,G,8057,2008,1
3482,200801,Air New Zealand,NZ,Air New Zealand,NZ,International,Australia / Oceania,Enplaned,Other,International,G,8977,2008,1


In [68]:
# Add a sequential, zero-based 'ID' column to serve as the row identifier
passenger_df['ID'] = range(len(passenger_df))

In [74]:
# Preview the frame without the Activity Period column that was split
# into Year/Month (display only; passenger_df is not modified)
passenger_df.drop(columns=["Activity Period"]).head(5)

Unnamed: 0,Operating Airline,Operating Airline IATA Code,Published Airline,Published Airline IATA Code,GEO Summary,GEO Region,Activity Type Code,Price Category Code,Terminal,Boarding Area,Passenger Count,Year,Month,ID
3473,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Deplaned,Other,International,A,2858,2008,1,0
3474,Aer Lingus,EI,Aer Lingus,EI,International,Europe,Enplaned,Other,International,A,2716,2008,1,1
3475,Air Canada,AC,Air Canada,AC,International,Canada,Deplaned,Other,Terminal 3,E,19756,2008,1,2
3476,Air Canada,AC,Air Canada,AC,International,Canada,Enplaned,Other,Terminal 3,E,18129,2008,1,3
3477,Air China,CA,Air China,CA,International,Asia,Deplaned,Other,International,G,7603,2008,1,4


In [75]:
# Source column -> output column for the passenger data.
# The dict order also fixes the column order of the selection below.
passenger_column_map = {
    "ID": "id",
    "Operating Airline IATA Code": "unique_carrier",
    "Year": "year",
    "Month": "month",
    "GEO Region": "geo_region",
    "Terminal": "terminal",
    "Boarding Area": "boarding_area",
    "Passenger Count": "passengers_number",
}
passenger_cols = list(passenger_column_map)

# Select -> rename -> de-duplicate on id -> use id as the index
passenger_transformed = (
    passenger_df[passenger_cols]
    .rename(columns=passenger_column_map)
    .drop_duplicates("id")
    .set_index("id")
)
passenger_transformed.tail()

Unnamed: 0_level_0,unique_carrier,year,month,geo_region,terminal,boarding_area,passengers_number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1428,UA,2008,12,Mexico,International,G,71
1429,VX,2008,12,US,International,A,85284
1430,VX,2008,12,US,International,A,86713
1431,VS,2008,12,Europe,International,A,8987
1432,VS,2008,12,Europe,International,A,8447


In [78]:
# Build the MySQL connection URL without storing credentials in the notebook.
# The user comes from SFO_DB_USER (default "root"); the password comes from
# SFO_DB_PASSWORD, or an interactive prompt when that variable is unset.
db_user = os.environ.get("SFO_DB_USER", "root")
db_password = os.environ.get("SFO_DB_PASSWORD") or getpass.getpass("MySQL password: ")

# URL-escape the password so characters such as '@' or '/' cannot break the URL
connection_string = f"{db_user}:{quote_plus(db_password)}@localhost/sfo_db"
engine = create_engine(f'mysql://{connection_string}')

In [79]:
# Confirm the target tables exist before loading.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names.
inspect(engine).get_table_names()

['delays', 'landings', 'passengers']

In [80]:
# Append the delay rows to the existing `delays` table, writing the
# DataFrame index (id) as a column. Because the mode is 'append', running
# this cell again adds the same rows again.
delay_transformed.to_sql(
    name='delays',
    con=engine,
    if_exists='append',
    index=True,
)


In [81]:
# Append the landing rows to the existing `landings` table, writing the
# DataFrame index (id) as a column. Because the mode is 'append', running
# this cell again adds the same rows again.
landings_transformed.to_sql(
    name='landings',
    con=engine,
    if_exists='append',
    index=True,
)

In [82]:
# Append the passenger rows to the existing `passengers` table, writing the
# DataFrame index (id) as a column. Because the mode is 'append', running
# this cell again adds the same rows again.
passenger_transformed.to_sql(
    name='passengers',
    con=engine,
    if_exists='append',
    index=True,
)