In [60]:
import os
from pathlib import Path

import pandas as pd

# Folder that holds the cleaned CSV exports. It defaults to the folder this
# notebook was originally run from; set the FLIGHT_DATA_DIR environment variable
# to point the notebook at a different location without editing the code.
DATA_DIR = Path(os.environ.get(
    'FLIGHT_DATA_DIR',
    r'C:\Users\hopeh\Desktop\data_science_bootcamp\flight_times_capstone',
))

# Load the cleaned flight data
flight_data = pd.read_csv(DATA_DIR / 'cleaned_flight_data.csv')

# Load the cleaned IATA data
iata_data = pd.read_csv(DATA_DIR / 'cleaned_iata_data.csv')

# Display the first few rows of each DataFrame to confirm they loaded correctly
print(flight_data.head())
print(iata_data.head())


   year  month  day_of_month  day_of_week        date op_unique_carrier  \
0  2022      5             1            7  2022-05-01                9e   
1  2022      5             1            7  2022-05-01                9e   
2  2022      5             1            7  2022-05-01                9e   
3  2022      5             1            7  2022-05-01                9e   
4  2022      5             1            7  2022-05-01                9e   

  tail_num  op_carrier_fl_num origin_iata                origin_city  ...  \
0   n131ev               4633         atl                    atlanta  ...   
1   n131ev               4633         hpn               white plains  ...   
2   n131ev               4717         tys                  knoxville  ...   
3   n131ev               4958         atl                    atlanta  ...   
4   n131ev               4958         oaj  jacksonville/camp lejeune  ...   

  flights distance  distance_group  carrier_delay  weather_delay  nas_delay  \
0     1

In [61]:
# List the column labels of both frames, one Index per line
print(flight_data.columns, iata_data.columns, sep='\n')

Index(['year', 'month', 'day_of_month', 'day_of_week', 'date',
       'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_iata',
       'origin_city', 'dest_iata', 'dest_city', 'crs_dep_time', 'dep_time',
       'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'crs_arr_time',
       'arr_time', 'cancelled', 'diverted', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'distance_group', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay', 'origin_state', 'dest_state'],
      dtype='object')
Index(['state', 'iata', 'airport_name', 'latitude', 'longitude', 'unique_id',
       'state_abbr'],
      dtype='object')


In [62]:
# Drop columns by label from each frame.
# NOTE(review): year/month/day_of_month look redundant with `date`, and `state`
# with `state_abbr` - confirm nothing downstream needs them.
flight_data = flight_data.drop(labels=['year', 'month', 'day_of_month'], axis='columns')
iata_data = iata_data.drop(labels=['state', 'unique_id'], axis='columns')

In [63]:
# Re-list the column labels to confirm the drop above took effect
print(flight_data.columns, iata_data.columns, sep='\n')

Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state'],
      dtype='object')
Index(['iata', 'airport_name', 'latitude', 'longitude', 'state_abbr'], dtype='object')


In [64]:
# (rows, columns) of each frame, flight data first
print(f"{flight_data.shape}\n{iata_data.shape}")

(13105004, 32)
(2029, 5)


In [65]:
# Function to clean column names and string values
def clean_dataframe(df, copy=False):
    """Lowercase and strip whitespace from column names and string cell values.

    Only values that are actual ``str`` instances are changed. Any other object
    found in an object-dtype column (numbers, ``None``/NaN, ...) is passed
    through untouched instead of being replaced by NaN, and non-string column
    labels are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to normalise.
    copy : bool, default False
        When False the frame is modified in place and the same object is
        returned. When True a copy is cleaned and ``df`` is left untouched.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (``df`` itself unless ``copy=True``).
    """
    def _normalise(value):
        # Anything that is not text is returned exactly as it came in.
        return value.lower().strip() if isinstance(value, str) else value

    if copy:
        df = df.copy()

    # Convert column names to lowercase and strip whitespace
    df.columns = df.columns.map(_normalise)
    # Convert all string values to lowercase and strip whitespace
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].map(_normalise)
    return df

# Run both frames through clean_dataframe; the returned frames replace the originals
flight_data, iata_data = clean_dataframe(flight_data), clean_dataframe(iata_data)

# From here on both frames carry lowercase, whitespace-trimmed column names and string values


In [66]:
# Preview the first rows of each cleaned frame
print(flight_data.head(), iata_data.head(), sep='\n')

   day_of_week        date op_unique_carrier tail_num  op_carrier_fl_num  \
0            7  2022-05-01                9e   n131ev               4633   
1            7  2022-05-01                9e   n131ev               4633   
2            7  2022-05-01                9e   n131ev               4717   
3            7  2022-05-01                9e   n131ev               4958   
4            7  2022-05-01                9e   n131ev               4958   

  origin_iata                origin_city dest_iata                  dest_city  \
0         atl                    atlanta       hpn               white plains   
1         hpn               white plains       atl                    atlanta   
2         tys                  knoxville       atl                    atlanta   
3         atl                    atlanta       oaj  jacksonville/camp lejeune   
4         oaj  jacksonville/camp lejeune       atl                    atlanta   

   crs_dep_time  ...  flights  distance  distance_group 

In [67]:
# Attach the airport attributes from iata_data to every flight, once for the
# origin airport and once for the destination airport.
#
# Both joins are inner joins (pandas' default, spelled out here), so a flight
# whose airport code is missing from iata_data is dropped from the result. The
# row counts printed at the end make such losses visible; a count that grows
# instead would mean iata_data holds the same code more than once.
rows_before = len(flight_data)

# Merge on origin_iata.
# NOTE(review): per the column listings printed earlier, no iata_data column name
# collides with a flight_data column, so the '_origin' suffix is never applied and
# the origin attributes keep their bare names (iata, latitude, ...).
merged_data = flight_data.merge(
    iata_data,
    how='inner',
    left_on='origin_iata',
    right_on='iata',
    suffixes=('', '_origin'),
)
rows_after_origin = len(merged_data)

# Merge on dest_iata. The lookup columns now collide with the origin attributes
# added above, so the destination copies receive the '_dest' suffix.
merged_data = merged_data.merge(
    iata_data,
    how='inner',
    left_on='dest_iata',
    right_on='iata',
    suffixes=('', '_dest'),
)

# Now merged_data contains the combined information from both datasets
print(
    f"flights: {rows_before:,} -> {rows_after_origin:,} after origin join"
    f" -> {len(merged_data):,} after destination join"
)


In [68]:
# Preview the first five merged rows
print(merged_data.head(n=5))

   day_of_week        date op_unique_carrier tail_num  op_carrier_fl_num  \
0            7  2022-05-01                9e   n131ev               4633   
1            7  2022-05-01                9e   n131ev               4633   
2            7  2022-05-01                9e   n131ev               4717   
3            7  2022-05-01                9e   n131ev               4958   
4            7  2022-05-01                9e   n131ev               4958   

  origin_iata                origin_city dest_iata                  dest_city  \
0         atl                    atlanta       hpn               white plains   
1         hpn               white plains       atl                    atlanta   
2         tys                  knoxville       atl                    atlanta   
3         atl                    atlanta       oaj  jacksonville/camp lejeune   
4         oaj  jacksonville/camp lejeune       atl                    atlanta   

   crs_dep_time  ...  iata                              

In [69]:
# Column labels after both joins (origin attributes first, then the *_dest copies)
print(f"{merged_data.columns}")

Index(['day_of_week', 'date', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_iata', 'origin_city', 'dest_iata',
       'dest_city', 'crs_dep_time', 'dep_time', 'taxi_out', 'wheels_off',
       'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'cancelled',
       'diverted', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time',
       'flights', 'distance', 'distance_group', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'origin_state', 'dest_state', 'iata', 'airport_name', 'latitude',
       'longitude', 'state_abbr', 'iata_dest', 'airport_name_dest',
       'latitude_dest', 'longitude_dest', 'state_abbr_dest'],
      dtype='object')


In [70]:
# Spot-check: every merged row whose origin airport code is 'stl'
print(merged_data.loc[merged_data['origin_iata'].eq('stl')])

          day_of_week        date op_unique_carrier tail_num  \
100                 7  2022-05-01                9e   n295pq   
228                 7  2022-05-01                9e   n330pq   
307                 7  2022-05-01                9e   n482px   
662                 7  2022-05-01                9e   n936xj   
1705                7  2022-05-01                aa   n738us   
...               ...         ...               ...      ...   
12950971            2  2024-04-30                yx   n426yx   
12951019            2  2024-04-30                yx   n436yx   
12951149            2  2024-04-30                yx   n745yx   
12951181            2  2024-04-30                yx   n755yx   
12951294            2  2024-04-30                yx   n880rw   

          op_carrier_fl_num origin_iata origin_city dest_iata    dest_city  \
100                    5372         stl   st. louis       lga     new york   
228                    5094         stl   st. louis       lga     new york 

In [71]:
# Drop the origin and destination airport-name columns added by the joins
merged_data = merged_data.drop(labels=['airport_name', 'airport_name_dest'], axis='columns')

In [72]:
# Write merged_data to flights_iata.csv (path is relative to the working directory),
# leaving the row index out of the file
merged_data.to_csv(path_or_buf='flights_iata.csv', index=False)