In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('annual_time_series_with_airports&airlines.csv')

In [4]:
df.columns

Index(['ORIGIN_AIRPORT_ID', 'ORIGIN_STATE_ABR', 'CODE', 'NAME', 'AIRLINE_ID',
       'UNIQUE_CARRIER_NAME', 'AIRCRAFT_TYPE', 'Description', '2019', '2020',
       '2021', '2022'],
      dtype='object')

In [5]:
# First, let's create the total_2019 and total_2022
df['total_2019'] = df.groupby('ORIGIN_AIRPORT_ID')['2019'].transform('sum')
df['total_2022'] = df.groupby('ORIGIN_AIRPORT_ID')['2022'].transform('sum')

# Now, let's create a mask for the AIRCRAFT_TYPE values of interest
mask = df['AIRCRAFT_TYPE'].isin([628, 629, 674, 675, 676])

# Use this mask to sum the values in 2019 and 2022 for these specific AIRCRAFT_TYPEs
df['total_50s_2019'] = df.loc[mask].groupby('ORIGIN_AIRPORT_ID')['2019'].transform('sum')
df['total_50s_2022'] = df.loc[mask].groupby('ORIGIN_AIRPORT_ID')['2022'].transform('sum')

# Fill NaN values (where the AIRCRAFT_TYPE is not one of the ones specified) with 0
df['total_50s_2019'].fillna(0, inplace=True)
df['total_50s_2022'].fillna(0, inplace=True)

# Calculate the percentages
df['pct_50_2019'] = df['total_50s_2019'] / df['total_2019'] * 100
df['pct_50_2022'] = df['total_50s_2022'] / df['total_2022'] * 100

# Finally, let's create the output_df
output_df = df[['ORIGIN_AIRPORT_ID', 'ORIGIN_STATE_ABR', 'CODE', 'NAME', 'total_2019', 'total_2022', 'total_50s_2019', 'pct_50_2019', 'total_50s_2022', 'pct_50_2022']].drop_duplicates()


In [6]:
output_df

Unnamed: 0,ORIGIN_AIRPORT_ID,ORIGIN_STATE_ABR,CODE,NAME,total_2019,total_2022,total_50s_2019,pct_50_2019,total_50s_2022,pct_50_2022
0,10397,GA,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",395557.0,314165.0,0.0,0.000000,0.0,0.000000
23,10397,GA,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",395557.0,314165.0,37002.0,9.354404,17658.0,5.620613
79,13930,IL,ORD,"Chicago, IL: Chicago O'Hare International",392793.0,291702.0,0.0,0.000000,0.0,0.000000
101,13930,IL,ORD,"Chicago, IL: Chicago O'Hare International",392793.0,291702.0,122896.0,31.287727,67913.0,23.281637
165,11292,CO,DEN,"Denver, CO: Denver International",288617.0,277694.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
10860,14092,NC,PGV,"Greenville, NC: Pitt Greenville",1484.0,786.0,0.0,0.000000,0.0,0.000000
10898,11415,TX,DRT,"Del Rio, TX: Del Rio International",714.0,683.0,0.0,0.000000,0.0,0.000000
10905,12129,MN,HIB,"Hibbing, MN: Range Regional",627.0,640.0,0.0,0.000000,0.0,0.000000
10916,10409,SD,ATY,"Watertown, SD: Watertown Regional",502.0,623.0,0.0,0.000000,0.0,0.000000


In [8]:
output_df.to_csv('after_correction_50_airports.csv', index=False)