#### Loading in packages and data / selecting columns and manipulating

In [1]:
import pandas as pd
import numpy as np
from datetime import date
import holidays
import seaborn as sns

In [2]:
train = pd.read_csv('D:/Summer Practicum/1_Data/training_airlines.csv', low_memory=False)
test = pd.read_csv('D:/Summer Practicum/1_Data/testing_airlines.csv', low_memory=False)
validate = pd.read_csv('D:/Summer Practicum/1_Data/validation_airlines.csv', low_memory=False)
df = pd.concat([train,test,validate])
df.columns = df.columns.str.replace(' ', '')

In [3]:
#selecting specific columns from the training data
df_trim = df[["OriginAirportID", "Flights", "FlightDate", "Month", "DayofMonth", "DayOfWeek","Operating_Airline","Origin", "OriginCityName", "OriginState","Dest","DestCityName",
"DestState","DepTime","DepDelay","DepDel15","DepartureDelayGroups","TaxiOut","TaxiIn","ArrTime","ArrDelay","ArrDel15", "ArrDelayMinutes",
"ArrivalDelayGroups",'Cancelled',"Diverted","ActualElapsedTime","AirTime","Distance","DistanceGroup",
"CarrierDelay","WeatherDelay","NASDelay","SecurityDelay","LateAircraftDelay"]]

In [4]:
#filling na values in delay fields
df_trim[["NASDelay", "SecurityDelay", "CarrierDelay", "WeatherDelay", "LateAircraftDelay"]] = df_trim[["NASDelay", \
     "SecurityDelay", "CarrierDelay", "WeatherDelay", "LateAircraftDelay"]].fillna(0)

#creating a total delay field
df_trim["TotalDelay"] = df_trim[["NASDelay", "SecurityDelay", "CarrierDelay", "WeatherDelay", "LateAircraftDelay"]].sum(axis=1)

#field for if flight was cancelled or diverted
df_trim["CancOrDiv"] = df_trim["Cancelled"] + df_trim["Diverted"]
df_trim["CancOrDiv2"] = np.where((df_trim["Cancelled"] == 1) | df_trim["Diverted"] == 1, 1, 0)
df_trim["CancOrDiv2"] == df_trim["CancOrDiv"]

#field for if a flight was delayed
df_trim["Delayed"] = np.where((df_trim["ArrDelayMinutes"]) > 0, 1, 0)

#creating a region column
def state_to_region(state):
    West = ["WA", "MT", "ID", "WY", "OR", "CA", "NV", "UT", "CO", "AZ", "NM", "AK", "HI"]
    South = ["TX", "OK", "AR", "LA", "MS", "AL", "TN", "KY", "FL", "GA", "SC", "NC", "VA", "MD", "DE"]
    Northeast = ["PA", "NJ", "NY", "CT", "MA", "RI", "ME", "NH", "VT"]
    Midwest = ["ND", "SD", "NE", "KS", "MO", "IA", "MN", "WI", "IL", "IN", "OH", "MI"]
    if state in West:
        return "West"
    elif state in South:
        return "South"
    elif state in Northeast:
        return "Northeast"
    elif state in Midwest:
        return "Midwest"
    
df_trim["Region"] = df_trim["OriginState"].map(state_to_region)

#creating is holiday field
df_trim['FlightDate'] = pd.to_datetime(df_trim['FlightDate'])
df_trim.columns = df_trim.columns.str.replace(' ','')

us_holidays = holidays.US(years=range(df_trim['FlightDate'].min().year,df_trim['FlightDate'].max().year +1))
holiday_dates = {date for date, name in us_holidays.items()}

df_trim['Is_Holiday'] = df_trim['FlightDate'].dt.date.isin(holiday_dates)
df_trim['Is_Delayed'] = (df_trim['ArrDelay'] > 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trim[["NASDelay", "SecurityDelay", "CarrierDelay", "WeatherDelay", "LateAircraftDelay"]] = df_trim[["NASDelay", \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_trim["TotalDelay"] = df_trim[["NASDelay", "SecurityDelay", "CarrierDelay", "WeatherDelay", "LateAircraftDelay"]].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

#### Want to remove any flights not involving the top airports (as an origin or destination)

In [5]:
#which airports are top airports
origin_df = df_trim.groupby("Origin")["Flights"].value_counts().reset_index(name="Yearly_Flights")
origin_df_trim = origin_df[origin_df["Yearly_Flights"] >= 10000]
origin_df_trim.sort_values("Yearly_Flights", ascending=True)

dest_df = df_trim.groupby("Dest")["Flights"].value_counts().reset_index(name="Yearly_Flights")
dest_df_trim = dest_df[dest_df["Yearly_Flights"] >= 10000]
dest_df_trim.sort_values("Yearly_Flights", ascending=True)

dest_list = list(dest_df_trim["Dest"])
origin_list = list(origin_df_trim["Origin"])

combined_list = list(set(dest_list + origin_list))
len(combined_list)

78

In [6]:
#remove flights from airports not having more than 10,000 flights

df_2 = df_trim[(df_trim["Origin"].isin(combined_list)) | (df_trim["Dest"].isin(combined_list))]

print(f"Original size: {df_trim.shape[0]}, New size: {df_2.shape[0]}")

Original size: 4921235, New size: 4865853


#### Creating a dataframe that has the holiday, delay, divcancel, extreme delay, and carrier fault at each airport for all airlines

In [7]:
#what percent of delays are the airlines fault
when_delayed_df_total = df_2[df_2["TotalDelay"] != 0][["NASDelay", "SecurityDelay", "CarrierDelay", 
                                                                   "WeatherDelay", "LateAircraftDelay", "Origin", "TotalDelay"]]

when_delayed_df_total = when_delayed_df_total.reset_index()

when_delayed_df_total["PercentAirlineFault"] = ((when_delayed_df_total["CarrierDelay"]) / when_delayed_df_total["TotalDelay"])

fault_at_airports_total = when_delayed_df_total.groupby("Origin")["PercentAirlineFault"].mean().reset_index()
fault_at_airports_total.shape

(234, 2)

In [8]:
#percent of flights delayed and flights delayed/cancelled
airport_stats_df_total = df_2.groupby("Origin").agg(
    percent_delayed = ("Delayed", "mean"),
    percent_diverted_cancelled = ("CancOrDiv", "mean")).reset_index() 

airport_stats_df_total.shape

(234, 3)

In [9]:
#holiday performance at each airport
holiday_df_total = df_2[df_2["Is_Holiday"] == 1]

holiday_performance_df_total= holiday_df_total.groupby("Origin")["Delayed"].mean().reset_index()
holiday_performance_df_total.shape

(232, 2)

In [10]:
#how extreme is the average delay
only_delayed_flights_total = df_2[df_2["Delayed"] == 1]
how_extreme_delays_total = only_delayed_flights_total.groupby("Origin")["ArrDelayMinutes"].mean().reset_index()

In [11]:
#merging all of the dataframes together

merge2t = pd.merge(airport_stats_df_total, fault_at_airports_total, on="Origin", how="left")

merge3t = pd.merge(merge2t, holiday_performance_df_total, on="Origin", how="left")

merge4t = pd.merge(merge3t, how_extreme_delays_total, on="Origin", how="left")

Final_merget = merge4t[["Origin", "percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault"]]
Final_merget["HolidayPercentDelayed"] = merge4t["Delayed"]
Final_merget["HowExtremeDelay"] = merge4t["ArrDelayMinutes"]
Final_merget["HolidayPercentDelayed"] = Final_merget["HolidayPercentDelayed"].fillna(0)
Final_merget

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merget["HolidayPercentDelayed"] = merge4t["Delayed"]


Unnamed: 0,Origin,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay
0,ABE,0.386694,0.029106,0.308392,0.450000,43.365591
1,ABQ,0.358389,0.021467,0.255210,0.342020,40.937511
2,ACK,0.350123,0.041769,0.135145,0.250000,61.807018
3,ACY,0.267287,0.032868,0.236166,0.237705,48.984919
4,ADK,0.432692,0.028846,0.069621,0.000000,37.555556
...,...,...,...,...,...,...
229,TVC,0.393728,0.013937,0.415341,0.473684,39.951327
230,TYS,0.300554,0.014500,0.194414,0.318750,51.291228
231,USA,0.605042,0.029412,0.166805,0.357143,60.118056
232,VPS,0.326290,0.016149,0.279890,0.313043,47.070768


#### Alaska airlines analysis

In [12]:
df_alaska = df_2[df_2["Operating_Airline"] == "AS"]
df_alaska.shape

(227699, 42)

In [13]:
#where alaska is flying - regions
alaska_percent_regions = df_alaska.groupby("Region")["Flights"].sum() / len(df_alaska["Flights"])
alaska_percent_regions * 100

Region
Midwest       3.140989
Northeast     4.895059
South         8.307019
West         83.656933
Name: Flights, dtype: float64

In [14]:
#Where alaska is flying - airports
alaska_percent_airports = (pd.DataFrame(df_alaska.groupby("Origin")["Flights"].sum() / len(df_alaska["Flights"]))).reset_index()
alaska_percent_airports.rename(columns={"Flights" : "Percentage_airline_flights"}, inplace=True)
alaska_percent_airports


Unnamed: 0,Origin,Percentage_airline_flights
0,ABQ,0.003026
1,ADK,0.000457
2,ADQ,0.004023
3,AKN,0.001010
4,ANC,0.063329
...,...,...
79,SNA,0.015472
80,STL,0.002398
81,STS,0.000843
82,TPA,0.003606


In [15]:
#what percent of delays are the airlines fault
when_delayed_df = df_alaska[df_alaska["TotalDelay"] != 0][["NASDelay", "SecurityDelay", "CarrierDelay", 
                                                                   "WeatherDelay", "LateAircraftDelay", "Origin", "TotalDelay"]]

when_delayed_df = when_delayed_df.reset_index()

when_delayed_df["PercentAirlineFault"] = ((when_delayed_df["CarrierDelay"]) / when_delayed_df["TotalDelay"])

fault_at_airports = when_delayed_df.groupby("Origin")["PercentAirlineFault"].mean().reset_index()
fault_at_airports

Unnamed: 0,Origin,PercentAirlineFault
0,ABQ,0.253801
1,ADK,0.069621
2,ADQ,0.259974
3,AKN,0.032173
4,ANC,0.325114
...,...,...
79,SNA,0.184864
80,STL,0.394896
81,STS,0.156903
82,TPA,0.187796


In [16]:
#percent of flights delayed and flights delayed/cancelled
airport_stats_df = df_alaska.groupby("Origin").agg(
    percent_delayed = ("Delayed", "mean"),
    percent_diverted_cancelled = ("CancOrDiv", "mean")).reset_index() 

airport_stats_df

Unnamed: 0,Origin,percent_delayed,percent_diverted_cancelled
0,ABQ,0.338171,0.027576
1,ADK,0.432692,0.028846
2,ADQ,0.279476,0.033843
3,AKN,0.569565,0.021739
4,ANC,0.385298,0.023994
...,...,...,...
79,SNA,0.314789,0.034346
80,STL,0.358974,0.025641
81,STS,0.489583,0.015625
82,TPA,0.337393,0.035323


In [17]:
#holiday performance at each airport
holiday_df = df_alaska[df_alaska["Is_Holiday"] == 1]

holiday_performance_df = holiday_df.groupby("Origin")["Delayed"].mean().reset_index()
holiday_performance_df.head(10)

Unnamed: 0,Origin,Delayed
0,ABQ,0.392857
1,ADQ,0.321429
2,AKN,0.4
3,ANC,0.369942
4,ATL,0.342105
5,AUS,0.382812
6,BET,0.545455
7,BNA,0.555556
8,BOI,0.46875
9,BOS,0.325843


In [18]:
#how extreme is the average delay
only_delayed_flights = df_alaska[df_alaska["Delayed"] == 1]
how_extreme_delays = only_delayed_flights.groupby("Origin")["ArrDelayMinutes"].mean().reset_index()

In [19]:
#merging all of the dataframes together
merge1 = pd.merge(alaska_percent_airports[["Origin", "Percentage_airline_flights"]], airport_stats_df, how="left", on="Origin")

merge2 = pd.merge(merge1, fault_at_airports, on="Origin", how="left")

merge3 = pd.merge(merge2, holiday_performance_df, on="Origin", how="left")

merge4 = pd.merge(merge3, how_extreme_delays, on="Origin", how="left")

Final_merge = merge4[["Origin", "Percentage_airline_flights", "percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault"]]
Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]
Final_merge["HowExtremeDelay"] = merge4["ArrDelayMinutes"]
Final_merge["HolidayPercentDelayed"] = Final_merge["HolidayPercentDelayed"].fillna(0)
Final_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]


Unnamed: 0,Origin,Percentage_airline_flights,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay
0,ABQ,0.003026,0.338171,0.027576,0.253801,0.392857,32.459227
1,ADK,0.000457,0.432692,0.028846,0.069621,0.000000,37.555556
2,ADQ,0.004023,0.279476,0.033843,0.259974,0.321429,30.496094
3,AKN,0.001010,0.569565,0.021739,0.032173,0.400000,43.763359
4,ANC,0.063329,0.385298,0.023994,0.325114,0.369942,23.277178
...,...,...,...,...,...,...,...
79,SNA,0.015472,0.314789,0.034346,0.184864,0.360294,25.578900
80,STL,0.002398,0.358974,0.025641,0.394896,0.562500,35.244898
81,STS,0.000843,0.489583,0.015625,0.156903,0.625000,27.595745
82,TPA,0.003606,0.337393,0.035323,0.187796,0.300000,39.079422


In [20]:
#Create a total score for each airport

#Scoring for all columns with for loop
columns_of_interest = ["percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault", "HolidayPercentDelayed", "HowExtremeDelay"]

for column in columns_of_interest:

    quantiles = Final_merget[[column]].quantile([0.25, 0.75])
    twenty_fifth = quantiles.loc[0.25][column]
    seventy_fifth = quantiles.loc[0.75][column]

    Final_merge[("ScoreFrom_" + column)] = 0

    for i in range(len(Final_merge["Origin"])):
        value = Final_merge[column][i]

        if value <= twenty_fifth:
            points = 1
        elif value > twenty_fifth and value <= seventy_fifth:
            points = 0.5
        elif value > seventy_fifth:
            points = 0.25

        Final_merge[("ScoreFrom_" + column)][i] = points

#Adding a modifer for the size of the airport

percent_flights_quantiles = Final_merge[["Percentage_airline_flights"]].quantile([0.25, 0.75])
twenty_fifth = percent_flights_quantiles.loc[0.25]["Percentage_airline_flights"]
seventy_fifth = percent_flights_quantiles.loc[0.75]["Percentage_airline_flights"]

Final_merge["ScoreFrom_Size"] = 0

for i in range(len(Final_merge["Origin"])):
    value = Final_merge["Percentage_airline_flights"][i]

    if value >= seventy_fifth:
            points = .5
    elif value < seventy_fifth and value > twenty_fifth:
            points = 0.25
    elif value <= twenty_fifth:
            points = 0

    Final_merge["ScoreFrom_Size"][i] = points

Final_merge["TotalScore"] = Final_merge[["ScoreFrom_percent_delayed", "ScoreFrom_percent_diverted_cancelled","ScoreFrom_PercentAirlineFault",
                                         "ScoreFrom_HolidayPercentDelayed", "ScoreFrom_HowExtremeDelay", "ScoreFrom_Size"]].sum(axis=1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  Final_merge[("ScoreFrom_" + column)][i] = points
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge[("

In [21]:
#Bringing in latitude and longitude for the airports
from airportsdata import load

airports = load('IATA')

iata_codes = list(Final_merge["Origin"])

data = []

for code in iata_codes:
    info = airports.get(code)
    if info:
        data.append({
            "IATA" : code,
            "Name" : info["name"],
            "State" : info["subd"],
            "Latitude" : info["lat"],
            "Longitude" : info["lon"]
        })

    else:
        data.append({
            "IATA" : code,
            "Name" : "Not found",
            "State" : "",
            "Latitude" : None,
            "Longitude" : None
        })

airports_geog_info_df = pd.DataFrame(data)

print(airports_geog_info_df)

   IATA                                         Name       State  Latitude  \
0   ABQ    Albuquerque International Sunport Airport  New Mexico  35.03893   
1   ADK                                 Adak Airport      Alaska  51.88358   
2   ADQ                               Kodiak Airport      Alaska  57.74979   
3   AKN                          King Salmon Airport      Alaska  58.67649   
4   ANC  Ted Stevens Anchorage International Airport      Alaska  61.17408   
..  ...                                          ...         ...       ...   
79  SNA             John Wayne/Orange County Airport  California  33.67566   
80  STL       St Louis Lambert International Airport    Missouri  38.74870   
81  STS     Charles M Schulz - Sonoma County Airport  California  38.50969   
82  TPA                  Tampa International Airport     Florida  27.97547   
83  TUS                 Tucson International Airport     Arizona  32.11607   

    Longitude  
0  -106.60826  
1  -176.64248  
2  -152.49394  

In [22]:
#Merge airport geographic information with airport information

merge_locations = pd.merge(Final_merge, airports_geog_info_df, how="left", left_on="Origin", right_on="IATA")

merge_locations.to_csv("Alaska_AirportScores_Locations.csv", index=False)


In [23]:
#Find the 5 airports above the fiftieth percentile with the worst score
fifty = merge_locations["Percentage_airline_flights"].median()
merge_locations[merge_locations["Percentage_airline_flights"] > fifty].sort_values("TotalScore", ascending=True).head(5)


Unnamed: 0,Origin,Percentage_airline_flights,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay,ScoreFrom_percent_delayed,ScoreFrom_percent_diverted_cancelled,ScoreFrom_PercentAirlineFault,ScoreFrom_HolidayPercentDelayed,ScoreFrom_HowExtremeDelay,ScoreFrom_Size,TotalScore,IATA,Name,State,Latitude,Longitude
31,FLL,0.003961,0.487805,0.059867,0.224302,0.45,42.268182,0.25,0.25,0.5,0.25,0.5,0.25,2.0,FLL,Fort Lauderdale/Hollywood International Airport,Florida,26.07167,-80.14969
21,DCA,0.007993,0.410989,0.03022,0.210138,0.328571,41.576203,0.5,0.25,0.5,0.5,0.5,0.25,2.5,DCA,Ronald Reagan Washington Ntl Airport,Dist. Of Columbia,38.85144,-77.03772
52,OGG,0.012806,0.500686,0.022634,0.224151,0.431034,39.752055,0.25,0.5,0.5,0.5,0.5,0.5,2.75,OGG,Kahului Airport,Hawaii,20.89865,-156.43046
9,BOI,0.006684,0.387648,0.020368,0.254058,0.46875,25.330508,0.5,0.5,0.5,0.25,1.0,0.25,3.0,BOI,Boise Air Trml/Gowen Field,Idaho,43.56436,-116.22286
23,DFW,0.008379,0.379979,0.031447,0.233816,0.416667,35.198621,0.5,0.25,0.5,0.5,1.0,0.25,3.0,DFW,Dallas-Fort Worth International Airport,Texas,32.89723,-97.03769


#### Jet Blue Analysis

In [24]:
df_jetblue = df_2[df_2["Operating_Airline"] == "B6"]
df_jetblue.shape

(279120, 42)

In [25]:
#where jetblue is flying - regions
jetblue_percent_regions = df_jetblue.groupby("Region")["Flights"].sum() / len(df_jetblue["Flights"])
jetblue_percent_regions * 100

Region
Midwest       2.286830
Northeast    49.623818
South        32.976498
West         10.963385
Name: Flights, dtype: float64

In [26]:
#Where jetblue is flying - airports
jetblue_percent_airports = (pd.DataFrame(df_jetblue.groupby("Origin")["Flights"].sum() / len(df_jetblue["Flights"]))).reset_index()
jetblue_percent_airports.rename(columns={"Flights" : "Percentage_airline_flights"}, inplace=True)
jetblue_percent_airports

Unnamed: 0,Origin,Percentage_airline_flights
0,ABQ,0.000781
1,ACK,0.002916
2,ALB,0.002429
3,ATL,0.008298
4,AUS,0.004439
...,...,...
66,SMF,0.001652
67,SRQ,0.003343
68,STT,0.002949
69,SYR,0.003816


In [27]:
#what percent of delays are the airlines fault
when_delayed_df = df_jetblue[df_jetblue["TotalDelay"] != 0][["NASDelay", "SecurityDelay", "CarrierDelay", 
                                                                   "WeatherDelay", "LateAircraftDelay", "Origin", "TotalDelay"]]

when_delayed_df = when_delayed_df.reset_index()

when_delayed_df["PercentAirlineFault"] = ((when_delayed_df["CarrierDelay"]) / when_delayed_df["TotalDelay"])

fault_at_airports = when_delayed_df.groupby("Origin")["PercentAirlineFault"].mean().reset_index()
fault_at_airports

Unnamed: 0,Origin,PercentAirlineFault
0,ABQ,0.170550
1,ACK,0.135145
2,ALB,0.239163
3,ATL,0.429108
4,AUS,0.360460
...,...,...
66,SMF,0.305418
67,SRQ,0.274008
68,STT,0.223779
69,SYR,0.331678


In [28]:
#percent of flights delayed and flights delayed/cancelled
airport_stats_df = df_jetblue.groupby("Origin").agg(
    percent_delayed = ("Delayed", "mean"),
    percent_diverted_cancelled = ("CancOrDiv", "mean")).reset_index() 

airport_stats_df

Unnamed: 0,Origin,percent_delayed,percent_diverted_cancelled
0,ABQ,0.660550,0.018349
1,ACK,0.350123,0.041769
2,ALB,0.452802,0.028024
3,ATL,0.552677,0.018998
4,AUS,0.489104,0.025020
...,...,...,...
66,SMF,0.572668,0.021692
67,SRQ,0.535906,0.041801
68,STT,0.425273,0.029162
69,SYR,0.445070,0.046948


In [29]:
#holiday performance at each airport
holiday_df = df_jetblue[df_jetblue["Is_Holiday"] == 1]

holiday_performance_df = holiday_df.groupby("Origin")["Delayed"].mean().reset_index()
holiday_performance_df.head(10)

Unnamed: 0,Origin,Delayed
0,ABQ,0.333333
1,ACK,0.25
2,ALB,0.555556
3,ATL,0.526316
4,AUS,0.511111
5,AVL,0.25
6,BDL,0.403361
7,BNA,0.593023
8,BOS,0.452409
9,BQN,0.673077


In [30]:
#how extreme is the average delay
only_delayed_flights = df_jetblue[df_jetblue["Delayed"] == 1]
how_extreme_delays = only_delayed_flights.groupby("Origin")["ArrDelayMinutes"].mean().reset_index()

In [31]:
#merging all of the dataframes together
merge1 = pd.merge(jetblue_percent_airports[["Origin", "Percentage_airline_flights"]], airport_stats_df, how="left", on="Origin")

merge2 = pd.merge(merge1, fault_at_airports, on="Origin", how="left")

merge3 = pd.merge(merge2, holiday_performance_df, on="Origin", how="left")

merge4 = pd.merge(merge3, how_extreme_delays, on="Origin", how="left")

Final_merge = merge4[["Origin", "Percentage_airline_flights", "percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault"]]
Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]
Final_merge["HowExtremeDelay"] = merge4["ArrDelayMinutes"]
Final_merge["HolidayPercentDelayed"] = Final_merge["HolidayPercentDelayed"].fillna(0)
Final_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]


Unnamed: 0,Origin,Percentage_airline_flights,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay
0,ABQ,0.000781,0.660550,0.018349,0.170550,0.333333,48.013889
1,ACK,0.002916,0.350123,0.041769,0.135145,0.250000,61.807018
2,ALB,0.002429,0.452802,0.028024,0.239163,0.555556,67.276873
3,ATL,0.008298,0.552677,0.018998,0.429108,0.526316,56.485156
4,AUS,0.004439,0.489104,0.025020,0.360460,0.511111,51.886139
...,...,...,...,...,...,...,...
66,SMF,0.001652,0.572668,0.021692,0.305418,0.578947,55.295455
67,SRQ,0.003343,0.535906,0.041801,0.274008,0.694444,67.626000
68,STT,0.002949,0.425273,0.029162,0.223779,0.588235,58.102857
69,SYR,0.003816,0.445070,0.046948,0.331678,0.425000,59.559072


In [32]:
#Create a total score for each airport

#Scoring for all columns with for loop
columns_of_interest = ["percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault", "HolidayPercentDelayed", "HowExtremeDelay"]

for column in columns_of_interest:

    quantiles = Final_merget[[column]].quantile([0.25, 0.75])
    twenty_fifth = quantiles.loc[0.25][column]
    seventy_fifth = quantiles.loc[0.75][column]

    Final_merge[("ScoreFrom_" + column)] = 0

    for i in range(len(Final_merge["Origin"])):
        value = Final_merge[column][i]

        if value <= twenty_fifth:
            points = 1
        elif value > twenty_fifth and value <= seventy_fifth:
            points = 0.5
        elif value > seventy_fifth:
            points = 0.25

        Final_merge[("ScoreFrom_" + column)][i] = points

#Adding a modifer for the size of the airport

percent_flights_quantiles = Final_merge[["Percentage_airline_flights"]].quantile([0.25, 0.75])
twenty_fifth = percent_flights_quantiles.loc[0.25]["Percentage_airline_flights"]
seventy_fifth = percent_flights_quantiles.loc[0.75]["Percentage_airline_flights"]

Final_merge["ScoreFrom_Size"] = 0

for i in range(len(Final_merge["Origin"])):
    value = Final_merge["Percentage_airline_flights"][i]

    if value >= seventy_fifth:
            points = .5
    elif value < seventy_fifth and value > twenty_fifth:
            points = 0.25
    elif value <= twenty_fifth:
            points = 0

    Final_merge["ScoreFrom_Size"][i] = points

Final_merge["TotalScore"] = Final_merge[["ScoreFrom_percent_delayed", "ScoreFrom_percent_diverted_cancelled","ScoreFrom_PercentAirlineFault",
                                         "ScoreFrom_HolidayPercentDelayed", "ScoreFrom_HowExtremeDelay", "ScoreFrom_Size"]].sum(axis=1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  Final_merge[("ScoreFrom_" + column)][i] = points
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge[("

In [33]:
#Bringing in latitude and longitude for the airports
from airportsdata import load

airports = load('IATA')

iata_codes = list(Final_merge["Origin"])

data = []

for code in iata_codes:
    info = airports.get(code)
    if info:
        data.append({
            "IATA" : code,
            "Name" : info["name"],
            "State" : info["subd"],
            "Latitude" : info["lat"],
            "Longitude" : info["lon"]
        })

    else:
        data.append({
            "IATA" : code,
            "Name" : "Not found",
            "State" : "",
            "Latitude" : None,
            "Longitude" : None
        })

airports_geog_info_df = pd.DataFrame(data)

print(airports_geog_info_df)

   IATA                                               Name           State  \
0   ABQ          Albuquerque International Sunport Airport      New Mexico   
1   ACK                         Nantucket Memorial Airport   Massachusetts   
2   ALB                       Albany International Airport        New York   
3   ATL  Hartsfield - Jackson Atlanta International Air...         Georgia   
4   AUS             Austin-Bergstrom International Airport           Texas   
..  ...                                                ...             ...   
66  SMF                   Sacramento International Airport      California   
67  SRQ           Sarasota/Bradenton International Airport         Florida   
68  STT                               Cyril E King Airport  Virgin Islands   
69  SYR             Syracuse Hancock International Airport        New York   
70  TPA                        Tampa International Airport         Florida   

    Latitude  Longitude  
0   35.03893 -106.60826  
1   41.3000

In [34]:
#Merge airport geographic information with airport information

merge_locations = pd.merge(Final_merge, airports_geog_info_df, how="left", left_on="Origin", right_on="IATA")

merge_locations.to_csv("Jetblue_AirportScores_Locations.csv", index=False)

In [35]:
#Find the 5 airports above the fiftieth percentile with the worst score
#creating a region column in merge_locations
def state_to_region(state):
    West = ["Washington", "Montana", "Idaho", "Wyoming", "Oregon", "California", "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]
    South = ["Texas", "Oklahoma", "Arkansas", "Louisiana", "Mississippi", "Alabama", "Tennessee", "Kentucky", "Florida", "Georgia", "South Carolina", "North Carolina", "Virginia", "Maryland", "Delaware"]
    Northeast = ["Pennsylvania", "New Jersey", "New York", "Connecticut", "Massachusetts", "Rhode Island", "Maine", "New Hampshire", "Vermont"]
    Midwest = ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", "Minnesota", "Wisconsin", "Illinois", "Indiana", "Ohio", "Michigan"]

    if state in West:
        return "West"
    elif state in South:
        return "South"
    elif state in Northeast:
        return "Northeast"
    elif state in Midwest:
        return "Midwest"
    
merge_locations["Region"] = merge_locations["State"].map(state_to_region)

fifty = merge_locations["Percentage_airline_flights"].median()
five_worst_jetblue = merge_locations[(merge_locations["Percentage_airline_flights"] > fifty) & (merge_locations["Region"] == "South")].sort_values("TotalScore", ascending=True).head(5)
five_worst_jetblue.to_csv("FiveWorstJetblue.csv", index=False)

#### Allegiant Analysis

In [36]:
df_allegiant = df_2[df_2["Operating_Airline"] == "G4"]
df_allegiant.shape

(68701, 42)

In [37]:
#where allegiant is flying - regions
allegiant_percent_regions = df_allegiant.groupby("Region")["Flights"].sum() / len(df_allegiant["Flights"])
allegiant_percent_regions * 100

Region
Midwest      16.908051
Northeast     6.307041
South        47.727107
West         28.973377
Name: Flights, dtype: float64

In [38]:
#Where frontier is flying - airports
allegiant_percent_airports = (pd.DataFrame(df_allegiant.groupby("Origin")["Flights"].sum() / len(df_allegiant["Flights"]))).reset_index()
allegiant_percent_airports.rename(columns={"Flights" : "Percentage_airline_flights"}, inplace=True)
allegiant_percent_airports

Unnamed: 0,Origin,Percentage_airline_flights
0,ABE,0.007001
1,ABQ,0.000175
2,ALB,0.002766
3,AMA,0.000451
4,ATW,0.007977
...,...,...
109,TUL,0.003624
110,TYS,0.018049
111,USA,0.003464
112,VPS,0.018588


In [39]:
#what percent of delays are the airlines fault
when_delayed_df = df_allegiant[df_allegiant["TotalDelay"] != 0][["NASDelay", "SecurityDelay", "CarrierDelay", 
                                                                   "WeatherDelay", "LateAircraftDelay", "Origin", "TotalDelay"]]

when_delayed_df = when_delayed_df.reset_index()

when_delayed_df["PercentAirlineFault"] = ((when_delayed_df["CarrierDelay"]) / when_delayed_df["TotalDelay"])

fault_at_airports = when_delayed_df.groupby("Origin")["PercentAirlineFault"].mean().reset_index()
fault_at_airports

Unnamed: 0,Origin,PercentAirlineFault
0,ABE,0.308392
1,ABQ,0.086527
2,ALB,0.135637
3,AMA,0.677327
4,ATW,0.222111
...,...,...
109,TUL,0.103735
110,TYS,0.201801
111,USA,0.166805
112,VPS,0.239689


In [40]:
#percent of flights delayed and flights delayed/cancelled
airport_stats_df = df_allegiant.groupby("Origin").agg(
    percent_delayed = ("Delayed", "mean"),
    percent_diverted_cancelled = ("CancOrDiv", "mean")).reset_index() 

airport_stats_df

Unnamed: 0,Origin,percent_delayed,percent_diverted_cancelled
0,ABE,0.386694,0.029106
1,ABQ,0.666667,0.166667
2,ALB,0.510526,0.036842
3,AMA,0.612903,0.064516
4,ATW,0.536496,0.023723
...,...,...,...
109,TUL,0.481928,0.016064
110,TYS,0.400000,0.013710
111,USA,0.605042,0.029412
112,VPS,0.454973,0.024276


In [41]:
#holiday performance at each airport
holiday_df = df_allegiant[df_allegiant["Is_Holiday"] == 1]

holiday_performance_df = holiday_df.groupby("Origin")["Delayed"].mean().reset_index()
holiday_performance_df.head(10)

Unnamed: 0,Origin,Delayed
0,ABE,0.45
1,ALB,0.333333
2,AMA,0.0
3,ATW,0.607143
4,AUS,0.421875
5,AVL,0.373333
6,AZA,0.363636
7,BIL,0.6
8,BIS,0.571429
9,BLI,0.133333


In [42]:
#how extreme is the average delay
only_delayed_flights = df_allegiant[df_allegiant["Delayed"] == 1]
how_extreme_delays = only_delayed_flights.groupby("Origin")["ArrDelayMinutes"].mean().reset_index()

In [43]:
#merging all of the dataframes together
merge1 = pd.merge(allegiant_percent_airports[["Origin", "Percentage_airline_flights"]], airport_stats_df, how="left", on="Origin")

merge2 = pd.merge(merge1, fault_at_airports, on="Origin", how="left")

merge3 = pd.merge(merge2, holiday_performance_df, on="Origin", how="left")

merge4 = pd.merge(merge3, how_extreme_delays, on="Origin", how="left")

Final_merge = merge4[["Origin", "Percentage_airline_flights", "percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault"]]
Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]
Final_merge["HowExtremeDelay"] = merge4["ArrDelayMinutes"]
Final_merge["HolidayPercentDelayed"] = Final_merge["HolidayPercentDelayed"].fillna(0)
Final_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]


Unnamed: 0,Origin,Percentage_airline_flights,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay
0,ABE,0.007001,0.386694,0.029106,0.308392,0.450000,43.365591
1,ABQ,0.000175,0.666667,0.166667,0.086527,0.000000,61.250000
2,ALB,0.002766,0.510526,0.036842,0.135637,0.333333,41.082474
3,AMA,0.000451,0.612903,0.064516,0.677327,0.000000,57.526316
4,ATW,0.007977,0.536496,0.023723,0.222111,0.607143,45.357143
...,...,...,...,...,...,...,...
109,TUL,0.003624,0.481928,0.016064,0.103735,0.428571,49.258333
110,TYS,0.018049,0.400000,0.013710,0.201801,0.323077,40.512097
111,USA,0.003464,0.605042,0.029412,0.166805,0.357143,60.118056
112,VPS,0.018588,0.454973,0.024276,0.239689,0.280702,46.729776


In [44]:
#Create a total score for each airport

#Scoring for all columns with for loop
columns_of_interest = ["percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault", "HolidayPercentDelayed", "HowExtremeDelay"]

for column in columns_of_interest:

    quantiles = Final_merget[[column]].quantile([0.25, 0.75])
    twenty_fifth = quantiles.loc[0.25][column]
    seventy_fifth = quantiles.loc[0.75][column]

    Final_merge[("ScoreFrom_" + column)] = 0

    for i in range(len(Final_merge["Origin"])):
        value = Final_merge[column][i]

        if value <= twenty_fifth:
            points = 1
        elif value > twenty_fifth and value <= seventy_fifth:
            points = 0.5
        elif value > seventy_fifth:
            points = 0.25

        Final_merge[("ScoreFrom_" + column)][i] = points

#Adding a modifer for the size of the airport

percent_flights_quantiles = Final_merge[["Percentage_airline_flights"]].quantile([0.25, 0.75])
twenty_fifth = percent_flights_quantiles.loc[0.25]["Percentage_airline_flights"]
seventy_fifth = percent_flights_quantiles.loc[0.75]["Percentage_airline_flights"]

Final_merge["ScoreFrom_Size"] = 0

for i in range(len(Final_merge["Origin"])):
    value = Final_merge["Percentage_airline_flights"][i]

    if value >= seventy_fifth:
            points = .5
    elif value < seventy_fifth and value > twenty_fifth:
            points = 0.25
    elif value <= twenty_fifth:
            points = 0

    Final_merge["ScoreFrom_Size"][i] = points

Final_merge["TotalScore"] = Final_merge[["ScoreFrom_percent_delayed", "ScoreFrom_percent_diverted_cancelled","ScoreFrom_PercentAirlineFault",
                                         "ScoreFrom_HolidayPercentDelayed", "ScoreFrom_HowExtremeDelay", "ScoreFrom_Size"]].sum(axis=1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  Final_merge[("ScoreFrom_" + column)][i] = points
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge[("

In [45]:
#Bringing in latitude and longitude for the airports
from airportsdata import load

airports = load('IATA')

iata_codes = list(Final_merge["Origin"])

data = []

for code in iata_codes:
    info = airports.get(code)
    if info:
        data.append({
            "IATA" : code,
            "Name" : info["name"],
            "State" : info["subd"],
            "Latitude" : info["lat"],
            "Longitude" : info["lon"]
        })

    else:
        data.append({
            "IATA" : code,
            "Name" : "Not found",
            "State" : "",
            "Latitude" : None,
            "Longitude" : None
        })

airports_geog_info_df = pd.DataFrame(data)

print(airports_geog_info_df)

    IATA                                         Name           State  \
0    ABE          Lehigh Valley International Airport    Pennsylvania   
1    ABQ    Albuquerque International Sunport Airport      New Mexico   
2    ALB                 Albany International Airport        New York   
3    AMA  Rick Husband Amarillo International Airport           Texas   
4    ATW               Appleton International Airport       Wisconsin   
..   ...                                          ...             ...   
109  TUL                  Tulsa International Airport        Oklahoma   
110  TYS                        Mc Ghee Tyson Airport       Tennessee   
111  USA             Concord-Padgett Regional Airport  North Carolina   
112  VPS     Eglin Afb/Destin-Ft Walton Beach Airport         Florida   
113  XNA               Northwest Arkansas Ntl Airport        Arkansas   

     Latitude  Longitude  
0    40.65236  -75.44041  
1    35.03893 -106.60826  
2    42.74912  -73.80198  
3    35.21936 -

In [46]:
#Merge airport geographic information with airport information

merge_locations = pd.merge(Final_merge, airports_geog_info_df, how="left", left_on="Origin", right_on="IATA")

merge_locations.to_csv("Allegiant_AirportScores_Locations.csv", index=False)

In [47]:
#Find the 5 airports above the fiftieth percentile with the worst score
#creating a region column in merge_locations
def state_to_region(state):
    West = ["Washington", "Montana", "Idaho", "Wyoming", "Oregon", "California", "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]
    South = ["Texas", "Oklahoma", "Arkansas", "Louisiana", "Mississippi", "Alabama", "Tennessee", "Kentucky", "Florida", "Georgia", "South Carolina", "North Carolina", "Virginia", "Maryland", "Delaware"]
    Northeast = ["Pennsylvania", "New Jersey", "New York", "Connecticut", "Massachusetts", "Rhode Island", "Maine", "New Hampshire", "Vermont"]
    Midwest = ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", "Minnesota", "Wisconsin", "Illinois", "Indiana", "Ohio", "Michigan"]

    if state in West:
        return "West"
    elif state in South:
        return "South"
    elif state in Northeast:
        return "Northeast"
    elif state in Midwest:
        return "Midwest"
    
merge_locations["Region"] = merge_locations["State"].map(state_to_region)

fifty = merge_locations["Percentage_airline_flights"].median()
five_worst_allegiant = merge_locations[(merge_locations["Percentage_airline_flights"] > fifty) & (merge_locations["Region"] == "Midwest")].sort_values("TotalScore", ascending=True).head(5)
five_worst_allegiant.to_csv("FiveWorstAllegiant.csv", index=False)

#### Frontier analysis

In [48]:
df_frontier = df_2[df_2["Operating_Airline"] == "F9"]
df_frontier.shape

(159681, 42)

In [49]:
#where frontier is flying - regions
frontier_percent_regions = df_frontier.groupby("Region")["Flights"].sum() / len(df_frontier["Flights"])
frontier_percent_regions * 100

Region
Midwest       9.738165
Northeast    12.363400
South        39.972195
West         35.463205
Name: Flights, dtype: float64

In [50]:
#Where frontier is flying - airports
frontier_percent_airports = (pd.DataFrame(df_frontier.groupby("Origin")["Flights"].sum() / len(df_frontier["Flights"]))).reset_index()
frontier_percent_airports.rename(columns={"Flights" : "Percentage_airline_flights"}, inplace=True)
frontier_percent_airports

Unnamed: 0,Origin,Percentage_airline_flights
0,ABQ,0.000495
1,ALB,0.001309
2,ATL,0.053569
3,AUS,0.002411
4,BDL,0.008580
...,...,...
92,SYR,0.002749
93,TPA,0.031638
94,TTN,0.013038
95,TYS,0.001490


In [51]:
#what percent of delays are the airlines fault
when_delayed_df = df_frontier[df_frontier["TotalDelay"] != 0][["NASDelay", "SecurityDelay", "CarrierDelay", 
                                                                   "WeatherDelay", "LateAircraftDelay", "Origin", "TotalDelay"]]

when_delayed_df = when_delayed_df.reset_index()

when_delayed_df["PercentAirlineFault"] = ((when_delayed_df["CarrierDelay"]) / when_delayed_df["TotalDelay"])

fault_at_airports = when_delayed_df.groupby("Origin")["PercentAirlineFault"].mean().reset_index()
fault_at_airports

Unnamed: 0,Origin,PercentAirlineFault
0,ABQ,0.179281
1,ALB,0.064400
2,ATL,0.429877
3,AUS,0.166689
4,BDL,0.221520
...,...,...
92,SYR,0.150750
93,TPA,0.343447
94,TTN,0.149729
95,TYS,0.117750


In [52]:
#percent of flights delayed and flights delayed/cancelled
airport_stats_df = df_frontier.groupby("Origin").agg(
    percent_delayed = ("Delayed", "mean"),
    percent_diverted_cancelled = ("CancOrDiv", "mean")).reset_index() 

airport_stats_df

Unnamed: 0,Origin,percent_delayed,percent_diverted_cancelled
0,ABQ,0.658228,0.025316
1,ALB,0.387560,0.052632
2,ATL,0.550736,0.024901
3,AUS,0.449351,0.018182
4,BDL,0.583942,0.020438
...,...,...,...
92,SYR,0.435080,0.036446
93,TPA,0.423793,0.029493
94,TTN,0.398655,0.048031
95,TYS,0.386555,0.012605


In [53]:
#holiday performance at each airport
holiday_df = df_frontier[df_frontier["Is_Holiday"] == 1]

holiday_performance_df = holiday_df.groupby("Origin")["Delayed"].mean().reset_index()
holiday_performance_df.head(10)

Unnamed: 0,Origin,Delayed
0,ABQ,0.75
1,ALB,0.333333
2,ATL,0.547278
3,AUS,0.666667
4,BDL,0.473684
5,BIS,0.0
6,BKG,0.0
7,BMI,0.222222
8,BNA,0.44186
9,BOS,0.52381


In [54]:
#how extreme is the average delay
only_delayed_flights = df_frontier[df_frontier["Delayed"] == 1]
how_extreme_delays = only_delayed_flights.groupby("Origin")["ArrDelayMinutes"].mean().reset_index()

In [55]:
#merging all of the dataframes together
merge1 = pd.merge(frontier_percent_airports[["Origin", "Percentage_airline_flights"]], airport_stats_df, how="left", on="Origin")

merge2 = pd.merge(merge1, fault_at_airports, on="Origin", how="left")

merge3 = pd.merge(merge2, holiday_performance_df, on="Origin", how="left")

merge4 = pd.merge(merge3, how_extreme_delays, on="Origin", how="left")

Final_merge = merge4[["Origin", "Percentage_airline_flights", "percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault"]]
Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]
Final_merge["HowExtremeDelay"] = merge4["ArrDelayMinutes"]
Final_merge["HolidayPercentDelayed"] = Final_merge["HolidayPercentDelayed"].fillna(0)
Final_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge["HolidayPercentDelayed"] = merge4["Delayed"]


Unnamed: 0,Origin,Percentage_airline_flights,percent_delayed,percent_diverted_cancelled,PercentAirlineFault,HolidayPercentDelayed,HowExtremeDelay
0,ABQ,0.000495,0.658228,0.025316,0.179281,0.750000,48.596154
1,ALB,0.001309,0.387560,0.052632,0.064400,0.333333,62.950617
2,ATL,0.053569,0.550736,0.024901,0.429877,0.547278,55.981745
3,AUS,0.002411,0.449351,0.018182,0.166689,0.666667,49.508671
4,BDL,0.008580,0.583942,0.020438,0.221520,0.473684,68.861250
...,...,...,...,...,...,...,...
92,SYR,0.002749,0.435080,0.036446,0.150750,0.571429,58.303665
93,TPA,0.031638,0.423793,0.029493,0.343447,0.445596,44.855208
94,TTN,0.013038,0.398655,0.048031,0.149729,0.369231,53.745783
95,TYS,0.001490,0.386555,0.012605,0.117750,0.538462,47.619565


In [56]:
#Create a total score for each airport

#Scoring for all columns with for loop
columns_of_interest = ["percent_delayed", "percent_diverted_cancelled", "PercentAirlineFault", "HolidayPercentDelayed", "HowExtremeDelay"]

for column in columns_of_interest:

    quantiles = Final_merget[[column]].quantile([0.25, 0.75])
    twenty_fifth = quantiles.loc[0.25][column]
    seventy_fifth = quantiles.loc[0.75][column]

    Final_merge[("ScoreFrom_" + column)] = 0

    for i in range(len(Final_merge["Origin"])):
        value = Final_merge[column][i]

        if value <= twenty_fifth:
            points = 1
        elif value > twenty_fifth and value <= seventy_fifth:
            points = 0.5
        elif value > seventy_fifth:
            points = 0.25

        Final_merge[("ScoreFrom_" + column)][i] = points

#Adding a modifer for the size of the airport

percent_flights_quantiles = Final_merge[["Percentage_airline_flights"]].quantile([0.25, 0.75])
twenty_fifth = percent_flights_quantiles.loc[0.25]["Percentage_airline_flights"]
seventy_fifth = percent_flights_quantiles.loc[0.75]["Percentage_airline_flights"]

Final_merge["ScoreFrom_Size"] = 0

for i in range(len(Final_merge["Origin"])):
    value = Final_merge["Percentage_airline_flights"][i]

    if value >= seventy_fifth:
            points = .5
    elif value < seventy_fifth and value > twenty_fifth:
            points = 0.25
    elif value <= twenty_fifth:
            points = 0

    Final_merge["ScoreFrom_Size"][i] = points

Final_merge["TotalScore"] = Final_merge[["ScoreFrom_percent_delayed", "ScoreFrom_percent_diverted_cancelled","ScoreFrom_PercentAirlineFault",
                                         "ScoreFrom_HolidayPercentDelayed", "ScoreFrom_HowExtremeDelay", "ScoreFrom_Size"]].sum(axis=1)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  Final_merge[("ScoreFrom_" + column)][i] = points
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_merge[("

In [57]:
#Bringing in latitude and longitude for the airports
from airportsdata import load

airports = load('IATA')

iata_codes = list(Final_merge["Origin"])

data = []

for code in iata_codes:
    info = airports.get(code)
    if info:
        data.append({
            "IATA" : code,
            "Name" : info["name"],
            "State" : info["subd"],
            "Latitude" : info["lat"],
            "Longitude" : info["lon"]
        })

    else:
        data.append({
            "IATA" : code,
            "Name" : "Not found",
            "State" : "",
            "Latitude" : None,
            "Longitude" : None
        })

airports_geog_info_df = pd.DataFrame(data)

print(airports_geog_info_df)

   IATA                                               Name        State  \
0   ABQ          Albuquerque International Sunport Airport   New Mexico   
1   ALB                       Albany International Airport     New York   
2   ATL  Hartsfield - Jackson Atlanta International Air...      Georgia   
3   AUS             Austin-Bergstrom International Airport        Texas   
4   BDL                      Bradley International Airport  Connecticut   
..  ...                                                ...          ...   
92  SYR             Syracuse Hancock International Airport     New York   
93  TPA                        Tampa International Airport      Florida   
94  TTN                             Trenton Mercer Airport   New Jersey   
95  TYS                              Mc Ghee Tyson Airport    Tennessee   
96  XNA                     Northwest Arkansas Ntl Airport     Arkansas   

    Latitude  Longitude  
0   35.03893 -106.60826  
1   42.74912  -73.80198  
2   33.63670  -84.427

In [58]:
#Merge airport geographic information with airport information

merge_locations = pd.merge(Final_merge, airports_geog_info_df, how="left", left_on="Origin", right_on="IATA")

merge_locations.to_csv("Frontier_AirportScores_Locations.csv", index=False)

In [59]:
#Find the 5 airports above the fiftieth percentile with the worst score
#creating a region column in merge_locations
def state_to_region(state):
    West = ["Washington", "Montana", "Idaho", "Wyoming", "Oregon", "California", "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Hawaii"]
    South = ["Texas", "Oklahoma", "Arkansas", "Louisiana", "Mississippi", "Alabama", "Tennessee", "Kentucky", "Florida", "Georgia", "South Carolina", "North Carolina", "Virginia", "Maryland", "Delaware"]
    Northeast = ["Pennsylvania", "New Jersey", "New York", "Connecticut", "Massachusetts", "Rhode Island", "Maine", "New Hampshire", "Vermont"]
    Midwest = ["North Dakota", "South Dakota", "Nebraska", "Kansas", "Missouri", "Iowa", "Minnesota", "Wisconsin", "Illinois", "Indiana", "Ohio", "Michigan"]

    if state in West:
        return "West"
    elif state in South:
        return "South"
    elif state in Northeast:
        return "Northeast"
    elif state in Midwest:
        return "Midwest"
    
merge_locations["Region"] = merge_locations["State"].map(state_to_region)

fifty = merge_locations["Percentage_airline_flights"].median()
five_worst_frontier = merge_locations[(merge_locations["Percentage_airline_flights"] > fifty) & (merge_locations["Region"] == "South")].sort_values("TotalScore", ascending=True).head(5)
five_worst_frontier.to_csv("FiveWorstFrontier.csv", index=False)