In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import sys, os, pathlib, shutil, platform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX

from prophet import Prophet

from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

from prophet.plot import plot_plotly, plot_components_plotly
from prophet.plot import plot_cross_validation_metric

# import plotly.graph_objs as go
import plotly.express as px

from dateutil.relativedelta import relativedelta
import humanize
from datetime import date, datetime, timedelta


In [None]:
%matplotlib inline 
%load_ext autoreload
%autoreload 2
 
plt.rcParams['figure.figsize']=(20,10)

In [None]:
## Replacement for Prophet's make_future_dataframe, which does not handle monthly-only data well.
# To compare against the Prophet version: fc_model.make_future_dataframe(periods=10, freq='m')
def my_make_future_dataframe(df, periods):
    """Return ``df`` extended with ``periods`` extra rows spaced one calendar month apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``ds`` column. The frame is not modified.
    periods : int
        Number of future rows to add after the latest ``ds`` value.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the new ones, with a fresh 0..n-1 index.
        Only ``ds`` is filled on the new rows; other columns are NaN there.
        When the column sets differ, the columns come back sorted by name.
    """
    last_date = df['ds'].max()
    # Each step is measured from last_date (not chained), so a month-end start
    # such as Jan 31 yields Feb 29 / Mar 31 rather than drifting to the 28th/29th.
    future_dates = [last_date + pd.DateOffset(months=step) for step in range(1, periods + 1)]
    future_rows = pd.DataFrame({'ds': pd.to_datetime(future_dates)})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([df, future_rows], ignore_index=True, sort=True)

def forecast_future(future_samples_count, df, growth = 'linear'):
    """Fit Prophet on all of ``df`` and predict over the history plus future months.

    ``future_samples_count`` is the number of monthly rows appended by
    ``my_make_future_dataframe``; ``growth`` is passed straight to ``Prophet``.
    Returns ``(forecast, model)``.
    """
    fitted = Prophet(growth=growth)
    fitted.fit(df)

    horizon_frame = my_make_future_dataframe(df, periods=future_samples_count)
    return fitted.predict(horizon_frame), fitted

def forecast_in_sample(hold_out_samples_count, df, growth = 'linear'):
    """Fit Prophet on all but the last rows of ``df`` and predict every ``ds`` in ``df``.

    Parameters
    ----------
    hold_out_samples_count : int
        Number of trailing rows withheld from training (0 keeps every row).
    df : pandas.DataFrame
        Frame with the ``ds`` / ``y`` columns Prophet is fitted on.
    growth : str
        Passed to ``Prophet(growth=...)``.

    Returns
    -------
    tuple
        ``(forecast, model)``; the forecast covers the held-out dates as well.

    Raises
    ------
    ValueError
        If ``hold_out_samples_count`` is negative.
    """
    if hold_out_samples_count < 0:
        raise ValueError("hold_out_samples_count must be >= 0")

    # Slice by position: the previous df.drop(df.index[-n:]) selected *every*
    # label when n == 0 and therefore left an empty training set.
    keep_count = max(len(df) - hold_out_samples_count, 0)
    train_data = df.iloc[:keep_count]
    print(train_data.head(4), train_data.tail(4))
    print(train_data.shape)

    model = Prophet(growth=growth)
    model.fit(train_data)

    future = df[['ds']].reset_index()                         # predicts for all ds values
    forecast = model.predict(future)
    return forecast, model

def forecasted_percentiles(fc_model, input_df, percentiles):
    """Summarise a model's predictive samples as per-row percentiles.

    Parameters
    ----------
    fc_model : object
        Anything exposing ``predictive_samples(df)`` that returns a mapping whose
        ``'yhat'`` entry is a 2-D array with one row per row of ``input_df``.
    input_df : pandas.DataFrame
        Must contain ``ds`` and ``yhat`` columns.
    percentiles : sequence of float
        Percentiles (0-100) to compute across the samples of each row.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds``, ``Predicted`` (copied from ``input_df['yhat']``) and one
        ``pct_<p>`` column per requested percentile, on a 0..n-1 index.
    """
    forecasted_samples = fc_model.predictive_samples(input_df)
    # np.percentile(..., axis=1) returns (n_percentiles, n_rows); transpose to one row per date.
    per_row_bounds = np.transpose(np.percentile(forecasted_samples['yhat'], percentiles, axis=1))
    forecasted_stats = pd.DataFrame(data=per_row_bounds,
                                    columns=['pct_' + str(x) for x in percentiles])
    # Insert raw values, not Series: a Series is aligned on its index, so an
    # input_df that is not indexed 0..n-1 (e.g. a filtered frame) produced NaNs.
    forecasted_stats.insert(loc=0, column='Predicted', value=input_df['yhat'].to_numpy())
    forecasted_stats.insert(loc=0, column='ds', value=input_df['ds'].to_numpy())
    return forecasted_stats

In [None]:
# preprocessed_data_s1['ds'] = preprocessed_data_s1['ds'].map(str) +"-01" + "-01"
# preprocessed_data_s1['ds'] = pd.to_datetime(preprocessed_data_s1['ds'],format='%Y-%m-%d')
# preprocessed_data_s1
# preprocessed_data_s1.dtypes
# model = Prophet(growth='linear')
# model.fit(preprocessed_data_s1)

# future_ds = my_make_future_dataframe(preprocessed_data_s1, periods=120)

# forecasted_df = model.predict(future_ds)
# forecasted_df
# # In the 01 and 02 notebooks only the day is made up (a full date is required); the month and year are real because the data are monthly aggregations.
# # Here (2 cells above, where we change the data type to datetime) both the month and the day are made up; only the year is real because the data
# # are yearly aggregations.

In [None]:
# Every state name used by this notebook, in alphabetical order.
all_states = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Delaware',
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina',
    'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
    'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

# use these states for regional nonjet origin these states dest anywhere
# my_states = ['Alabama', 'Alaska', 'Arizona', 'California', 'Colorado', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 
#           'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Michigan', 'Minnesota', 'Montana', 
#           'Nebraska', 'Nevada', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oregon', 'Pennsylvania',
#              'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# use these states for regional nonjetandjets origin these states dest anywhere; for regional jets origin these states dest anywhere
# (this is all_states with Delaware left out)
my_states = [state_name for state_name in all_states if state_name != 'Delaware']


# Common prefix of the per-year aggregate CSV files; year and flags are appended to it.
data_path = './../../../data/paav_cargo/agg_data/freight_aggregation__'

# Years 2004 through 2023 inclusive.
years_list_a4 = list(range(2004, 2024))

#abs_yhat is the raw predicted cargo val 10 yrs from now, whereas crt_abs_y is the last recorded raw cargo value (the current y!)

def _load_yearly_cargo_totals(csv_paths, years):
    """Read one CSV per entry of ``years`` and return yearly ``y`` totals per (year, origin, dest)."""
    stacked = pd.concat([pd.read_csv(str(crt_file_name)) for crt_file_name in csv_paths],
                        keys=years).reset_index()
    # level_0 carries the concat key (the year); level_1 is the row number inside each file.
    stacked = stacked.drop(columns=['level_1']).rename(columns={'level_0': 'year'})
    totals = stacked.groupby(['year', 'origin', 'dest'], as_index=False)['y'].sum()
    # faking date: only the year is real; month and day are made up because a full
    # yr-mo-day date is required downstream.
    totals['ds'] = pd.to_datetime(totals['year'].map(str) + "-01" + "-01", format='%Y-%m-%d')
    return totals


def _long_horizon_mape(history):
    """Cross-validate a default Prophet model on ``history`` and return the last-row MAPE."""
    cv_model = Prophet()
    # The 2023-01-01 row is excluded from the cross-validated fit.
    cv_model.fit(history[history['ds'] != pd.Timestamp(2023, 1, 1)])
    initial_time = f'{365*5} days'

    cv_results = cross_validation(cv_model, initial=initial_time, period='365 days',
                                  horizon='3650 days', parallel="processes")
    cv_results = cv_results.rename(columns={'cutoff': 'real_cutoff'})
    # Replace the cutoff so each row's horizon is a whole multiple of 365 days.
    # NOTE(review): (row position % 10) + 1 assumes every fold holds exactly 10 rows - confirm.
    years_ahead = (cv_results.index.to_numpy() % 10) + 1
    cv_results['cutoff'] = cv_results['ds'] - pd.to_timedelta(
        pd.Series(365 * years_ahead, index=cv_results.index), unit='D')
    metrics = performance_metrics(cv_results.drop(columns=['real_cutoff']))
    return metrics['mape'].iloc[-1]


def geoplot_cols(future_periods_in_months, future_periods_in_days, your_years_list, origin_data = True, regional_data = True, aircraft_type = "jets"):
    """Forecast cargo per state and append one summary row per state to a CSV.

    For every name in the module-level ``my_states`` a linear-growth Prophet model
    is fitted on that state's yearly totals, extended ``future_periods_in_months``
    monthly steps ahead, and the final forecast row is summarised.

    Parameters
    ----------
    future_periods_in_months : int
        Number of monthly rows to forecast past the last observed date.
    future_periods_in_days : int
        Currently unused; the cross-validation horizon is fixed at '3650 days'.
    your_years_list : list of int
        Years to load; one CSV is read per year (paths built from ``data_path``).
    origin_data : bool
        True -> analysis 4, states matched on ``origin``; False -> analysis 5,
        states matched on ``dest``.
    regional_data : bool
        True -> 'regional' files, False -> 'alldistance' files.
    aircraft_type : str
        One of 'nonjets', 'jetsandnonjets', 'jets'.

    Output columns: origin, dest, pct_inc (percent change of the final forecast
    versus the last observed value), abs_yhat (final forecast), crt_abs_y (last
    observed value), mape. Returns None.

    Raises
    ------
    ValueError
        If ``aircraft_type`` is not one of the supported values.
    """
    if aircraft_type not in ('nonjets', 'jetsandnonjets', 'jets'):
        raise ValueError("aircraft_type must be 'nonjets', 'jetsandnonjets' or 'jets', got " + repr(aircraft_type))
    aircraft_flag = aircraft_type

    if origin_data:
        analysis_type, origin_flag = 4, 'origin'
    else:
        analysis_type, origin_flag = 5, 'dest'
    regional_flag = 'regional' if regional_data else 'alldistance'

    #concat
    full_path_list = [data_path+str(crt_year) + '_analysis'+ str(analysis_type) +'_' + regional_flag +'_'+ aircraft_flag + '.csv'
                      for crt_year in your_years_list]
    print(full_path_list[0])
    all_datasets = _load_yearly_cargo_totals(full_path_list, your_years_list)

    # Collect plain dicts and build the frame once; DataFrame.append was removed in pandas 2.0.
    summary_rows = []
    for crt_state in my_states:
        #percent increase calculation
        crt_df = all_datasets.loc[all_datasets[origin_flag] == crt_state, ['y', 'ds']]
        model = Prophet(growth='linear')
        model.fit(crt_df)
        crt_future_ds = my_make_future_dataframe(crt_df, periods=future_periods_in_months)
        crt_forecasted_df = model.predict(crt_future_ds)
        crt_last_cargo_value = crt_df.loc[crt_df['ds'] == crt_df['ds'].max(), 'y'].iloc[0]
        # A zero last value makes this ratio infinite; an earlier workaround walked
        # back to the most recent non-zero year for states with zero yearly totals.
        crt_forecasted_df["percent increase"] = (crt_forecasted_df["yhat"]/crt_last_cargo_value -1)*100
        crt_last_date = crt_forecasted_df['ds'].max()
        crt_final_row = crt_forecasted_df[crt_forecasted_df['ds'] == crt_last_date].iloc[0]
        crt_pct_inc = crt_final_row['percent increase']
        print("The pct inc predicted 10 yrs from now is " + str(crt_pct_inc) + " for " + crt_state)
        print("The current cargo val is " + str(crt_last_cargo_value) + " for " + crt_state)

        #abs cargo value calculation (same final forecast row as above)
        crt_abs_yhat = crt_final_row['yhat']
        print("The abs cargo val predicted 10 yrs from now is " + str(crt_abs_yhat) + " for " + crt_state)

        #mape calculation
        crt_mape = _long_horizon_mape(crt_df)
        print("The MAPE for 10 yrs from now is " + str(crt_mape) + " for " + crt_state)

        if origin_data:
            origin_label, dest_label = crt_state, 'all'
        else:
            origin_label, dest_label = 'all', crt_state
        summary_rows.append({'origin': origin_label, 'dest': dest_label, 'pct_inc': crt_pct_inc,
                             'abs_yhat': crt_abs_yhat, "crt_abs_y": crt_last_cargo_value, "mape": crt_mape})

    geoplot_df = pd.DataFrame(summary_rows,
                              columns=['origin', 'dest', 'pct_inc', 'abs_yhat', 'crt_abs_y', 'mape'])
    # NOTE(review): mode='a' with header=True writes a second header row if the file
    # already exists from an earlier run - confirm that appending is intended.
    geoplot_df.to_csv('./../../../data/paav_cargo/geoplot_exercise_data/geoplot_data_'+ origin_flag + '_analysis_'+ str(analysis_type) + "_" + regional_flag +'_'+ aircraft_flag+'.csv',
                          mode='a', index=False, header=True)
    # return crt_performance_metrics_results

In [None]:
# Origin-state run on regional jet files: 120 monthly steps ahead over years_list_a4.
geoplot_cols(
    future_periods_in_months=120,
    future_periods_in_days=3650,
    your_years_list=years_list_a4,
    origin_data=True,
    regional_data=True,
    aircraft_type="jets",
)
# geoplot_cols(future_periods_in_months=120,future_periods_in_days=3650, your_years_list=years_list_a4, origin_data = True, regional_data = True, aircraft_type = "jets")

In [None]:
# Scratch copy of the loading step: analysis 4, regional, jets, years 2004-2023.
analysis_type_list = [4]
years_list_a4 = [yr for yr in range (2004,2024)]

full_path_list = [ './../../../data/paav_cargo/agg_data/freight_aggregation__' +str(crt_year) + '_analysis'+ str(analysis_type) +'_' +
                  'regional'+'_'+'jets' + '.csv' 
                      for crt_year in years_list_a4 
                      for analysis_type in analysis_type_list]
print(full_path_list[0])
all_datasets= pd.concat([pd.read_csv(str(crt_file_name)) for crt_file_name in full_path_list], keys=years_list_a4).reset_index()
# yearly agg
all_datasets = all_datasets.drop(columns=['level_1']).rename(columns = {'level_0':'year'})
# Select the columns with a list: the bare 'y', 'num flights' tuple form is rejected by pandas >= 2.0.
all_datasets = all_datasets.groupby(['year', 'origin', 'dest'], as_index=True, group_keys=True)[['y', 'num flights']].agg(['sum','count'])
all_datasets = all_datasets.reset_index()
# Flatten the two-level column names, e.g. ('y', 'sum') -> 'y_sum', ('year', '') -> 'year_'.
all_datasets.columns= ['_'.join(col) for col in all_datasets.columns.values]
# 'num flights_count' is kept on purpose; it is inspected in the next cell.
all_datasets = all_datasets.drop(columns=['y_count','num flights_sum'])
all_datasets = all_datasets.rename(columns={'y_sum': 'y', 'ds_':'ds', 'year_':'year', 'origin_':'origin', 'dest_':'dest'})

# Only the year is real; month and day are fixed to 01-01 to get a full date.
all_datasets['ds'] = pd.to_datetime(all_datasets['year'].map(str) +"-01" + "-01", format='%Y-%m-%d')
all_datasets

In [None]:
# Distinct values of the per-(year, origin, dest) row count produced by the 'count' aggregation.
all_datasets['num flights_count'].unique()

In [None]:
# Step-by-step walkthrough for a single state: keep only rows whose origin is Tennessee.
crt_df = all_datasets[all_datasets['origin']=='Tennessee']
crt_df

In [None]:
# finding out which states contain any zeroes in their y summations over a year

# Kept under its own name so this scan does not overwrite the global `my_states`
# list that geoplot_cols iterates over.
states_to_scan = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 
          'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 
          'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 
          'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

for investigated_crt_state in states_to_scan:
    investigated_crt_state_df = all_datasets[all_datasets['origin']==investigated_crt_state]
    # Report each state once, however many of its yearly totals are zero.
    if (investigated_crt_state_df['y'] == 0).any():
        print("need to investigate "+ investigated_crt_state)

# Results recorded from earlier runs of the scan, one list per dataset variant.
list_of_states_to_investigate_origin_reg_nonjets = ['Arkansas', 'Connecticut', 'Delaware', 'Illinois', 'Massachusetts', 'Mississippi', 'Missouri', 'New Hampshire', 'Oklahoma', 'Rhode Island', 'Vermont', 'Virginia']
list_of_states_to_investigate_origin_reg_nonjetsandjets = ['Delaware']
list_of_states_to_investigate_origin_reg_jets = ['Delaware']

In [None]:
# Reduce to the two columns the model is fitted on below, then check their dtypes.
crt_df=crt_df[['y','ds']]
crt_df
crt_df.dtypes

In [None]:
#NEW: remove 2023
# A datetime Series exposes date parts through the .dt accessor; `.year` directly on the Series raises AttributeError.
crt_df = crt_df[crt_df['ds'].dt.year < 2023]

In [None]:
# Fit a linear-growth Prophet model on crt_df, then extend the dates by 120 monthly rows.
model = Prophet(growth='linear')
model.fit(crt_df)
crt_future_ds = my_make_future_dataframe(crt_df, periods=120)
crt_future_ds

In [None]:
# Predict for every date in the extended frame (history plus the added future rows).
crt_forecasted_df = model.predict(crt_future_ds)
crt_forecasted_df

In [None]:
# 'y' at the latest date in crt_df; used as the baseline for the percent increase below.
crt_last_cargo_value = (crt_df[crt_df['ds'] == crt_df['ds'].max()])['y'].tolist()[0]
crt_last_cargo_value

In [None]:
# Percent change of each forecast value relative to the last observed value.
crt_forecasted_df["percent increase"] = (crt_forecasted_df["yhat"]/crt_last_cargo_value -1)*100
crt_forecasted_df

In [None]:
# Latest forecast date, i.e. the end of the forecast horizon.
crt_last_date=crt_forecasted_df['ds'].max()
crt_last_date

In [None]:
# Keep only the forecast row(s) at the final date.
crt_focused_df = crt_forecasted_df[crt_forecasted_df['ds']==crt_last_date]
crt_focused_df

In [None]:
# crt_focused_df already holds only the final-date row(s), so take the first one
# through public indexing instead of the private DataFrame._get_value.
crt_pct_inc = crt_focused_df['percent increase'].iloc[0]
# The walkthrough frame was filtered to Tennessee above, so the message names that state.
print("The pct inc predicted 10 yrs from now is " + str(crt_pct_inc) + " for " + 'Tennessee')

In [None]:
# crt_focused_df already holds only the final-date row(s), so take the first one
# through public indexing instead of the private DataFrame._get_value.
crt_abs_yhat = crt_focused_df['yhat'].iloc[0]
# The walkthrough frame was filtered to Tennessee above, so the message names that state.
print("The abs cargo val predicted 10 yrs from now is " + str(crt_abs_yhat) + " for " + "Tennessee")

In [None]:
## mape
# initial_time = f'{365*5} days'

# crt_cross_validation_results = cross_validation(model, initial=initial_time, period='365 days', horizon = '3650 days', 
#                                                 parallel="processes")
# crt_cross_validation_results.rename(columns={'cutoff': 'real_cutoff'},inplace=True) 
# crt_cross_validation_results['cutoff'] = np.NaN                                     
# for index, row in crt_cross_validation_results.iterrows():
#     crt_cross_validation_results.at[index, 'cutoff'] = row['ds'] - timedelta(days=365*(((index)%10)+1))
# crt_performance_metrics_results = performance_metrics(crt_cross_validation_results.drop(['real_cutoff'], axis=1, inplace = False))
# crt_mape = (crt_performance_metrics_results[['mape']].iloc[-1])[0]
# print("The MAPE is " + str(crt_mape) + " for " + "Texas")

In [None]:
# Inspect the frame that the cross-validation model below is fitted on.
crt_df

In [None]:
m_s1 = Prophet()
# Fit without the 2023-01-01 row. pd.datetime was removed in pandas 2.0, so the
# date is built with pd.Timestamp and the row is excluded with a boolean mask.
m_s1.fit(crt_df[crt_df['ds'] != pd.Timestamp(2023, 1, 1)])
# Length of the initial training window passed to cross_validation below.
initial_time = f'{365*5} days'


In [None]:
# Cross-validate m_s1: initial window of initial_time, period of 365 days, 3650-day horizon, folds run in separate processes.
crt_cross_validation_results = cross_validation(m_s1, initial=initial_time, period='365 days', horizon = '3650 days', parallel="processes")

In [None]:
# Inspect the raw cross-validation output before the cutoff column is rewritten.
crt_cross_validation_results

In [None]:
# Keep the generated cutoff under a new name; a replacement 'cutoff' column is built in the following cells.
crt_cross_validation_results.rename(columns={'cutoff': 'real_cutoff'},inplace=True)
crt_cross_validation_results

In [None]:
# Placeholder for the recomputed cutoff. pd.NaT gives the column a datetime dtype
# (np.NaN no longer exists in NumPy 2.0, and a float column would need upcasting
# once timestamps are written into it).
crt_cross_validation_results['cutoff'] = pd.NaT
crt_cross_validation_results

In [None]:
# Rewrite the cutoff so each row's horizon (ds - cutoff) is a whole multiple of 365 days.
# NOTE(review): (row position % 10) + 1 assumes every fold holds exactly 10 rows - confirm.
crt_years_ahead = (crt_cross_validation_results.index.to_numpy() % 10) + 1
# One vectorised assignment replaces the per-row .at writes and yields a datetime column.
crt_cross_validation_results['cutoff'] = crt_cross_validation_results['ds'] - pd.to_timedelta(
    pd.Series(365 * crt_years_ahead, index=crt_cross_validation_results.index), unit='D')
crt_cross_validation_results

In [None]:
# Score the folds using the rewritten 'cutoff'; the original 'real_cutoff' column is dropped first.
crt_performance_metrics_results = performance_metrics(crt_cross_validation_results.drop(['real_cutoff'], axis=1, inplace = False))
crt_performance_metrics_results

In [None]:
# MAPE from the last row of the metrics table. Selecting the column first avoids
# the positional [0] lookup on a label-indexed Series, which pandas has deprecated.
crt_mape = crt_performance_metrics_results['mape'].iloc[-1]
# The walkthrough frame was filtered to Tennessee above, so the message names that state.
print("The MAPE is " + str(crt_mape) + " for " + "Tennessee")

In [None]:
def load_datasets_v2(data_path, years_list, analysis_type_list):
    """
    Concatenates the per-year CSV files into one dataframe and sums ``y`` per year.

    File names are built as
    ``<data_path><year>_analysis<type>_<regional_flag>_<non_jets_flag>.csv``.
    ``regional_flag`` and ``non_jets_flag`` are not parameters: they are read
    from module-level variables, which must be set before calling.

    Parameters:
    data_path -- common path prefix of the CSV files
    years_list -- years to load; also used as the concat keys that become ``year``
    analysis_type_list -- analysis ids used in the file names
        (NOTE(review): the concat keys are the years only, so this presumably
        expects a single analysis id, i.e. one file per year - confirm)

    Returns:
    Dataframe with columns year, origin, dest and y (the yearly sum of y)
    """
    #concat
    full_path_list = [data_path+str(crt_year) + '_analysis'+ str(analysis_type) +'_' + str(regional_flag)+'_'+str(non_jets_flag) + '.csv' 
                      for crt_year in years_list 
                      for analysis_type in analysis_type_list]
    print(full_path_list[0])
    all_datasets= pd.concat([pd.read_csv(str(crt_file_name)) for crt_file_name in full_path_list], keys=years_list).reset_index()

    # yearly agg
    # level_0 carries the concat key (the year); level_1 is the row number inside each file.
    all_datasets = all_datasets.drop(columns=['level_1']).rename(columns = {'level_0':'year'})
    # Only the yearly sum of 'y' survived the previous aggregate-then-drop sequence,
    # so it is computed directly. This also avoids selecting groupby columns with a
    # bare tuple, which pandas >= 2.0 rejects.
    all_datasets = all_datasets.groupby(['year', 'origin', 'dest'], as_index=False)['y'].sum()

    return all_datasets

In [None]:
# Inputs for load_datasets_v2: years 2004-2023, analysis 4.
years_list_a4 = [yr for yr in range (2004,2024)]
data_path='./../../../data/paav_cargo/agg_data/freight_aggregation__' 
analysis_type_list_a4 = [4]

# load_datasets_v2 reads these two names as globals when it builds the file names.
regional_flag = 'regional'
non_jets_flag = 'nonjets'
origindater = load_datasets_v2(data_path, years_list_a4,analysis_type_list_a4)
origindater

In [None]:
# Second walkthrough: keep only rows whose origin is Rhode Island.
crt_df = origindater[origindater['origin']=="Rhode Island"]
crt_df

In [None]:
# Build the full date from the year (month and day fixed to 01-01).
# assign() returns a new frame, so the filtered slice is not written to in place
# and no SettingWithCopyWarning is raised.
crt_df = crt_df.assign(ds=pd.to_datetime(crt_df['year'].map(str) +"-01" + "-01", format='%Y-%m-%d'))
crt_df

In [None]:
# Reduce to the two columns the model is fitted on below.
crt_df=crt_df[['y','ds']]
crt_df

In [None]:
# Fit a linear-growth Prophet model on the Rhode Island frame.
model = Prophet(growth='linear')
model.fit(crt_df)

In [None]:
# Extend the dates by 120 monthly rows.
future_ds = my_make_future_dataframe(crt_df, periods=120)

In [None]:
# Preview both ends of the extended frame.
future_ds.head(3)
future_ds.tail(3)

In [None]:
# Predict for every date in the extended frame.
forecasted_df = model.predict(future_ds)

In [None]:
# Preview both ends of the forecast.
forecasted_df.head(3)
forecasted_df.tail(3)

In [None]:
# Baseline: 'y' at the latest date in crt_df; then the percent change of each forecast value against it.
last_cargo_value = (crt_df[crt_df['ds'] == crt_df['ds'].max()])['y'].tolist()[0]
forecasted_df["percent increase"] = (forecasted_df["yhat"]/last_cargo_value -1)*100

In [None]:
# Show the history next to the baseline value taken from it.
crt_df
last_cargo_value

In [None]:
# Preview the forecast again, now including the 'percent increase' column.
forecasted_df.head(3)
forecasted_df.tail(3)

In [None]:
# Keep only the forecast row(s) at the final date.
last_date=forecasted_df['ds'].max()
focused_df = forecasted_df[forecasted_df['ds']==last_date]
focused_df
# Take the first final-date row by position; the previous hard-coded index label
# (135) only matched when the history happened to have a particular length.
my_var = focused_df['percent increase'].iloc[0]
my_var

In [None]:
# Index label of the first row at the final forecast date.
a=(focused_df.index[focused_df['ds'] == focused_df['ds'].max()])[0]
a

# Problem states

In [None]:
# Show the rows of the flagged states. `list_of_states_to_investigate` was never
# defined; the '_origin_reg_jets' list is used because all_datasets was loaded
# from the regional jets files. A bare expression inside a for loop is not
# displayed, so the rows are selected in a single top-level expression.
# NOTE(review): switch to another list_of_states_to_investigate_* list if a different dataset is loaded.
all_datasets[all_datasets['origin'].isin(list_of_states_to_investigate_origin_reg_jets)]

In [None]:
# def load_datasets(data_path, years_list, analysis_type_list):
#     """
#     Concatanates a list of datasets into one dataframe, and also does yearly agg

#     Returns:
#     Concatanated list as a dataframe
#     """
#     #concat
#     full_path_list = [data_path+str(crt_year) + '_analysis'+ str(analysis_type) +'_' + str(regional_flag)+'_'+str(non_jets_flag) + '.csv' 
#                       for crt_year in years_list 
#                       for analysis_type in analysis_type_list]
#     print(full_path_list[0])
#     all_datasets= pd.concat([pd.read_csv(str(crt_file_name)) for crt_file_name in full_path_list], keys=years_list).reset_index()
    
#     #yearly agg
#     all_datasets.drop(columns=['level_1'],axis=1,inplace=True)
#     all_datasets = all_datasets.rename(columns = {'level_0':'year'})
#     all_datasets = all_datasets.groupby(['year', 'origin', 'dest'], as_index=True, group_keys=True)['y', 'num flights' ].agg(['sum','count'])
#     all_datasets.reset_index(inplace=True)
#     all_datasets.columns= ['_'.join(col) for col in all_datasets.columns.values]
#     all_datasets.drop(['origin_', 'dest_', 'y_count', 'num flights_count','num flights_sum'], axis=1, inplace = True)
#     all_datasets.rename(columns={'y_sum': 'y', 'year_':'ds' }, inplace = True)

# # my_testing["year"] = my_testing['ds'].dt.year #tagged this cuz level_0 column takes care of it

#     return all_datasets    








# years_list = [yr for yr in range (2002,2024)]
# years_list_s1 = [yr for yr in range (2004,2024)]
# data_path='./../../../data/paav_cargo/agg_data/freight_aggregation__' 

# analysis_type_list = [analysis_type for analysis_type in range(11)]
# # regional_flag = 'alldistance' # , 'regional']
# # non_jets_flag = 'jetsandnonjets' # 'nonjets']
# regional_flag = 'regional' # , 'alldistance']
# non_jets_flag = 'nonjets' # 'jetsandnonjets']


# # analysis_type_list = [4]
# # preprocessed_data_s1_breakdown = load_datasets(data_path, years_list_s1, analysis_type_list)

# analysis_type_list = [0]
# preprocessed_data_s1 = load_datasets(data_path, years_list_s1, analysis_type_list)






# percentiles = [2.5, 97.5]
# forecasted_stats5 = forecasted_percentiles(m_s5, cross_validation_results_s5, percentiles = percentiles)
# forecasted_stats5
# forecasted_stats5["percent increase"] = (forecasted_stats5["Predicted"]/last_cargo_value -1)*100
# forecasted_stats5['Predicted'] = forecasted_stats5['Predicted'].apply(humanize.intword)
# forecasted_stats5['pct_2.5'] = forecasted_stats5['pct_2.5'].apply(humanize.intword)
# forecasted_stats5['pct_97.5'] = forecasted_stats5['pct_97.5'].apply(humanize.intword)
# forecasted_stats5.rename(columns={'pct_2.5': 'lower bound of 95% CI', 'pct_97.5': 'upper bound of 95% CI', 'Predicted':'predicted'}, inplace = True)
# forecasted_stats5['origin'] = origin_place
# forecasted_stats5['dest'] = dest_place
# forecasted_stats5['date these results were obtained'] = str(date.today())
# data_types_dict = {'ds': str}
# forecasted_stats5 = forecasted_stats5.astype(data_types_dict)
# forecasted_stats5
# print("Today's date:", today)

# pd.set_option('display.max_rows', None)
# df1 = forecasted_stats.loc[(pd.to_datetime(forecasted_stats['ds']) == pd.to_datetime('2024-03-01')) ]
# df2 = forecasted_stats.loc[(pd.to_datetime(forecasted_stats['ds']) == pd.to_datetime('2026-03-01')) ]
# df3 = forecasted_stats.loc[(pd.to_datetime(forecasted_stats['ds']) == pd.to_datetime('2030-03-01')) ]
# df1
# df2
# df3


# def forecasted_percentiles(fc_model, input_df, percentiles):     
#     forecasted_samples = fc_model.predictive_samples(input_df)
#     forecasted_stats=pd.DataFrame(data=np.transpose(np.percentile(forecasted_samples['yhat'], percentiles, axis=1 ))
#              ,  columns = ['pct_'+str(x) for x in percentiles])
#     forecasted_stats.insert(loc=0, column='Predicted', value=input_df['yhat'])
#     forecasted_stats.insert(loc=0, column='ds', value=input_df['ds'])
#     return forecasted_stats 



# def forecast_future(future_samples_count, df, growth = 'linear'):
#     model = Prophet(growth=growth)
#     model.fit(df)
    
#     future = my_make_future_dataframe(df, periods=future_samples_count)

#     forecast = model.predict(future)
#     return forecast, model