In [None]:
# import libraries
from pathlib import Path
import pandas as pd
from scipy.stats import weibull_min, burr, gengamma
import numpy as np

In [None]:
# Directory of processed surface-wind data; each entry is read with pd.read_parquet
# and keyed by its file stem.
dir_sfcwind = Path("../data/processed/sfcwind")
# Path.glob("*") also returns hidden entries (e.g. .DS_Store, .gitkeep) and yields
# results in filesystem-dependent order: skip dot-entries so read_parquet does not
# choke on them, and sort so the dict (and every result derived from it) has a
# reproducible order.
files_sfcwind = sorted(
    file for file in dir_sfcwind.glob("*") if not file.name.startswith(".")
)
dict_files = {file.stem: pd.read_parquet(file) for file in files_sfcwind}

In [None]:
# Candidate distributions to fit, keyed by the name stored under 'Model Name'.
models = dict(
    weibull_min=weibull_min,
    burr=burr,
    gengamma=gengamma,
)

In [69]:
# Akaike information criterion
def calculate_aic(log_likelihood, k):
    """Return the AIC, ``2 * k - 2 * log_likelihood``.

    log_likelihood: log-likelihood of the fitted model.
    k: number of fitted parameters.
    """
    complexity_penalty = 2 * k
    goodness_of_fit = 2 * log_likelihood
    return complexity_penalty - goodness_of_fit

# Bayesian information criterion
def calculate_bic(n, log_likelihood, k):
    """Return the BIC, ``ln(n) * k - 2 * log_likelihood``.

    n: number of observations.
    log_likelihood: log-likelihood of the fitted model.
    k: number of fitted parameters.
    """
    complexity_penalty = np.log(n) * k
    goodness_of_fit = 2 * log_likelihood
    return complexity_penalty - goodness_of_fit

# define a function to select model
def model_selection(df):
    """Fit every distribution in ``models`` and return the one with the lowest AIC.

    Parameters
    ----------
    df : pandas.Series or numpy.ndarray
        Observations to fit. Must support boolean-mask indexing (``df[df > 0]``).

    Returns
    -------
    dict
        Keys ``'Model Name'``, ``'Params'``, ``'AIC'`` and ``'BIC'`` describing the
        best-scoring fit. If no model could be fitted with a finite score, the
        initial values are returned (``None`` for name/params, ``inf`` for AIC/BIC).

    Notes
    -----
    Failures while fitting a model are reported with ``print`` and that model is
    skipped; they do not abort the selection.
    """
    best_model = {'Model Name': None,
                  'Params': None, 'AIC': np.inf, 'BIC': np.inf}

    # Strictly positive observations, used only for the Weibull fit. This does not
    # depend on the model, so compute it once instead of once per loop iteration.
    df_positive = df[df > 0]

    for model_name, model in models.items():
        try:
            # NOTE(review): weibull_min is fitted on the positive subset while the
            # other models see the full data, so their AIC values are computed on
            # different samples whenever df contains non-positive values -- confirm
            # this comparison is intended.
            sample = df_positive if model_name == 'weibull_min' else df

            params = model.fit(sample)
            # logpdf avoids the underflow of pdf() to 0 (and hence log(0) = -inf)
            # for observations far in the tails.
            log_likelihood = np.sum(model.logpdf(sample, *params))

            k = len(params)
            # Use the size of the sample that was actually fitted, so the BIC
            # penalty matches the log-likelihood it is paired with.
            n = len(sample)
            aic = calculate_aic(log_likelihood, k)
            bic = calculate_bic(n, log_likelihood, k)

            if not np.isfinite(aic):
                # A non-finite score can never be selected; say so explicitly
                # rather than dropping the model silently.
                print(f"Skipping {model_name}: non-finite log-likelihood")
                continue

            if aic < best_model['AIC']:
                best_model.update(
                    {'Model Name': model_name, 'Params': params, 'AIC': aic, 'BIC': bic})
        except Exception as e:
            # Best-effort: one failing distribution must not stop the comparison.
            print(f"Error fitting {model_name}: {e}")

    return best_model

In [72]:
# Select the best-fitting wind-speed distribution for every wind farm,
# keyed by the same code used in dict_files.
dict_best_models = {
    code: model_selection(farm_df['sfcWind'])
    for code, farm_df in dict_files.items()
}


In [82]:
# One row per entry of dict_best_models; the dict key becomes the "code" column.
df_best_models = (
    pd.DataFrame.from_dict(dict_best_models, orient='index')
    .rename_axis("code")
    .reset_index()
)
df_best_models

Unnamed: 0,code,Model Name,Params,AIC,BIC
0,0WAMBOWF,gengamma,"(2.2982118167491468, 2.166852051413005, -0.886...",311670.873215,311708.397552
1,ALBANY,gengamma,"(1.609137475072723, 2.0643398651842144, -0.610...",431000.718235,431038.242572
2,ALINTA_WWF,gengamma,"(0.836682459123211, 2.583573203578604, -0.0015...",377520.148873,377557.673209
3,ARWF,gengamma,"(0.5571219629813979, 2.9899805377097373, 0.027...",384017.804056,384055.328392
4,BADGINGARRA,gengamma,"(1.2311748622347252, 2.137906115576946, -0.052...",366243.984863,366281.509199
...,...,...,...,...,...
96,WRWF1,gengamma,"(0.8880072928756162, 2.065404161623898, -0.000...",378093.157022,378130.681358
97,YAMBUK,gengamma,"(1.6575653301374897, 1.819249811824407, -0.205...",403443.066073,403480.590410
98,YANDIN,gengamma,"(1.611334399605959, 1.8982424816832042, -0.156...",360986.479546,361024.003882
99,YENDONWF,gengamma,"(5.2161361374914454, 1.1787913099977685, -1.18...",394153.488585,394191.012921


In [84]:
# Persist the selected distribution per code as CSV (without the row index).
path_distribution_wind = Path("../data/processed/distribution_wind.csv")
df_best_models.to_csv(path_distribution_wind, index=False)