# In [None]:  (notebook cell marker left over from the export)
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import chi2

# Data preparation and binning
def prepare_data(df):
    """Add the grouped rating factors used by the GLMs and return ``df``.

    The frame is modified in place and the same object is returned.

    * ``Group_Persons``: ``NumberOfPersons`` cut into five bands.
    * ``Group_Age``: ``CompanyAge`` cut into three bands (grouping is meant
      to stabilise the estimated factors).
    * ``FinancialRating``: the codes 'missing', 'AN' and 'IR' are pooled
      into one 'Unknown/Special' level.
    * ``TravellingArea`` and the three columns above are cast to the pandas
      ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``NumberOfPersons``, ``CompanyAge``, ``FinancialRating``
        and ``TravellingArea``.

    Returns
    -------
    pandas.DataFrame
        The input frame, with the columns described above added/overwritten.
    """
    # Bands for the number of insured persons: <=1, 2-5, 6-30, 31-100, >100
    # (pd.cut uses right-closed intervals by default).
    persons_edges = [-np.inf, 1, 5, 30, 100, np.inf]
    persons_bands = ['1', '2-5', '6-30', '31-100', '100+']
    df['Group_Persons'] = pd.cut(df['NumberOfPersons'],
                                 bins=persons_edges,
                                 labels=persons_bands)

    # Bands for company age: <=3, 4-15, >15.
    age_edges = [-np.inf, 3, 15, np.inf]
    age_bands = ['New', 'Established', 'Experienced']
    df['Group_Age'] = pd.cut(df['CompanyAge'],
                             bins=age_edges,
                             labels=age_bands)

    # Pool rare / missing financial ratings into a single level.
    # Original note: ratings run from C to AAA; 'missing' is kept apart
    # from that scale.
    pooled_ratings = ['missing', 'AN', 'IR']
    df['FinancialRating'] = df['FinancialRating'].replace(pooled_ratings,
                                                          'Unknown/Special')

    # Make sure every rating factor is an explicit categorical column.
    categorical_columns = ('TravellingArea', 'Group_Persons',
                           'Group_Age', 'FinancialRating')
    for name in categorical_columns:
        df[name] = df[name].astype('category')

    return df

# Load the training and evaluation sets and apply the same preparation to both.
df_train = prepare_data(pd.read_csv("GLM_KTH_Data_Train.csv"))
df_eval = prepare_data(pd.read_csv("GLM_KTH_Data_Eval.csv"))

# prepare_data derives the category levels from each file independently.
# patsy (behind the statsmodels formula API) refuses to build a design matrix
# from a pandas categorical whose levels differ from those seen at fit time,
# so the evaluation columns are re-declared with the training levels.
for _col in df_train.select_dtypes(include="category").columns:
    _train_levels = df_train[_col].cat.categories
    _unseen = set(df_eval[_col].cat.categories) - set(_train_levels)
    if _unseen:
        # A level the models never saw has no fitted factor; fail loudly
        # rather than silently turning those policies into NaN premiums.
        raise ValueError(
            f"Column {_col!r} has levels in the evaluation data that are "
            f"absent from the training data: {sorted(map(str, _unseen))}"
        )
    df_eval[_col] = df_eval[_col].cat.set_categories(_train_levels)

# Frequency model
# log(Duration) is supplied as an offset, so the Poisson mean is proportional
# to Duration and the coefficients describe a claim rate per unit of Duration
# (presumably per policy-year -- confirm the unit of Duration).
formula_freq = "NumberOfClaims ~ Group_Persons + Group_Age + TravellingArea + FinancialRating"

model_freq = smf.glm(
    formula_freq,
    df_train,
    family=sm.families.Poisson(),
    offset=np.log(df_train['Duration']),
).fit()

# Severity model
# Fitted only on policies with at least one claim; the response is the
# average cost per claim on each policy.
df_sev = df_train[df_train['NumberOfClaims'] > 0].copy()  # copy: a new column is added below
df_sev['AvgCost'] = df_sev['ClaimCost'] / df_sev['NumberOfClaims']

# Log link: guarantees positive predicted costs and a multiplicative structure.
# `links.Log()` is used instead of the deprecated lowercase alias `links.log()`.
formula_sev = "AvgCost ~ Group_Persons + TravellingArea"

# AvgCost is an average over NumberOfClaims claims, so its variance shrinks
# in proportion to 1 / NumberOfClaims. Passing the claim count as variance
# weights gives policies with more claims the correspondingly larger
# influence on the fit.
model_sev = smf.glm(formula=formula_sev,
                    data=df_sev,
                    family=sm.families.Gamma(link=sm.families.links.Log()),
                    var_weights=df_sev['NumberOfClaims']).fit()

# Statistical validation: likelihood ratio test (LRT)
def perform_lrt(full_model, formula_reduced, data, family, offset=None):
    """Return the likelihood-ratio-test p-value of a full vs. a reduced GLM.

    The reduced model is fitted here from ``formula_reduced``; the statistic
    ``2 * (llf_full - llf_reduced)`` is compared with a chi-square
    distribution whose degrees of freedom equal the difference in
    ``df_model`` between the two fits.

    Parameters
    ----------
    full_model : fitted GLM results
        Already-fitted larger model (must expose ``llf`` and ``df_model``).
    formula_reduced : str
        Formula of the nested, smaller model.
    data : pandas.DataFrame
        Data used to fit the reduced model. The test is only meaningful if
        these are the same observations the full model was fitted on.
    family : statsmodels family instance
        Error distribution for the reduced model (same as the full model).
    offset : array-like, optional
        Offset passed to the reduced model (same as the full model).

    Returns
    -------
    float
        Upper-tail chi-square probability of the LR statistic.

    Raises
    ------
    ValueError
        If the reduced model does not have fewer model degrees of freedom
        than the full model (the chi-square reference would be undefined
        and the p-value would silently come out as NaN).
    """
    model_reduced = smf.glm(formula=formula_reduced, data=data, family=family, offset=offset).fit()

    df_diff = full_model.df_model - model_reduced.df_model
    if df_diff <= 0:
        raise ValueError(
            "The reduced model must have fewer parameters than the full model "
            f"(difference in model degrees of freedom: {df_diff})."
        )

    lr_stat = 2 * (full_model.llf - model_reduced.llf)
    p_val = chi2.sf(lr_stat, df_diff)
    return p_val

# Test: is company age (Group_Age) significant for claim frequency?
# The reduced model must differ from the full frequency model by Group_Age
# only, so FinancialRating is kept; dropping it as well would turn this into
# a joint test of Group_Age and FinancialRating.
p_val_age = perform_lrt(model_freq, "NumberOfClaims ~ Group_Persons + TravellingArea + FinancialRating",
                        df_train, sm.families.Poisson(), offset=np.log(df_train['Duration']))

print(f"LRT p-value pour Group_Age: {p_val_age:.4f}")

# Extract the exponentiated coefficients for the report
def get_gamma_factors(model):
    """Return ``exp(model.params)``.

    For a model fitted with a log link these are the multiplicative factors
    attached to each term; the function itself does not check the link.

    Parameters
    ----------
    model : fitted results object exposing ``params``.

    Returns
    -------
    Same container type as ``model.params``, element-wise exponentiated.
    """
    coefficients = model.params
    return np.exp(coefficients)

# Show the frequency model's multiplicative factors under a header line.
print("\n--- FACTEURS DE RISQUE (MULTIPLICATEURS) ---", get_gamma_factors(model_freq), sep="\n")

# Final price calculations
# Annual pricing on the evaluation set: every policy gets Duration = 1.
df_eval['Duration'] = 1.0

# The offset is not a formula term, so predict() on new data does not pick up
# the Duration column by itself (without an explicit offset it uses 0).
# Passing it explicitly ties the prediction to the Duration set above.
pred_freq = model_freq.predict(df_eval, offset=np.log(np.asarray(df_eval['Duration'])))
pred_sev = model_sev.predict(df_eval)

# Pure premium = expected number of claims x expected cost per claim.
df_eval['premium'] = pred_freq * pred_sev

# Export the premiums to a CSV file
df_eval[['PolID', 'premium']].to_csv("submission_project2.csv", index=False)

print("\nModelisation is completed")