In [1]:
# 🌧️ Zindi Hackathon - Starter Notebook: Predicting Corrected Precipitation (PRECTOTCORR)

# 📥 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import ipywidgets as widgets
from IPython.display import display
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np

# Variables Météorologiques

## Variables Météorologiques de Base

| Variable | Description | Unité |
|----------|-------------|-------|
| WS2M | Vitesse du vent à 2m | m/s |
| T2M | Température à 2m | °C |
| T2MDEW | Température du point de rosée à 2m | °C |
| T2MWET | Température du bulbe humide à 2m | °C |
| RH2M | Humidité relative à 2m | % |
| PS | Pression de surface | kPa |
| QV2M | Humidité spécifique à 2m | g/kg |

## Variables Dérivées et Indices Calculés

| Variable | Description | Formule/Calcul | Signification |
|----------|-------------|----------------|---------------|
| HUMIDITY_SATURATION | Ratio de saturation | QV2M / (RH2M/100 + ε) | Capacité d'absorption d'eau |
| TEMP_PRESSURE_RATIO | Ratio température/pression | T2M / PS | Relation thermodynamique |
| TEMP_HUMIDITY_INDEX | Index de confort thermique | T2M × RH2M / 100 | Température ressentie |
| DEW_POINT_SPREAD | Écart point de rosée | T2M - T2MDEW | Potentiel de condensation |
| WET_BULB_DIFF | Différence bulbe humide | T2M - T2MWET | Capacité évaporative |
| PRESSURE_TENDENCY | Tendance de pression | diff(PS) | Évolution météorologique |

## Variables Temporelles et Cycliques

| Variable | Description | Formule | Utilité |
|----------|-------------|---------|---------|
| MONTH_SIN | Composante sinusoïdale du mois | sin(2π × MO/12) | Capture cycle annuel |
| MONTH_COS | Composante cosinusoïdale du mois | cos(2π × MO/12) | Capture cycle annuel |
| DAY_SIN | Composante sinusoïdale du jour | sin(2π × DY/31) | Capture cycle mensuel |
| DAY_COS | Composante cosinusoïdale du jour | cos(2π × DY/31) | Capture cycle mensuel |
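| SEASON | Codage saisonnier | Actuellement SEASON = MO (mois, 1–12) ; le recodage en 4 saisons (1=jan-fév, 4=mars-mai, 2=juin-sept, 3=oct-déc) est désactivé dans le code | Patterns saisonniers |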

In [2]:
# 📁 2. Load the datasets
# Both CSV files are read relative to the current working directory.
train = pd.read_csv("Train_data.csv")
test = pd.read_csv("Test_data.csv")

# Quick sanity check on the dimensions of each split.
print("Train shape:", train.shape)
print("Test shape:", test.shape)
# Last expression of the cell: rendered as the cell output.
train.head()

Train shape: (3579, 13)
Test shape: (1535, 12)


Unnamed: 0,ID,YEAR,MO,DY,WS2M,T2M,T2MDEW,T2MWET,Target,RH2M,PS,QV2M,DATE
0,ID_IODka8_20140721,2014,7,21,1.83,22.67,18.75,20.71,0.0,79.73,96.49,14.07,2014-07-21
1,ID_JeycV8_20110904,2011,9,4,2.92,25.99,18.95,22.47,0.01,66.8,96.11,14.3,2011-09-04
2,ID_gdbFWm_20180416,2018,4,16,1.06,25.29,22.11,23.7,3.85,83.56,96.3,17.37,2018-04-16
3,ID_lR86N5_20190506,2019,5,6,0.66,26.04,22.93,24.49,2.8,83.68,96.15,18.3,2019-05-06
4,ID_v4KHmp_20200908,2020,9,8,1.86,25.35,20.98,23.16,2.58,77.9,96.1,16.24,2020-09-08


## fonction de transformation

In [3]:
def generate_weather_features(df):
    """Build the modelling feature table from a raw weather DataFrame.

    The input is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'DATE', 'MO', 'DY', 'WS2M', 'T2M', 'T2MDEW', 'T2MWET',
        'RH2M', 'PS' and 'QV2M'. A frame that has a 'Target' column is
        treated as training data; otherwise an 'ID' column is required.

    Returns
    -------
    pandas.DataFrame
        The base measurements, the derived features and the cyclic date
        encodings, followed by ['Target', 'DATE'] when 'Target' is present
        and by ['DATE', 'ID'] otherwise.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    result = df.copy()

    # Calendar information taken from the DATE column.
    result['DATE'] = pd.to_datetime(result['DATE'])
    result['weekday'] = result['DATE'].dt.weekday

    # Derived features. TEMP_HUMIDITY_INDEX, DEW_POINT_SPREAD and
    # WET_BULB_DIFF are intentionally not produced: they were never part of
    # the returned column set.
    result['HUMIDITY_SATURATION'] = result['QV2M'] / (result['RH2M'] / 100 + 1e-6)
    result['TEMP_PRESSURE_RATIO'] = result['T2M'] / result['PS']
    # NOTE(review): diff() follows the current row order; the first row is
    # always NaN and the value is only a day-to-day tendency if the rows are
    # chronologically sorted -- confirm this is intended.
    result['PRESSURE_TENDENCY'] = result['PS'].diff()

    # Cyclic encodings of month and day of month.
    result['MONTH_SIN'] = np.sin(2 * np.pi * result['MO'] / 12)
    result['MONTH_COS'] = np.cos(2 * np.pi * result['MO'] / 12)
    result['DAY_SIN'] = np.sin(2 * np.pi * result['DY'] / 31)
    result['DAY_COS'] = np.cos(2 * np.pi * result['DY'] / 31)

    # SEASON currently carries the raw month number (1-12). The four-season
    # recoding (1=Jan-Feb, 4=Mar-May, 2=Jun-Sep, 3=Oct-Dec) is disabled.
    result['SEASON'] = result['MO']

    feature_columns = [
        # Base measurements
        'WS2M', 'T2M', 'T2MDEW', 'T2MWET', 'RH2M', 'PS', 'QV2M',
        # Derived features
        'HUMIDITY_SATURATION', 'weekday',
        'TEMP_PRESSURE_RATIO',
        'PRESSURE_TENDENCY',
        # Temporal features
        'MONTH_SIN', 'MONTH_COS', 'DAY_SIN', 'DAY_COS', 'SEASON',
    ]

    # Explicit train/test detection instead of catching every exception:
    # training data carries the target, test data carries the submission ID.
    if 'Target' in result.columns:
        trailing_columns = ['Target', 'DATE']
    else:
        trailing_columns = ['DATE', 'ID']

    return result[feature_columns + trailing_columns]

## Base de données avec les variables essentielles

In [4]:
# Replace the raw frames by their engineered versions.
# NOTE: the returned frames no longer contain MO/DY, which
# generate_weather_features reads, so this cell cannot be re-run without
# first reloading the CSV files.
train = generate_weather_features(train)
test = generate_weather_features(test)

In [5]:
# Preview the first two rows of the engineered training table.
train.head(2)

Unnamed: 0,WS2M,T2M,T2MDEW,T2MWET,RH2M,PS,QV2M,HUMIDITY_SATURATION,weekday,TEMP_PRESSURE_RATIO,PRESSURE_TENDENCY,MONTH_SIN,MONTH_COS,DAY_SIN,DAY_COS,SEASON,Target,DATE
0,1.83,22.67,18.75,20.71,79.73,96.49,14.07,17.647037,0,0.234947,,-0.5,-0.8660254,-0.897805,-0.440394,7,0.0,2014-07-21
1,2.92,25.99,18.95,22.47,66.8,96.11,14.3,21.407154,6,0.270419,-0.38,-1.0,-1.83697e-16,0.724793,0.688967,9,0.01,2011-09-04


In [6]:
# Preview the first two rows of the engineered test table.
test.head(2)

Unnamed: 0,WS2M,T2M,T2MDEW,T2MWET,RH2M,PS,QV2M,HUMIDITY_SATURATION,weekday,TEMP_PRESSURE_RATIO,PRESSURE_TENDENCY,MONTH_SIN,MONTH_COS,DAY_SIN,DAY_COS,SEASON,DATE,ID
0,1.5,24.58,21.27,22.92,82.72,96.19,16.51,19.958873,2,0.255536,,1.224647e-16,-1.0,0.101168,-0.994869,6,2011-06-15,ID_OdrVnE_20110615
1,1.8,25.27,20.66,22.97,76.89,96.25,15.89,20.665859,6,0.262545,0.06,-1.0,-1.83697e-16,0.897805,-0.440394,9,2023-09-10,ID_MdWBtG_20230910


### Visualisation des données

In [7]:
def explore_categorical_variables(df):
    """Draw one bar chart of value counts per categorical column.

    A column is considered categorical when its dtype is object or category,
    or when it has at most 10 distinct values.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    plotly Figure with up to three charts per row, or None (after printing a
    message) when no column qualifies.
    """
    def _is_categorical(series):
        return (
            series.dtype == 'object'
            or series.dtype.name == 'category'
            or series.nunique() <= 10
        )

    categorical_vars = [name for name in df.columns if _is_categorical(df[name])]

    if len(categorical_vars) == 0:
        print("Aucune variable qualitative trouvée")
        return

    # Grid layout: at most three charts per row, as many rows as needed.
    n_cols = min(3, len(categorical_vars))
    n_rows = -(-len(categorical_vars) // n_cols)  # ceiling division

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=categorical_vars,
        specs=[[{"type": "bar"}] * n_cols for _ in range(n_rows)],
    )

    for position, var in enumerate(categorical_vars):
        grid_row, grid_col = divmod(position, n_cols)
        # Frequencies ordered by category value rather than by count.
        counts = df[var].value_counts().sort_index()
        bars = go.Bar(x=counts.index, y=counts.values, name=var, showlegend=False)
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(
        height=300 * n_rows,
        title_text="Exploration Univariée - Variables Qualitatives",
        title_x=0.5,
    )
    return fig

def explore_quantitative_variables(df):
    """Draw one 30-bin histogram per quantitative column, with summary stats.

    A column is considered quantitative when it is numeric and has more than
    10 distinct values.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    plotly Figure with up to three histograms per row, each annotated with
    mean, standard deviation, median, min and max; or None (after printing a
    message) when no column qualifies.
    """
    quantitative_vars = [
        col for col in df.columns
        if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() > 10
    ]

    if not quantitative_vars:
        print("Aucune variable quantitative trouvée")
        return

    # Grid layout: at most three histograms per row.
    n_vars = len(quantitative_vars)
    cols = min(3, n_vars)
    rows = (n_vars + cols - 1) // cols

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=quantitative_vars,
        specs=[[{"type": "histogram"}] * cols for _ in range(rows)]
    )

    for i, var in enumerate(quantitative_vars):
        row = i // cols + 1
        col = i % cols + 1

        var_data = df[var].dropna()
        min_val = var_data.min()
        max_val = var_data.max()

        fig.add_trace(
            go.Histogram(
                x=var_data,
                name=var,
                showlegend=False,
                nbinsx=30,
                # Explicit bins spanning exactly [min, max]; max > min is
                # guaranteed because the column has more than 10 values.
                xbins=dict(start=min_val, end=max_val, size=(max_val - min_val) / 30)
            ),
            row=row, col=col
        )

        mean_val = var_data.mean()
        std_val = var_data.std()
        median_val = var_data.median()

        # Position the statistics box relative to the subplot area.
        # With row/col given, plotly replaces xref/yref by the subplot's own
        # axes, so plain "x"/"y" references would read (0.7, 0.9) as DATA
        # coordinates and misplace the box; " domain" references keep it at
        # 70% / 90% of each subplot.
        fig.add_annotation(
            text=f"μ={mean_val:.2f}<br>σ={std_val:.2f}<br>Med={median_val:.2f}<br>Min={min_val:.2f}<br>Max={max_val:.2f}",
            xref="x domain", yref="y domain",
            x=0.7, y=0.9, xanchor="left", yanchor="top",
            showarrow=False, font=dict(size=9),
            row=row, col=col
        )

    fig.update_layout(
        height=300 * rows,
        title_text="Exploration Univariée - Variables Quantitatives",
        title_x=0.5
    )

    return fig

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

def analyze_seasonal_discrimination(df, season_col='SEASON'):
    """Compare every variable across the values of a season column.

    Quantitative columns (numeric, more than 10 distinct values) get one box
    plot per season; categorical columns (object/category dtype or at most
    10 distinct values) get one bar trace of value counts per season.

    Parameters
    ----------
    df : pandas.DataFrame
    season_col : str
        Grouping column. Any set of sortable values is accepted; it is not
        restricted to the codes 1-4.

    Returns
    -------
    plotly Figure with up to four charts per row, or None (after printing a
    message) when there is no variable to analyse.
    """
    quantitative_vars = [
        col for col in df.columns
        if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() > 10 and col != season_col
    ]
    categorical_vars = [
        col for col in df.columns
        if (df[col].dtype == 'object' or df[col].dtype.name == 'category' or df[col].nunique() <= 10)
        and col != season_col
    ]

    all_vars = quantitative_vars + categorical_vars
    if not all_vars:
        print("Aucune variable à analyser")
        return

    n_vars = len(all_vars)
    cols = min(4, n_vars)
    rows = (n_vars + cols - 1) // cols

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=all_vars
    )

    # Season styling. Codes 1-4 keep their historical colours; any other
    # value (e.g. month numbers 1-12, or string codes) cycles through the
    # same palette instead of raising KeyError.
    base_colors = {1: '#FF6B6B', 2: '#4ECDC4', 3: '#45B7D1', 4: '#96CEB4'}
    palette = list(base_colors.values())
    seasons = sorted(df[season_col].unique())
    season_colors = {
        season: base_colors.get(season, palette[rank % len(palette)])
        for rank, season in enumerate(seasons)
    }
    season_names = {season: f'S{season}' for season in seasons}

    quantitative_set = set(quantitative_vars)
    for i, var in enumerate(all_vars):
        row = i // cols + 1
        col = i % cols + 1

        for season in seasons:
            subset = df[df[season_col] == season][var]
            if var in quantitative_set:
                trace = go.Box(
                    y=subset.dropna(),
                    name=f'{season_names[season]}',
                    marker_color=season_colors[season],
                    showlegend=(i == 0),  # one legend entry per season
                    legendgroup=str(season)
                )
            else:
                counts = subset.value_counts()
                trace = go.Bar(
                    x=counts.index,
                    y=counts.values,
                    name=f'{season_names[season]}',
                    marker_color=season_colors[season],
                    showlegend=(i == 0),
                    legendgroup=str(season)
                )
            fig.add_trace(trace, row=row, col=col)

    fig.update_layout(
        height=250 * rows,
        title_text="Discrimination des Variables par Saison",
        title_x=0.5,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    return fig




def analyze_target_distribution(df, target_col='Target', date_col='DATE', ref=40):
    """Plot the histogram, boxplot and time series of one numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `target_col` and `date_col`.
    target_col : str
        Column whose distribution is analysed.
    date_col : str
        Column converted with `pd.to_datetime` and used as the time axis.
    ref : float
        Reference threshold: drawn as a dashed red line on the time series
        and used to count the observations strictly above it.

    Returns
    -------
    plotly Figure with three stacked panels. The title reports the number
    and share of IQR outliers (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR) and of
    values above `ref`.
    """
    target_data = df[target_col].dropna()
    n_obs = len(target_data)

    # Outliers by the 1.5 * IQR rule.
    Q1 = target_data.quantile(0.25)
    Q3 = target_data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = target_data[(target_data < lower_bound) | (target_data > upper_bound)]
    n_outliers = len(outliers)
    # Guard against an empty / all-NaN column, which used to raise
    # ZeroDivisionError.
    pct_outliers = (n_outliers / n_obs) * 100 if n_obs else 0.0

    # Share of observations above the reference threshold.
    above_ref = target_data[target_data > ref]
    n_above_ref = len(above_ref)
    pct_above_ref = (n_above_ref / n_obs) * 100 if n_obs else 0.0

    fig = make_subplots(
        rows=3, cols=1,
        subplot_titles=[f'Distribution de {target_col}', 'Boxplot', 'Évolution temporelle'],
        row_heights=[0.4, 0.2, 0.4],
        vertical_spacing=0.08
    )

    # Panel 1: histogram.
    fig.add_trace(
        go.Histogram(x=target_data, nbinsx=30, marker_color='steelblue', name='Distribution'),
        row=1, col=1
    )

    # Panel 2: horizontal boxplot.
    fig.add_trace(
        go.Box(x=target_data, marker_color='lightblue', name='Boxplot'),
        row=2, col=1
    )

    # Panel 3: values in chronological order. Work on an explicit copy so
    # the date conversion never writes into a slice of the caller's frame.
    df_clean = df[[date_col, target_col]].dropna().copy()
    df_clean[date_col] = pd.to_datetime(df_clean[date_col])
    df_sorted = df_clean.sort_values(date_col)

    fig.add_trace(
        go.Scatter(x=df_sorted[date_col], y=df_sorted[target_col],
                   mode='lines+markers', marker_color='green', name='Évolution'),
        row=3, col=1
    )

    # Reference threshold on the time series.
    fig.add_hline(y=ref, line_dash="dash", line_color="red", row=3, col=1)

    fig.update_layout(
        height=800,
        template="plotly_white",
        showlegend=False,
        title_text=(
            f"Analyse de {target_col} - Outliers: {n_outliers} ({pct_outliers:.1f}%)"
            f" |seuil:  >{ref}: {n_above_ref} ({pct_above_ref:.1f}%)"
        )
    )

    fig.update_xaxes(title_text=target_col, row=2, col=1)
    fig.update_xaxes(title_text=date_col, row=3, col=1)
    fig.update_yaxes(title_text="Fréquence", row=1, col=1)
    fig.update_yaxes(title_text=target_col, row=3, col=1)

    return fig





In [8]:
# Bar charts of value counts for the low-cardinality columns of the training set.
explore_categorical_variables(train)

In [9]:
# Annotated histograms for the continuous columns of the training set.
explore_quantitative_variables(train)

| SEASON | Classification des saisons du Congo-Brazzaville |
|--------|--------------------------------------------------|
| 1 | Petite saison sèche (janvier-février) |
| 2 | Grande saison sèche (juin-septembre) |
| 3 | Grande saison des pluies (octobre-décembre) |
| 4 | Petite saison des pluies (mars-mai) |

In [None]:

# Histogram, boxplot and time series of the target on the full training set,
# with the reference line drawn at 50 (an optional Target<=30 filter is
# left commented out on the first argument).
analyze_target_distribution(train,#.query('Target<=30'),
                             target_col='Target', date_col='DATE',ref=50)

In [38]:
def interactive_plot(target_col, ref):
    """Show the distribution panels of `target_col` for rows with Target <= 80.

    Callback for `widgets.interact`: `target_col` is the column to analyse
    and `ref` the reference threshold forwarded to
    `analyze_target_distribution`.
    """
    fig = analyze_target_distribution(train.query('Target<=80'),
                                      target_col=target_col, date_col='DATE', ref=ref)
    fig.show()


# Offer the numeric feature columns by dtype rather than by position:
# `train.dtypes[:-3]` depends on the last three columns being SEASON, Target
# and DATE, which stops being true once another column is appended to
# `train` (this cell was last executed out of order, as In [38]).
widgets.interact(
    interactive_plot,
    target_col=[
        name for name in train.select_dtypes(include='number').columns
        if name not in ('SEASON', 'Target')
    ],
    ref=10,
)

interactive(children=(Dropdown(description='target_col', options=('WS2M', 'T2M', 'T2MDEW', 'T2MWET', 'RH2M', '…

<function __main__.interactive_plot(target_col, ref)>

## Étude des corrélations

In [13]:
import plotly.express as px
import plotly.figure_factory as ff 

# Select every numeric column. Listing only float64/int64 silently dropped
# columns stored with another width (e.g. `weekday`, which `.dt.weekday`
# returns as int32 on recent pandas). DATE is a datetime column and is
# therefore excluded by the dtype filter itself.
cols_corr = train.select_dtypes(include='number').columns.tolist()
corr_matrix = train[cols_corr].corr().round(2)

# Annotated heatmap: each cell shows its rounded Pearson coefficient.
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.index.tolist(),
    colorscale='RdBu_r',
    showscale=True,
    annotation_text=corr_matrix.values.astype(str)
)
fig.update_layout(
    title="Matrice de corrélation entre les variables",
    font=dict(size=14),
    title_font=dict(size=20, family='Arial', color='darkblue'),
    plot_bgcolor='rgba(0,0,0,0)',height=600,
    margin=dict(l=40, r=40, t=60, b=40)
)
fig.show()

In [14]:
# Hand-picked list of column names: base measurements, cyclic date
# encodings, weekday, SEASON and the target.
var_retain = [
    'WS2M', 'T2M', 'T2MDEW', 'T2MWET', 'RH2M', 'PS',
    'MONTH_SIN', 'MONTH_COS',
    'DAY_SIN', 'weekday', 'DAY_COS',
    'SEASON',
    'Target',
]

### Recodage de l'humidité spécifique, car elle apporte la même information que l'humidité relative

In [15]:
# 'QV2M'

def plot_target_by_quintile(df, variable='QV2M', target='TARGET', q=5):
    """Box plot of `target` within the quantile classes of a continuous variable.

    Parameters
    ----------
    df : pandas.DataFrame
    variable : str
        Continuous column cut into `q` equal-frequency classes with `pd.qcut`.
    target : str
        Column shown on the y axis. If no column has exactly this name, a
        single column matching it case-insensitively is used instead, so the
        default 'TARGET' resolves to this dataset's 'Target' column.
    q : int
        Number of quantile classes (5 = quintiles), labelled Q1..Qq.

    Returns
    -------
    plotly Figure.

    Raises
    ------
    KeyError
        If `target` cannot be resolved to exactly one column.
    """
    if target not in df.columns:
        # Backward-compatible fallback: this path used to fail with KeyError.
        candidates = [name for name in df.columns if str(name).lower() == str(target).lower()]
        if len(candidates) != 1:
            raise KeyError(f"Column '{target}' not found in DataFrame")
        target = candidates[0]

    # Explicit copy: the class column is added to a private frame, never to
    # a slice of the caller's data.
    data = df[[variable, target]].dropna().copy()
    data['quintile'] = pd.qcut(data[variable], q=q, labels=[f'Q{i+1}' for i in range(q)])

    fig = px.box(data, x='quintile', y=target, color='quintile',
                 title=f"{target} selon les quintiles de {variable}",
                 labels={'quintile': f'Quintiles de {variable}', target: target})

    fig.update_layout(showlegend=False, template='plotly_white')
    return fig

# Target by tercile (q=3) of the dew-point temperature, restricted to rows
# with Target <= 38.
plot_target_by_quintile(train.query('Target<=38')
                        , variable='T2MDEW', target='Target', q=3)

# Ajout d'une nouvelle variable

In [16]:

def add_quintile_column_train_test(train_df, test_df, variable, q=5):
    """Bin a variable into quantile classes learned on train, applied to both sets.

    The class boundaries are the `q`-quantiles of `train_df[variable]`; the
    outermost boundaries are replaced by -inf / +inf so that every test
    value falls into a class. Classes are right-closed, as produced by
    `pd.cut`: Q1 = ]-inf, s1], Q2 = ]s1, s2], ..., Qn = ]s(n-1), +inf[.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Neither frame is modified; copies are returned.
    variable : str
        Numeric column to bin.
    q : int
        Requested number of classes. When several quantiles coincide (heavily
        tied data) the duplicated boundaries are merged, so fewer than `q`
        classes may be produced.

    Returns
    -------
    (train_copy, test_copy) : tuple of pandas.DataFrame
        Copies of the inputs with an extra categorical column
        `quintile_<variable>` labelled 'Q1', 'Q2', ...
    """
    train_copy = train_df.copy()
    test_copy = test_df.copy()

    # STEP 1: thresholds are determined on TRAIN only.
    quantiles = np.linspace(0, 1, q + 1)
    # Private, writable float array: `.values` may be a read-only view of
    # pandas-owned memory (Copy-on-Write), which makes the writes below fail.
    seuils = train_copy[variable].quantile(quantiles).to_numpy(dtype=float, copy=True)

    # Open-ended outer classes.
    seuils[0] = -np.inf
    seuils[-1] = np.inf

    # pd.cut rejects repeated edges; merge them (np.unique also keeps the
    # edges sorted).
    seuils = np.unique(seuils)
    n_classes = len(seuils) - 1

    print(f"=== QUANTILES DÉTERMINÉS SUR TRAIN pour {variable} ===")
    for i in range(n_classes):
        last = i == n_classes - 1
        borne_inf = "-∞" if i == 0 else f"{seuils[i]:.3f}"
        borne_sup = "+∞" if last else f"{seuils[i+1]:.3f}"
        # Right-closed intervals, matching what pd.cut actually builds.
        closing = "[" if last else "]"
        print(f"Q{i+1}: ]{borne_inf}, {borne_sup}{closing}")

    label_list = [f'Q{i+1}' for i in range(n_classes)]

    # STEP 2: apply the same thresholds to both sets.
    column = f'quintile_{variable}'
    train_copy[column] = pd.cut(train_copy[variable], bins=seuils, labels=label_list, include_lowest=True)
    test_copy[column] = pd.cut(test_copy[variable], bins=seuils, labels=label_list, include_lowest=True)

    # NaN only remain where the source value itself is missing.
    train_nan = train_copy[column].isna().sum()
    test_nan = test_copy[column].isna().sum()

    print(f"\nNaN créés - Train: {train_nan}, Test: {test_nan}")

    print(f"\nRépartition TRAIN:")
    print(train_copy[column].value_counts().sort_index())
    print(f"\nRépartition TEST:")
    print(test_copy[column].value_counts().sort_index())

    return train_copy, test_copy

# Number of quantile classes; passed to the call below instead of repeating
# the literal, so the two can no longer drift apart.
q = 4

# Thresholds are learned on train and applied to both frames; each frame
# gains a `quintile_T2MDEW` column.
train, test = add_quintile_column_train_test(train, test, variable='T2MDEW', q=q)


=== QUANTILES DÉTERMINÉS SUR TRAIN pour T2MDEW ===
Q1: [-∞, 20.350[
Q2: [20.350, 21.520[
Q3: [21.520, 22.200[
Q4: [22.200, +∞]

NaN créés - Train: 0, Test: 0

Répartition TRAIN:
quintile_T2MDEW
Q1    897
Q2    906
Q3    887
Q4    889
Name: count, dtype: int64

Répartition TEST:
quintile_T2MDEW
Q1    365
Q2    400
Q3    379
Q4    391
Name: count, dtype: int64


# seuil 

### Restriction aux valeurs non aberrantes (Target ≤ 50)

In [17]:
# Keep only the training rows whose target does not exceed 50.
train  = train.query('Target<=50')

## Entraînement d'un modèle linéaire pour l'étude des VIF

In [18]:
def run_linear_regression_with_vif(df, features, target='Target', categorical_vars=None):
    """Fit an OLS regression and compute variance inflation factors (VIF).

    Parameters
    ----------
    df : pandas.DataFrame
    features : list of str
        Explanatory columns. Categorical columns may be listed here or only
        in `categorical_vars`; both work.
    target : str
        Dependent variable.
    categorical_vars : list of str, optional
        Columns expanded into dummy variables (first level dropped).

    Returns
    -------
    (summary, vif_data)
        `summary` is the statsmodels summary of the fitted OLS model;
        `vif_data` is a DataFrame with columns 'Variable' and 'VIF', one row
        per design-matrix column (the constant included).

    Rows with a missing value in any selected column are dropped first.
    """
    features = list(features)
    categorical_vars = list(categorical_vars) if categorical_vars else []

    # Select the categorical columns even when the caller listed them only
    # in `categorical_vars` (this used to raise KeyError in get_dummies).
    selected = features + [name for name in categorical_vars if name not in features]
    df_model = df[selected + [target]].dropna()

    if categorical_vars:
        # dtype=float: recent pandas emits boolean dummies, which statsmodels
        # cannot use directly in its design matrix.
        df_model = pd.get_dummies(df_model, columns=categorical_vars,
                                  drop_first=True, dtype=float)
        # Everything except the target is now a regressor: the untouched
        # numeric features first, then the dummies appended by get_dummies.
        # This replaces the former substring matching on "<var>_".
        features = [col for col in df_model.columns if col != target]

    X = df_model[features].astype(float)
    y = df_model[target]

    X_const = sm.add_constant(X)
    model = sm.OLS(y, X_const).fit()

    # One VIF per design-matrix column.
    vif_data = pd.DataFrame()
    vif_data['Variable'] = X_const.columns
    vif_data['VIF'] = [variance_inflation_factor(X_const.values, i)
                       for i in range(X_const.shape[1])]

    return model.summary(), vif_data


In [19]:


# Explanatory variables for the collinearity study. T2MDEW and T2MWET are
# commented out of the list.
variables = ['WS2M', 'T2M', 
             #'T2MDEW', 
             #'T2MWET',
             'HUMIDITY_SATURATION',
             'TEMP_PRESSURE_RATIO',
              'PRESSURE_TENDENCY',
             'RH2M', 'PS',
             'MONTH_SIN', 'MONTH_COS'] 


# Fit the OLS model on the filtered training set, then print the regression
# summary followed by the VIF table.
summary, vif = run_linear_regression_with_vif(train, features=variables, target='Target')
print(summary)
print("\nVIF :\n", vif)


                            OLS Regression Results                            
Dep. Variable:                 Target   R-squared:                       0.247
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     129.2
Date:                Fri, 01 Aug 2025   Prob (F-statistic):          6.93e-211
Time:                        10:57:15   Log-Likelihood:                -11038.
No. Observations:                3559   AIC:                         2.210e+04
Df Residuals:                    3549   BIC:                         2.216e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                7162.2123   1

In [20]:
# Qualitative predictors: every object/category column, plus SEASON.
qualitative_cols = [
    *train.select_dtypes(include=['object', 'category']).columns,
    'SEASON',
]
# Quantitative predictors: every numeric column except the day-of-month
# encodings, the target itself and SEASON (already counted as qualitative).
quantitative_cols = [
    name
    for name in train.select_dtypes(include=['number']).columns
    if name not in ('DAY_SIN', 'DAY_COS', 'Target', 'SEASON')
]
cols = qualitative_cols + quantitative_cols
target = 'Target'

### Étude multivariée

In [21]:
# Modelling frames restricted to the selected predictors.
# `.copy()` makes Train independent of `train`, so the SEASON cast below no
# longer assigns into a slice (SettingWithCopyWarning).
Train = train[cols + ['Target']].copy()
# NOTE: `test` is overwritten and re-indexed by ID, so this cell cannot be
# re-run without rebuilding `test` first.
test = test[cols + ['ID', 'DATE']].set_index('ID')
# SEASON is handled as a qualitative variable from here on.
Train['SEASON'] = Train['SEASON'].astype('str')
test['SEASON'] = test['SEASON'].astype('str')

In [22]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

def analyse_acp(df, variables=['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M', 'MONTH_SIN', 'MONTH_COS']):
    """Run a standardized PCA and plot the scree plot and correlation circle.

    Parameters
    ----------
    df : pandas.DataFrame
    variables : list of str
        Numeric columns entering the PCA (at least two). Rows with a missing
        value in any of them are dropped before fitting.

    Returns
    -------
    (fig, pca, scaler)
        `fig` is a two-panel plotly Figure (explained variance ratio per
        component; loadings of each variable on the first two components
        inside the unit circle). `pca` is the PCA fitted with all
        components and `scaler` the fitted StandardScaler, both reusable to
        project new data.

    Raises
    ------
    ValueError
        If fewer than two variables are given.
    """
    variables = list(variables)
    if len(variables) < 2:
        raise ValueError("analyse_acp needs at least two variables to draw the first factorial plane")

    df_acp = df[variables].dropna()

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_acp)

    pca = PCA()
    pca.fit(df_scaled)

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=['Éboulis des valeurs propres', 'Cercle des corrélations'],
                        specs=[[{"type": "scatter"}, {"type": "scatter"}]])

    # Scree plot.
    n_components = len(pca.explained_variance_ratio_)
    fig.add_trace(go.Bar(x=list(range(1, n_components + 1)),
                         y=pca.explained_variance_ratio_,
                         marker_color='blue', name='Valeurs propres'), row=1, col=1)

    # Correlation circle on components 1 and 2. The first two axes of the
    # full decomposition are reused instead of fitting a second
    # PCA(n_components=2): that refit was redundant and, depending on the
    # scikit-learn version, could pick the randomized solver without a seed.
    loadings = pca.components_[:2].T * np.sqrt(pca.explained_variance_[:2])
    ratio_cp1, ratio_cp2 = pca.explained_variance_ratio_[:2]

    for i, var in enumerate(variables):
        fig.add_trace(go.Scatter(x=[0, loadings[i, 0]], y=[0, loadings[i, 1]],
                                 mode='lines+text', line=dict(color='red', width=2),
                                 text=['', var], textposition='top center',
                                 showlegend=False), row=1, col=2)

    # Unit circle.
    theta = np.linspace(0, 2 * np.pi, 100)
    fig.add_trace(go.Scatter(x=np.cos(theta), y=np.sin(theta), mode='lines',
                             line=dict(color='gray', dash='dash'),
                             showlegend=False), row=1, col=2)

    fig.update_xaxes(title_text="Composantes", row=1, col=1)
    fig.update_yaxes(title_text="Variance expliquée", row=1, col=1)
    fig.update_xaxes(title_text=f'CP1 ({ratio_cp1:.1%})',
                     range=[-1.2, 1.2], row=1, col=2)
    # Lock the aspect ratio so the unit circle is drawn as a circle.
    fig.update_yaxes(title_text=f'CP2 ({ratio_cp2:.1%})',
                     range=[-1.2, 1.2], scaleanchor="x2", scaleratio=1, row=1, col=2)

    fig.update_layout(template='plotly_white', height=500, showlegend=False)

    return fig, pca, scaler

# PCA on nine meteorological / seasonal variables of the training set; the
# fitted `pca` and `scaler` are kept for projecting data in later cells.
fig,pca ,scaler= analyse_acp(train, variables=['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M', 'MONTH_SIN', 'MONTH_COS','T2MWET','T2MDEW'])

# Display the two-panel figure as the cell output.
fig

In [23]:
from sklearn.cluster import KMeans

def kmeans_elbow_acp(coords_acp, k_range=range(1, 11), k_final=4):
    """Show the K-means elbow curve, then cluster with the chosen K.

    Parameters
    ----------
    coords_acp : array-like
        Coordinates to cluster (one row per observation).
    k_range : iterable of int
        Numbers of clusters evaluated for the elbow curve.
    k_final : int
        Number of clusters of the final model.

    Returns
    -------
    (predictions, inertias, fig, kmeans_final)
        Cluster label per observation, inertia for each K of `k_range`, the
        elbow figure (already shown) and the fitted final KMeans model.
    """
    # Inertia of a fixed-seed K-means for every candidate K.
    inertias = [
        KMeans(n_clusters=n_clusters, random_state=42).fit(coords_acp).inertia_
        for n_clusters in k_range
    ]

    elbow_curve = go.Scatter(
        x=list(k_range), y=inertias, mode='lines+markers',
        line=dict(color='blue', width=2), marker=dict(size=8),
    )
    fig = go.Figure()
    fig.add_trace(elbow_curve)
    fig.update_layout(
        title='Méthode du Coude - K-means',
        xaxis_title='Nombre de clusters (K)',
        yaxis_title='Inertie',
        template='plotly_white',
        height=400,
    )
    fig.show()

    # Final model, refitted with the retained number of clusters.
    kmeans_final = KMeans(n_clusters=k_final, random_state=42)
    predictions = kmeans_final.fit_predict(coords_acp)

    print(f"Clustering avec K={k_final}:")
    for cluster, count in zip(*np.unique(predictions, return_counts=True)):
        print(f"Cluster {cluster}: {count} observations")

    return predictions, inertias, fig, kmeans_final

# Variables projected onto the PCA axes. Defined once for both frames; the
# list must be the one `scaler` and `pca` were fitted on.
acp_variables = ['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M', 'MONTH_SIN', 'MONTH_COS',
                 'T2MWET', 'T2MDEW']
# Number of leading principal components kept as clustering coordinates.
n_axes_kept = 3

train_pca = pca.transform(scaler.transform(train[acp_variables]))[:, :n_axes_kept]
test_pca = pca.transform(scaler.transform(test[acp_variables]))[:, :n_axes_kept]

# Clusters are learned on train only, then assigned to test with the same model.
predictions_train, inertias, fig, kmeans_final = kmeans_elbow_acp(train_pca, k_range=range(1, 11))
predictions_test = kmeans_final.predict(test_pca)

Clustering avec K=4:
Cluster 0: 1153 observations
Cluster 1: 1130 observations
Cluster 2: 759 observations
Cluster 3: 518 observations


In [24]:
# Store the K-means cluster label of each observation as a new `classe` column.
Train['classe'] = predictions_train
test['classe'] = predictions_test

## Interprétation de l'Analyse en Composantes Principales

**Éboulis des valeurs propres :** Le graphique de gauche révèle que la première composante principale capture 46,5% de la variance totale des données, ce qui est substantiel et indique qu'une dimension majeure de variation existe dans vos variables météorologiques. La deuxième composante explique environ 21% de la variance, portant le total cumulé à près de 68% pour les deux premières dimensions. Cette répartition suggère une structure bidimensionnelle assez claire dans vos données climatiques. L'éboulis montre une décroissance progressive mais régulière, avec un léger coude après la deuxième composante, confirmant que ces deux premières dimensions sont les plus informatives. Les composantes suivantes (3 à 7) contribuent chacune de manière plus modeste, entre 5% et 13%, suggérant que l'essentiel de l'information peut être résumé efficacement dans un espace à deux dimensions. Cette configuration est idéale pour une visualisation et une interprétation simplifiées des relations entre variables.

**Cercle des corrélations :** Le cercle des corrélations révèle des patterns intéressants dans les relations entre vos variables météorologiques. On observe que T2M (température) et MONTH_COS sont fortement corrélés et contribuent positivement à la première composante, suggérant un lien entre température et cyclicité mensuelle. WS2M (vitesse du vent) et PS (pression) apparaissent également corrélés et orientés vers la droite du cercle, indiquant leur association positive sur l'axe principal. QV2M (humidité spécifique) et MONTH_SIN semblent former un autre groupe de variables liées, positionnées dans la partie gauche du cercle. RH2M (humidité relative) se distingue par sa position opposée sur la deuxième composante, suggérant qu'elle capture une dimension de variation différente des autres variables d'humidité. La proximité des flèches au cercle unitaire indique que la plupart des variables sont bien représentées dans ce plan factoriel. Ces relations révèlent la structure sous-jacente de votre système météorologique, avec des groupes de variables évoluant de manière cohérente.

## Étude des liaisons entre variables qualitatives

In [25]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations

def etude_relations_qualitatives(df, variables=['quintile_QV2M', 'quintile_T2MDEW', 'quintile_T2MWET', 'SEASON']):
    """Measure pairwise association between categorical columns (chi-square + Cramér's V).

    Rows with a missing value in any of ``variables`` are dropped, then every
    unordered pair of variables is cross-tabulated and tested with
    ``chi2_contingency``. A two-panel Plotly figure is shown (heatmap of the
    Cramér's V matrix, bar chart of V per pair) and a ranked summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``variables``.
    variables : sequence of str
        Names of the categorical columns to compare.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``Var1``, ``Var2``, ``Chi2``, ``p_value``
        and ``Cramer_V`` (empty, but with these columns, when fewer than two
        variables are given).

    Raises
    ------
    ValueError
        If no complete row remains after dropping missing values.
    """
    variables = list(variables)  # accept any sequence; the default list is never mutated
    df_clean = df[variables].dropna()
    if df_clean.empty:
        raise ValueError(
            "No complete rows left after dropping missing values in: " + ", ".join(variables)
        )

    # A variable is perfectly associated with itself, so the diagonal is 1 by
    # definition; only the off-diagonal pairs need an actual test.
    cramer_matrix = np.eye(len(variables))
    results = []

    for (i, var1), (j, var2) in combinations(enumerate(variables), 2):
        contingency = pd.crosstab(df_clean[var1], df_clean[var2])
        chi2, p_value, _, _ = chi2_contingency(contingency)
        n = contingency.to_numpy().sum()
        min_dim = min(contingency.shape) - 1
        # Cramér's V is undefined when one variable has a single level
        # (min_dim == 0); report NaN instead of dividing by zero.
        cramer_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan
        cramer_matrix[i, j] = cramer_matrix[j, i] = cramer_v
        results.append({'Var1': var1, 'Var2': var2, 'Chi2': chi2, 'p_value': p_value, 'Cramer_V': cramer_v})

    # Explicit columns keep the frame usable even when there is no pair to report.
    results_df = pd.DataFrame(results, columns=['Var1', 'Var2', 'Chi2', 'p_value', 'Cramer_V'])

    fig = make_subplots(rows=1, cols=2, subplot_titles=['Matrice des Cramér V', 'Tests du Chi-2'])

    fig.add_trace(go.Heatmap(z=cramer_matrix, x=variables, y=variables, colorscale='Viridis', showscale=True), row=1, col=1)

    pair_labels = [f"{var1}<br>vs<br>{var2}" for var1, var2 in zip(results_df['Var1'], results_df['Var2'])]
    fig.add_trace(go.Bar(x=pair_labels, y=results_df['Cramer_V'], marker_color='steelblue'), row=1, col=2)

    fig.update_layout(height=500, template='plotly_white')
    fig.show()

    print("Résultats des tests:")
    for pair in results_df.sort_values('Cramer_V', ascending=False).itertuples(index=False):
        print(f"{pair.Var1} vs {pair.Var2}: Cramér V = {pair.Cramer_V:.3f}, p-value = {pair.p_value:.4f}")

    return results_df


# Pairwise association study on the selected categorical columns of Train;
# the returned summary frame is the cell's displayed output.
etude_relations_qualitatives(Train, variables=[#'quintile_QV2M', 
                                               'quintile_T2MDEW', 
                                               #'quintile_T2MWET',
                                               'classe',
                                               'weekday',
                                               'SEASON'])

Résultats des tests:
classe vs SEASON: Cramér V = 0.810, p-value = 0.0000
quintile_T2MDEW vs classe: Cramér V = 0.555, p-value = 0.0000
quintile_T2MDEW vs SEASON: Cramér V = 0.524, p-value = 0.0000
weekday vs SEASON: Cramér V = 0.035, p-value = 1.0000
classe vs weekday: Cramér V = 0.028, p-value = 0.9744
quintile_T2MDEW vs weekday: Cramér V = 0.026, p-value = 0.9878


Unnamed: 0,Var1,Var2,Chi2,p_value,Cramer_V
0,quintile_T2MDEW,classe,3289.521465,0.0,0.554984
1,quintile_T2MDEW,weekday,7.250913,0.987834,0.026056
2,quintile_T2MDEW,SEASON,2936.918858,0.0,0.524397
3,classe,weekday,8.266361,0.974402,0.027821
4,classe,SEASON,7002.750516,0.0,0.809746
5,weekday,SEASON,25.891237,0.999998,0.034816


In [26]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from scipy.stats import kruskal

def test_kruskal_wallis(df, variables=['quintile_QV2M', 'quintile_T2MDEW', 'quintile_T2MWET', 'SEASON'], target='Target'):
    """Run a Kruskal-Wallis test of ``target`` against each categorical variable.

    For every variable, rows with a missing value in that variable or in
    ``target`` are dropped, the target is split by level and compared with
    ``scipy.stats.kruskal``. One box-plot panel per variable is drawn in a
    two-column Plotly grid, and a summary sorted by p-value is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target`` and the columns in ``variables``.
    variables : sequence of str
        Categorical columns to test. Any number is supported; the subplot grid
        grows beyond 2x2 when more than four are given.
    target : str
        Name of the numeric column compared across groups.

    Returns
    -------
    pandas.DataFrame
        Columns ``Variable``, ``H_statistic``, ``p_value``, sorted by p-value.
        A variable with fewer than two observed levels gets NaN for both
        statistics instead of raising.
    """
    variables = list(variables)  # accept any sequence; the default list is never mutated

    # Two columns, at least two rows: identical to the previous fixed 2x2 layout
    # for up to four variables, and extended row-wise beyond that.
    n_cols = 2
    n_rows = max(2, -(-len(variables) // n_cols))
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=variables)

    results = []
    for i, var in enumerate(variables):
        df_clean = df[[var, target]].dropna()
        # Split the target once per level (in order of first appearance) and
        # reuse the samples for both the test and the box plots.
        samples = {
            level: df_clean.loc[df_clean[var] == level, target]
            for level in df_clean[var].unique()
        }

        if len(samples) >= 2:
            h_stat, p_value = kruskal(*(sample.values for sample in samples.values()))
        else:
            # kruskal() needs at least two groups; flag the variable as untestable.
            h_stat = p_value = float('nan')
        results.append({'Variable': var, 'H_statistic': h_stat, 'p_value': p_value})

        row, col = divmod(i, n_cols)
        for level, sample in samples.items():
            fig.add_trace(go.Box(y=sample, name=str(level), showlegend=False), row=row + 1, col=col + 1)

    fig.update_layout(height=300 * n_rows, template='plotly_white')
    fig.show()

    # Explicit columns keep sort_values working when ``variables`` is empty.
    results_df = pd.DataFrame(results, columns=['Variable', 'H_statistic', 'p_value']).sort_values('p_value')
    print("Tests de Kruskal-Wallis (Target vs variables qualitatives):")
    for _, row in results_df.iterrows():
        significance = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
        print(f"{row['Variable']}: H = {row['H_statistic']:.3f}, p-value = {row['p_value']:.4f} {significance}")

    return results_df


# Kruskal-Wallis test of Target against four categorical columns of Train;
# the returned summary frame is the cell's displayed output.
test_kruskal_wallis(Train, variables=[#'quintile_QV2M', 
                                      'quintile_T2MDEW', 
                                      #'quintile_T2MWET',
                                      'weekday',
                                      'classe',
                                      'SEASON'], target='Target')

Tests de Kruskal-Wallis (Target vs variables qualitatives):
quintile_T2MDEW: H = 1702.057, p-value = 0.0000 ***
classe: H = 1791.034, p-value = 0.0000 ***
SEASON: H = 1941.044, p-value = 0.0000 ***
weekday: H = 1.538, p-value = 0.9569 


Unnamed: 0,Variable,H_statistic,p_value
0,quintile_T2MDEW,1702.056512,0.0
2,classe,1791.033927,0.0
3,SEASON,1941.043531,0.0
1,weekday,1.537817,0.956943


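Les p-values < 0,05 indiquent des différences significatives de la cible entre les groupes : c'est le cas pour quintile_T2MDEW, classe et SEASON, mais pas pour weekday (p ≈ 0,957), qui n'apporte donc pas d'information sur la cible.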

## Modélisation

## train test split

In [27]:
def create_train_test_split(df, target_col='Target', test_size=0.2, random_state=42):
    """Separate the target column from the features and split both at random.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the target column.
    target_col : str
        Name of the column to predict; every other column is kept as a feature.
    test_size : float
        Passed through to ``train_test_split``.
    random_state : int
        Seed passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the shape of each part is printed.
    """
    target = df[target_col]
    features = df.drop(columns=[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Report the four shapes in the same order and wording as before.
    parts = (
        ("Train", X_train),
        ("Test", X_test),
        ("Train target", y_train),
        ("Test target", y_test),
    )
    for label, part in parts:
        print(f"{label} shape: {part.shape}")

    return X_train, X_test, y_train, y_test


# Row-to-row change in surface pressure (PS), following the current row order of each frame;
# the first row has no predecessor, so its NaN is replaced by 0.
# NOTE(review): this is only a meaningful "tendency" if consecutive rows are consecutive days
# of the same location — TODO confirm the ordering of Train and test.
# The feature lists in the modelling functions below currently keep PRESSURE_TENDENCY commented out.
Train['PRESSURE_TENDENCY'] = Train['PS'].diff().fillna(0)
test['PRESSURE_TENDENCY'] = test['PS'].diff().fillna(0)
# Hold out 10% of Train (overriding the function's 0.2 default) with a fixed seed.
X_train, X_test, y_train, y_test = create_train_test_split(Train, target_col='Target', test_size=0.10, random_state=42)

Train shape: (3204, 16)
Test shape: (356, 16)
Train target shape: (3204,)
Test target shape: (356,)


In [28]:
from lightgbm import LGBMRegressor
def compare_models(X_train, X_test, y_train, y_test):
    """Fit five baseline regressors with default-ish settings and compare their scores.

    Each model is wrapped in its own pipeline (standard scaling of the numeric
    columns + one-hot encoding of the categorical columns, then the regressor),
    fitted on the training split and scored on both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; they must contain the columns listed in
        ``categorical_vars`` and ``numerical_vars`` below.
    y_train, y_test : array-like
        Matching targets.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per model with R2, RMSE and MAE on the train and test splits,
        rounded to 4 decimals (also printed).
    models : dict
        Model name -> fitted ``Pipeline`` (preprocessing + regressor), so each
        entry can call ``.predict`` directly on a raw feature frame. The
        underlying regressor is available as ``pipeline.named_steps['regressor']``.
    """
    # Categorical and numerical inputs (quintile_QV2M / quintile_T2MWET and the
    # engineered HUMIDITY_SATURATION, TEMP_PRESSURE_RATIO, PRESSURE_TENDENCY
    # columns are currently left out).
    categorical_vars = ['quintile_T2MDEW', 'weekday', 'classe', 'SEASON']
    numerical_vars = ['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M', 'MONTH_SIN',
                      'T2MWET', 'T2MDEW',
                      'MONTH_COS']

    def build_preprocessor():
        # A fresh transformer per pipeline: the fitted pipelines are returned,
        # so they must not share (and silently refit) one preprocessing object.
        # handle_unknown='ignore' encodes a category absent from the training
        # split as all zeros instead of raising at predict time.
        return ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_vars),
                ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_vars)
            ]
        )

    # Candidate models
    regressors = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
    }

    results = []
    fitted_pipelines = {}

    for name, regressor in regressors.items():
        pipeline = Pipeline([
            ('preprocessor', build_preprocessor()),
            ('regressor', regressor)
        ])
        pipeline.fit(X_train, y_train)
        fitted_pipelines[name] = pipeline

        y_pred_train = pipeline.predict(X_train)
        y_pred_test = pipeline.predict(X_test)

        # Same metrics on both splits so over-fitting is visible side by side.
        results.append({
            'Model': name,
            'Train_R2': r2_score(y_train, y_pred_train),
            'Test_R2': r2_score(y_test, y_pred_test),
            'Train_RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'Test_RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'Train_MAE': mean_absolute_error(y_train, y_pred_train),
            'Test_MAE': mean_absolute_error(y_test, y_pred_test)
        })

    results_df = pd.DataFrame(results).round(4)

    print("=== COMPARAISON DES MODÈLES ===")
    print(results_df.to_string(index=False))

    return results_df, fitted_pipelines

# Baseline comparison on the hold-out split; keeps the score table and the returned models dict.
results_df,models = compare_models(X_train, X_test, y_train, y_test)

=== COMPARAISON DES MODÈLES ===
           Model  Train_R2  Test_R2  Train_RMSE  Test_RMSE  Train_MAE  Test_MAE
LinearRegression    0.2887   0.3017      5.2366     5.0895     3.1448    3.0458
           Ridge    0.2878   0.3032      5.2399     5.0841     3.1398    3.0344
    RandomForest    0.8929   0.3232      2.0319     5.0105     1.1528    2.9525
GradientBoosting    0.4984   0.3890      4.3975     4.7606     2.5837    2.7765
         XGBoost    0.9706   0.0883      1.0640     5.8154     0.6805    3.3099


In [29]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

def visualize_feature_importance(X_train, X_test, y_train, y_test):
    """Plot, for each baseline model, its 15 most important features.

    Every model is fitted on the training split inside a pipeline (standard
    scaling + one-hot encoding). Importance is ``feature_importances_`` when the
    regressor exposes it, otherwise the absolute value of ``coef_``.
    ``X_test`` and ``y_test`` are accepted but not used.

    Returns the Plotly figure (one horizontal bar chart per model in a 2x3 grid)
    without showing it.
    """
    # Categorical and numerical inputs fed to the preprocessing step.
    categorical_vars = ['classe', 'weekday', 'SEASON']
    numerical_vars = ['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M', 'MONTH_SIN',
                      'T2MWET', 'T2MDEW',
                      'MONTH_COS']

    scaler_step = ('num', StandardScaler(), numerical_vars)
    encoder_step = ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_vars)
    preprocessor = ColumnTransformer(transformers=[scaler_step, encoder_step])

    # Models to inspect
    models = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
    }

    # One panel per model, filled row by row.
    fig = make_subplots(rows=2, cols=3, subplot_titles=list(models.keys()))
    positions = [(1,1), (1,2), (1,3), (2,1), (2,2)]

    for (row, col), (name, model) in zip(positions, models.items()):
        pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', model)])
        pipeline.fit(X_train, y_train)

        # Column names after preprocessing: numeric columns first, then the
        # dummy columns produced by the fitted encoder.
        fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
        encoded_names = list(fitted_encoder.get_feature_names_out(categorical_vars))
        feature_names = numerical_vars + encoded_names

        # Tree ensembles expose feature_importances_; linear models only have coefficients.
        if hasattr(model, 'feature_importances_'):
            importance = model.feature_importances_
        else:
            importance = np.abs(model.coef_)

        # Keep the 15 largest values, in decreasing order.
        indices = np.argsort(importance)[::-1][:15]
        top_features = [feature_names[idx] for idx in indices]
        top_importance = importance[indices]

        bars = go.Bar(x=top_importance, y=top_features, orientation='h',
                      marker_color='steelblue', showlegend=False)
        fig.add_trace(bars, row=row, col=col)

    fig.update_layout(height=800, template='plotly_white',
                     title_text="Importance des Variables par Modèle")

    # Label the x axis of all six grid cells, including the empty sixth one.
    for axis_row in (1, 2):
        for axis_col in (1, 2, 3):
            fig.update_xaxes(title_text="Importance", row=axis_row, col=axis_col)

    return fig



# The function returns the figure without calling show(); as the cell's last expression it is rendered here.
visualize_feature_importance(X_train, X_test, y_train, y_test)

### Fine-tuning des modèles

In [30]:

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,RobustScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def compare_finetuned_models(X_train, X_test, y_train, y_test):
    """Grid-search five regressors, score them and plot their feature importances.

    Each model is wrapped in a pipeline (robust scaling of the numeric columns +
    one-hot encoding of the categorical columns, then the regressor). Models
    with a non-empty grid are tuned with 5-fold ``GridSearchCV`` (scoring
    ``'r2'``) on the training split; LinearRegression is fitted as is. The best
    pipeline of each model is scored on both splits.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; they must contain the columns listed in
        ``categorical_vars`` and ``numerical_vars`` below.
    y_train, y_test : array-like
        Matching targets.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per model with R2, RMSE and MAE on the train and test splits,
        rounded to 4 decimals (also printed).
    feature_importances : dict
        Model name -> DataFrame (``Feature``, ``Importance``) holding at most
        the 20 largest importances, in decreasing order.
    trained_models : dict
        Model name -> fitted pipeline (preprocessing + tuned regressor).
    """
    # Categorical and numerical inputs (the quintile_* columns, weekday,
    # T2MWET, T2MDEW and the engineered HUMIDITY_SATURATION,
    # TEMP_PRESSURE_RATIO, PRESSURE_TENDENCY columns are currently left out).
    categorical_vars = ['classe', 'SEASON']
    numerical_vars = ['WS2M', 'T2M', 'RH2M', 'PS', 'QV2M',
                      'MONTH_SIN',
                      'MONTH_COS']

    def build_preprocessor():
        # A fresh transformer per pipeline so the stored pipelines never share
        # one fitted preprocessing object.
        # handle_unknown='ignore' encodes a category unseen during fit as all
        # zeros instead of raising; without it a CV fold or the competition
        # test set containing a new level makes predict() fail.
        return ColumnTransformer(
            transformers=[
                ('num', RobustScaler(), numerical_vars),
                ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_vars)
            ]
        )

    # Models with their hyper-parameter grids (keys target the 'regressor' step)
    models_params = {
        'LinearRegression': (LinearRegression(), {}),
        'Ridge': (Ridge(), {'regressor__alpha': [0.1, 1.0, 10.0, 100.0]}),
        'RandomForest': (RandomForestRegressor(random_state=42), {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__max_depth': [5, 10, None],
            'regressor__min_samples_split': [2, 5]
        }),
        'GradientBoosting': (GradientBoostingRegressor(random_state=42), {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 7]
        }),
        'XGBoost': (XGBRegressor(random_state=42), {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 7]
        })
    }

    results = []
    feature_importances = {}
    trained_models = {}

    # One importance panel per model, filled row by row in a 2x3 grid.
    n_cols = 3
    fig = make_subplots(rows=2, cols=n_cols, subplot_titles=list(models_params.keys()))
    positions = [(i // n_cols + 1, i % n_cols + 1) for i in range(len(models_params))]

    for (row, col), (name, (model, param_grid)) in zip(positions, models_params.items()):
        print(f"Fine-tuning {name}...")

        pipeline = Pipeline([
            ('preprocessor', build_preprocessor()),
            ('regressor', model)
        ])

        if param_grid:
            grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
            print(f"Meilleurs paramètres: {grid_search.best_params_}")
        else:
            # Nothing to tune (LinearRegression): fit the pipeline directly.
            best_pipeline = pipeline
            best_pipeline.fit(X_train, y_train)

        trained_models[name] = best_pipeline

        y_pred_train = best_pipeline.predict(X_train)
        y_pred_test = best_pipeline.predict(X_test)

        # Column names after preprocessing: numeric columns first, then the
        # dummy columns produced by the fitted encoder.
        feature_names = (numerical_vars +
                        list(best_pipeline.named_steps['preprocessor']
                             .named_transformers_['cat']
                             .get_feature_names_out(categorical_vars)))

        # Tree ensembles expose feature_importances_; linear models only have coefficients.
        regressor = best_pipeline.named_steps['regressor']
        if hasattr(regressor, 'feature_importances_'):
            importance = regressor.feature_importances_
        else:
            importance = np.abs(regressor.coef_)

        feature_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importance
        }).sort_values('Importance', ascending=False).head(20)

        feature_importances[name] = feature_df

        fig.add_trace(
            go.Bar(x=feature_df['Importance'], y=feature_df['Feature'], orientation='h',
                   marker_color='steelblue', showlegend=False),
            row=row, col=col
        )

        # Same metrics on both splits so over-fitting is visible side by side.
        results.append({
            'Model': name,
            'Train_R2': r2_score(y_train, y_pred_train),
            'Test_R2': r2_score(y_test, y_pred_test),
            'Train_RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'Test_RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'Train_MAE': mean_absolute_error(y_train, y_pred_train),
            'Test_MAE': mean_absolute_error(y_test, y_pred_test)
        })

    results_df = pd.DataFrame(results).round(4)

    fig.update_layout(height=800, template='plotly_white',
                     title_text="Importance des Variables - Modèles Fine-tunés")

    for row, col in positions:
        fig.update_xaxes(title_text="Importance", row=row, col=col)

    fig.show()

    print("\n=== MODÈLES FINE-TUNÉS ===")
    print(results_df.to_string(index=False))

    return results_df, feature_importances, trained_models

# Tune every model on the training split; note that results_df from the baseline comparison is overwritten here.
results_df, feature_importances, trained_models = compare_finetuned_models(X_train, X_test, y_train, y_test)



Fine-tuning LinearRegression...
Fine-tuning Ridge...
Meilleurs paramètres: {'regressor__alpha': 10.0}
Fine-tuning RandomForest...
Meilleurs paramètres: {'regressor__max_depth': 5, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Fine-tuning GradientBoosting...
Meilleurs paramètres: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 50}
Fine-tuning XGBoost...
Meilleurs paramètres: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 50}



=== MODÈLES FINE-TUNÉS ===
           Model  Train_R2  Test_R2  Train_RMSE  Test_RMSE  Train_MAE  Test_MAE
LinearRegression    0.2552   0.2746      5.3582     5.1873     3.3060    3.1457
           Ridge    0.2551   0.2740      5.3586     5.1893     3.3036    3.1444
    RandomForest    0.3926   0.3491      4.8387     4.9135     2.7772    2.8351
GradientBoosting    0.4208   0.3801      4.7252     4.7954     2.7325    2.8085
         XGBoost    0.4037   0.3398      4.7946     4.9486     2.7524    2.8418


In [31]:
# Importance table of the tuned GradientBoosting pipeline, largest first.
feature_importances['GradientBoosting']

Unnamed: 0,Feature,Importance
4,QV2M,0.398825
2,RH2M,0.272765
0,WS2M,0.096961
6,MONTH_COS,0.064537
3,PS,0.046274
1,T2M,0.043792
7,classe_1,0.029845
5,MONTH_SIN,0.023949
11,SEASON_11,0.01169
17,SEASON_6,0.006177


Le modèle offrant les meilleures performances sur le jeu de test est le GradientBoosting (R² = 0,3801 ; RMSE = 4,7954).

In [32]:
# Display the tuned XGBoost pipeline stored by compare_finetuned_models.
# NOTE(review): the submission below uses 'GradientBoosting', not this model — confirm this cell is still needed.
trained_models['XGBoost']

## Prédiction

In [33]:
def make_predictions(model, df):
    """Predict the target for every row of ``df`` and return it as a one-column frame.

    Parameters
    ----------
    model : str or fitted estimator
        Either a key of the notebook-level ``trained_models`` dict (e.g.
        ``'GradientBoosting'``) or any object exposing ``predict``, which is
        then used directly.
    df : pandas.DataFrame
        Rows to predict; passed unchanged to ``predict``.

    Returns
    -------
    pandas.DataFrame
        A single ``Target`` column holding the predictions, indexed like ``df``.

    Raises
    ------
    KeyError
        If ``model`` is a name that is not present in ``trained_models``.
    """
    if hasattr(model, 'predict'):
        # Already a fitted estimator/pipeline: no lookup in the global registry.
        estimator = model
    else:
        try:
            estimator = trained_models[model]
        except KeyError:
            raise KeyError(
                f"Unknown model {model!r}; available models: {sorted(trained_models)}"
            ) from None

    predictions = estimator.predict(df)
    return pd.DataFrame({'Target': predictions}, index=df.index)

In [34]:
# Predict the competition test set with the tuned GradientBoosting pipeline and display the result.
submi  = make_predictions('GradientBoosting', test)
submi

Unnamed: 0_level_0,Target
ID,Unnamed: 1_level_1
ID_OdrVnE_20110615,0.284823
ID_MdWBtG_20230910,1.607638
ID_YQ3N92_20180517,2.521405
ID_73qWlF_20130902,1.167793
ID_3u9JmK_20210212,7.069773
...,...
ID_VjdphP_20160605,0.803074
ID_5PkcqO_20200608,4.531418
ID_hGQUYi_20130111,6.169665
ID_6qf5LE_20220522,9.490794


In [35]:
# Quick sanity check: average predicted value over the submission.
submi['Target'].mean()

4.3081461725063335

In [36]:
# Write the submission file; index=True keeps the frame's index (the row IDs) as the first CSV column.
submi.to_csv("SampleSubmission2.csv", index=True)

## Visualisation des prédictions

In [37]:

def visualize_predictions_vs_time(trained_models, df, date_col='DATE'):
    """Plot each model's predictions on ``df`` against the date column.

    A copy of ``df`` is made, ``date_col`` is converted with ``pd.to_datetime``
    and the rows are sorted by it before predicting, so the caller's frame is
    left untouched. One subplot per model is laid out in a grid of up to three
    columns.

    Parameters
    ----------
    trained_models : dict
        Model name -> fitted object whose ``predict`` accepts the sorted frame.
    df : pandas.DataFrame
        Rows to predict; must contain ``date_col``.
    date_col : str
        Name of the date column used for the x axis.

    Returns
    -------
    The Plotly figure (not shown by this function).
    """
    frame = df.copy()
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame = frame.sort_values(date_col)

    # Grid size: at most three columns, as many rows as needed (ceil division).
    model_count = len(trained_models)
    cols = min(3, model_count)
    rows = (model_count + cols - 1) // cols
    positions = [(idx // cols + 1, idx % cols + 1) for idx in range(model_count)]

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=list(trained_models.keys())
    )

    # Line colours are reused cyclically when there are more than five models.
    colors = ['blue', 'red', 'green', 'orange', 'purple']

    for idx, (model_name, model) in enumerate(trained_models.items()):
        row, col = positions[idx]
        trace = go.Scatter(
            x=frame[date_col],
            y=model.predict(frame),
            mode='lines+markers',
            name=model_name,
            line=dict(color=colors[idx % len(colors)]),
            marker=dict(size=3),
            showlegend=False
        )
        fig.add_trace(trace, row=row, col=col)

    fig.update_layout(
        title='Évolution des Prédictions par Modèle dans le Temps',
        template='plotly_white',
        height=300 * rows
    )

    # Same axis titles on every populated panel.
    for row, col in positions:
        fig.update_xaxes(title_text="Date", row=row, col=col)
        fig.update_yaxes(title_text="Prédictions", row=row, col=col)

    return fig

# Predictions of every tuned pipeline on the competition test set, ordered by DATE; the returned figure is rendered as the cell output.
visualize_predictions_vs_time(trained_models, test, date_col='DATE')