In [98]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import pandas as pd
import ast
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [99]:
# Paths to the train/test sets
# path_train = "~/hfactory_magic_folders/water_shortage_prediction/X_train_Hi5.csv"
# path_test = "~/hfactory_magic_folders/water_shortage_prediction/X_test_Hi5.csv"

# Import data

In [None]:
train_data = pd.read_csv('X_train_Hi5.csv', low_memory=False, index_col='row_index')

# Preprocess

## Features

### Piezo qualification

We only keep the rows where the measure is qualified as "Correct" or not yet qualified:

In [282]:
train_data['piezo_qualification'].value_counts()

piezo_qualification
Correcte        2718869
Non qualifié      93199
Incorrecte         9462
Incertaine         8786
Name: count, dtype: int64

In [447]:
train_data_filtered = train_data[train_data['piezo_qualification'].isin(['Correcte', 'Non qualifié'])].drop(columns=['piezo_qualification'])

### Irrelevant columns

In [487]:
columns_to_drop = [
    'piezo_station_department_code',
    'piezo_station_update_date',
    'piezo_station_department_name',
    'piezo_station_commune_code_insee',
    'piezo_station_pe_label',
    'piezo_station_bss_code',
    'piezo_station_commune_name',
    'piezo_station_bss_id',
    'piezo_bss_code',
    'piezo_obtention_mode',
    'piezo_continuity_code',
    'piezo_continuity_name',
    'piezo_producer_code',
    'piezo_producer_name',
    'piezo_measure_nature_code',
    'meteo_id',
    'meteo_name',
    'meteo_date',
    'meteo_DRR',
    'meteo_temperature_avg_threshold',
    'meteo_temperature_min_50cm',
    'meteo_pressure_avg',
    'meteo_pression_maxi',
    'meteo_wind_speed_avg_2m',
    'meteo_wind_max_2m',
    'meteo_wind_direction_max_inst_2m',
    'meteo_time_wind_max_2m',
    'meteo_wetting_duration',
    'meteo_sunshine_duration',
    'meteo_longitude',
    'meteo_latitude',
    'meteo_radiation',
    'meteo_radiation_direct',
    'meteo_sunshine_%',
    'meteo_radiation_IR',
    'meteo_radiation_UV_max',
    'meteo_cloudiness',
    'meteo_cloudiness_height',
    'meteo_if_snow',
    'meteo_if_fog',
    'meteo_if_thunderstorm',
    'meteo_if_sleet',
    'meteo_if_hail',
    'meteo_if_dew',
    'meteo_if_black_ice',
    'meteo_if_snow_ground',
    'meteo_if_frost',
    'meteo_if_smoke',
    'meteo_if_mist',
    'meteo_if_lightning',
    'meteo_evapotranspiration_Monteith',
    'meteo_radiation_UV',
    'meteo_snow_height',
    'meteo_snow_thickness_max',
    'meteo_snow_thickness_6h',
    'hydro_station_code',
    'hydro_observation_date_elab',
    'hydro_status_label',
    'hydro_method_label',
    'hydro_qualification_label',
    'hydro_longitude',
    'hydro_latitude',
    'hydro_hydro_quantity_elab',
    'prelev_structure_code_0',
    'prelev_volume_obtention_mode_label_0',
    'prelev_commune_code_insee_0',
    'prelev_structure_code_1',
    'prelev_volume_obtention_mode_label_1',
    'prelev_commune_code_insee_1',
    'prelev_structure_code_2',
    'prelev_volume_obtention_mode_label_2',
    'prelev_commune_code_insee_2'
]

In [None]:
train_data_filtered = train_data_filtered.drop(columns=columns_to_drop)
# train_data_filtered = train_data.drop(columns=columns_to_drop) ### FOR TEST

### Date

For the date, we apply cyclical encoding:

In [387]:
train_data_filtered['piezo_measurement_date']

row_index
0          2020-01-01
1          2020-01-01
2          2020-01-01
3          2020-01-01
4          2020-01-01
              ...    
3294080    2023-05-31
3294081    2023-05-31
3294082    2023-05-31
3294083    2023-05-31
3294084    2023-05-31
Name: piezo_measurement_date, Length: 2812068, dtype: object

In [489]:
train_data_filtered['piezo_measurement_date'] = pd.to_datetime(train_data_filtered['piezo_measurement_date'])
train_data_filtered['day_of_year'] = train_data_filtered['piezo_measurement_date'].dt.dayofyear
train_data_filtered['day_sin'] = np.sin(2 * np.pi * train_data_filtered['day_of_year'] / 365)
train_data_filtered['day_cos'] = np.cos(2 * np.pi * train_data_filtered['day_of_year'] / 365)

train_data_filtered['month'] = train_data_filtered['piezo_measurement_date'].dt.month
train_data_filtered['year'] = train_data_filtered['piezo_measurement_date'].dt.year

In [490]:
train_data_filtered = train_data_filtered[train_data_filtered['piezo_measurement_date'].dt.month.between(3, 10)]

In [491]:
train_data_filtered = train_data_filtered.drop(columns=['piezo_measurement_date', 'day_of_year'])

### BDLISA code

In [492]:
eh_code_prefix_to_category = {
    '101': 'Sands', '104': 'Sands', '107': 'Limestones', '110': 'Marls', 
    '113': 'Limestones', '117': 'Other', '119': 'Other', '121': 'Other', 
    '123': 'Marls', '125': 'Marls', '127': 'Sands', '131': 'Limestones', 
    '133': 'Marls', '135': 'Limestones', '137': 'Marls', '139': 'Limestones', 
    '141': 'Marls', '143': 'Gravel', '144': 'Sands', '145': 'Other', 
    '147': 'Gravel', '149': 'Limestones', '151': 'Massif', '153': 'Massif', 
    '154': 'Massif', '155': 'Massif', '156': 'Massif', '158': 'Massif', 
    '159': 'Massif', '160': 'Massif', '161': 'Sands', '163': 'Massif', 
    '164': 'Other', '165': 'Other', '167': 'Other', '169': 'Massif', 
    '170': 'Massif', '171': 'Massif', '173': 'Other', '174': 'Basement', 
    '175': 'Basement', '177': 'Basement', '179': 'Basement', '181': 'Basement', 
    '183': 'Basement', '185': 'Basement', '186': 'Basement', '187': 'Basement', 
    '189': 'Other', '191': 'Other', '193': 'Other', '195': 'Other', 
    '197': 'Other', '199': 'Other', '201': 'Basement', '203': 'Basement', 
    '205': 'Basement', '206': 'Basement', '207': 'Basement', '208': 'Basement', 
    '211': 'Other', '221': 'Alluvions', '222': 'Alluvions', '223': 'Other', 
    '225': 'Paleogeological Epochs', '226': 'Gravel', '227': 'Marls', 
    '230': 'Gravel', '231': 'Basement', '233': 'Volcanic', '306': 'Gravel', 
    '308': 'Sands', '312': 'Sands', '316': 'Sands', '318': 'Paleogeological Epochs', 
    '320': 'Limestones', '322': 'Paleogeological Epochs', '324': 'Limestones', 
    '326': 'Marls', '328': 'Marls', '330': 'Marls', '332': 'Massif', 
    '334': 'Gravel', '338': 'Other', '340': 'Marls', '342': 'Other', 
    '344': 'Limestones', '346': 'Marls', '348': 'Limestones', '350': 'Gravel', 
    '352': 'Limestones', '356': 'Marls', '358': 'Limestones', '359': 'Limestones', 
    '360': 'Marls', '362': 'Limestones', '364': 'Geological Epochs', '366': 'Gravel', 
    '368': 'Gravel', '370': 'Basement', '372': 'Basement', '374': 'Basement', 
    '400': 'Limestones', '402': 'Limestones', '404': 'Massif', '502': 'Other', 
    '505': 'Other', '507': 'Other', '509': 'Paleogeological Epochs', '513': 'Limestones', 
    '515': 'Limestones', '516': 'Other', '517': 'Limestones', '519': 'Marls', 
    '521': 'Other', '523': 'Limestones', '525': 'Other', '527': 'Basement', 
    '529': 'Paleogeological Epochs', '531': 'Limestones', '533': 'Limestones', 
    '534': 'Limestones', '548': 'Marls', '561': 'Other', '563': 'Other', 
    '565': 'Limestones', '567': 'Massif', '569': 'Limestones', '571': 'Limestones', 
    '573': 'Geological Epochs', '577': 'Marls', '581': 'Massif', '583': 'Marls', 
    '585': 'Marls', '600': 'Paleogeological Epochs', '602': 'Basement', 
    '604': 'Basement', '621': 'Limestones', '631': 'Limestones', '643': 'Marls', 
    '647': 'Alluvions', '651': 'Limestones', '657': 'Other', '671': 'Alluvions', 
    '679': 'Marls', '681': 'Marls', '691': 'Basement', '693': 'Basement', 
    '699': 'Basement', '710': 'Alluvions', '712': 'Alluvions', '714': 'Alluvions', 
    '716': 'Alluvions', '718': 'Alluvions', '719': 'Alluvions', '720': 'Alluvions', 
    '901': 'Basement', '902': 'Sands', '910': 'Alluvions', '912': 'Alluvions', 
    '914': 'Alluvions', '916': 'Alluvions', '918': 'Alluvions', '920': 'Alluvions', 
    '922': 'Alluvions', '924': 'Alluvions', '926': 'Alluvions', '928': 'Alluvions', 
    '930': 'Alluvions', '932': 'Alluvions', '935': 'Alluvions', '936': 'Alluvions', 
    '937': 'Alluvions', '938': 'Alluvions', '940': 'Alluvions', '942': 'Alluvions', 
    '944': 'Alluvions', '946': 'Alluvions', '948': 'Alluvions', '949': 'Alluvions', 
    '950': 'Alluvions', '952': 'Alluvions', '954': 'Alluvions', '971': 'Volcanic', 
    '972': 'Volcanic', '974': 'Volcanic', '976': 'Volcanic'
}

In [493]:
def map_code_to_category(code):
    prefix = str(code)[:3]
    return eh_code_prefix_to_category.get(prefix, 'Other')

In [494]:
def map_codes_in_list(codes_list):
    return [map_code_to_category(code) for code in codes_list]

In [495]:
def safe_eval(value):
    if isinstance(value, str):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
    else:
        return []

In [496]:
train_data_filtered['Mapped_Category'] = train_data_filtered['piezo_station_bdlisa_codes'].apply(lambda x: map_codes_in_list(safe_eval(x)))

In [497]:
mlb = MultiLabelBinarizer()
train_data_filtered[mlb.classes_] = mlb.fit_transform(train_data_filtered['Mapped_Category'])
train_data_filtered

Unnamed: 0_level_0,piezo_station_investigation_depth,piezo_station_bdlisa_codes,piezo_station_altitude,piezo_station_longitude,piezo_station_latitude,piezo_status,piezo_qualification,piezo_measure_nature_name,meteo_altitude,meteo_rain_height,...,Alluvions,Basement,Geological Epochs,Gravel,Limestones,Marls,Massif,Other,Paleogeological Epochs,Sands
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2331795,20.00,['712AH37'],232.00,5.748241,45.828148,Donnée contrôlée niveau 2,Correcte,Naturel,330,7.4,...,1,0,0,0,0,0,0,0,0,0
2331796,35.60,['712GB05'],247.25,5.356637,46.028102,Donnée contrôlée niveau 2,Correcte,Naturel,250,3.4,...,1,0,0,0,0,0,0,0,0,0
2331797,35.22,['040AJ43'],218.77,5.220795,45.895734,Donnée contrôlée niveau 2,Correcte,Naturel,196,,...,0,0,0,0,0,0,0,1,0,0
2331798,34.20,"['516AA00', '516AF00']",499.85,5.948977,46.201180,Donnée contrôlée niveau 2,Correcte,Naturel,1133,12.8,...,0,0,0,0,0,0,0,1,0,0
2331799,37.30,['507AB00'],260.00,5.313353,46.136402,Donnée contrôlée niveau 2,Correcte,Naturel,260,2.2,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3610818,24.00,['121AZ01'],63.00,1.711584,49.197517,Donnée contrôlée niveau 2,Correcte,Naturel,105,0.0,...,0,0,0,0,0,0,0,1,0,0
3610819,25.50,['113AQ27'],103.00,1.901276,49.145695,Donnée contrôlée niveau 2,Correcte,Naturel,105,0.0,...,0,0,0,0,1,0,0,0,0,0
3610820,30.00,['121AP01'],83.00,1.897576,49.083846,Donnée contrôlée niveau 2,Correcte,Naturel,105,0.0,...,0,0,0,0,0,0,0,1,0,0
3610821,630.00,['127AA99'],24.00,2.209806,49.105985,Donnée contrôlée niveau 2,Correcte,Naturel,171,0.2,...,0,0,0,0,0,0,0,0,0,1


In [498]:
train_data_filtered = train_data_filtered.drop(columns=['piezo_station_bdlisa_codes', 'Mapped_Category'])

In [499]:
piezo_status_to_id = {
    'Donnée brute': 0,
    'Donnée contrôlée niveau 1': 1,
    'Donnée contrôlée niveau 2': 2,
    'Donnée interprétée': 3
}

In [500]:
piezo_measure_nature_name_to_id = {
    'Inconnue': np.nan,
    'Naturel': 1,
    'Influencé': 2,
    'Dynamique': 3,
    'Sec': 4
}

In [501]:
train_data_filtered['piezo_status'] = train_data_filtered['piezo_status'].map(piezo_status_to_id)
train_data_filtered['piezo_measure_nature_name'] = train_data_filtered['piezo_measure_nature_name'].map(piezo_measure_nature_name_to_id)

## Distance computation

In [502]:
prelev_columns = [col for col in train_data_filtered.columns if col.startswith("prelev_")]
X_prelev = train_data_filtered[[col for col in train_data_filtered.columns if col.startswith("prelev_")]]
train_data_filtered[prelev_columns].head()

Unnamed: 0_level_0,prelev_volume_0,prelev_usage_label_0,prelev_longitude_0,prelev_latitude_0,prelev_volume_1,prelev_usage_label_1,prelev_longitude_1,prelev_latitude_1,prelev_volume_2,prelev_usage_label_2,prelev_longitude_2,prelev_latitude_2,prelev_other_volume_sum
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2331795,,,,,,,,,,,,,0.0
2331796,,,,,,,,,,,,,0.0
2331797,,,,,,,,,,,,,0.0
2331798,,,,,,,,,,,,,0.0
2331799,,,,,,,,,,,,,0.0


In [503]:
numeric_columns_prelev = X_prelev.dtypes[((X_prelev.dtypes=="float64"))|((X_prelev.dtypes=="int64"))].index.values.tolist()
cat_columns = X_prelev.dtypes[((X_prelev.dtypes=="object"))].index.values.tolist()

In [504]:
X_prelev[cat_columns].head()

Unnamed: 0_level_0,prelev_usage_label_0,prelev_usage_label_1,prelev_usage_label_2
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2331795,,,
2331796,,,
2331797,,,
2331798,,,
2331799,,,


In [505]:
X_prelev_cl = X_prelev[numeric_columns_prelev]
X_prelev_cl["prelev_usage_label_0"] = X_prelev["prelev_usage_label_0"]
X_prelev_cl["prelev_usage_label_1"] = X_prelev["prelev_usage_label_1"]
X_prelev_cl["prelev_usage_label_2"] = X_prelev["prelev_usage_label_2"]
X_prelev_cl["prelev_other_volume_sum"] = X_prelev["prelev_other_volume_sum"]


X_prelev_cl["distance_0"] = (train_data_filtered["piezo_station_longitude"] - train_data_filtered["prelev_longitude_0"])**2 + (train_data_filtered["piezo_station_latitude"] - train_data_filtered["prelev_latitude_0"])**2

X_prelev_cl["distance_1"] = (train_data_filtered["piezo_station_longitude"] - train_data_filtered["prelev_longitude_1"])**2 + (train_data_filtered["piezo_station_latitude"] - train_data_filtered["prelev_latitude_1"])**2
X_prelev_cl["distance_2"] = (train_data_filtered["piezo_station_longitude"] - train_data_filtered["prelev_longitude_2"])**2 + (train_data_filtered["piezo_station_latitude"] - train_data_filtered["prelev_latitude_2"])**2
X_prelev_cl = X_prelev_cl.drop(columns=["prelev_longitude_0","prelev_longitude_1","prelev_longitude_2","prelev_latitude_0","prelev_latitude_1","prelev_latitude_2"])
X_prelev_cl["prelev_all_volume_sum"] = train_data_filtered["prelev_volume_0"] + train_data_filtered["prelev_volume_1"] + train_data_filtered["prelev_volume_2"] + train_data_filtered["prelev_other_volume_sum"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_prelev_cl["prelev_usage_label_0"] = X_prelev["prelev_usage_label_0"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_prelev_cl["prelev_usage_label_1"] = X_prelev["prelev_usage_label_1"]


In [506]:
X_prelev_cl.head()

Unnamed: 0_level_0,prelev_volume_0,prelev_volume_1,prelev_volume_2,prelev_other_volume_sum,prelev_usage_label_0,prelev_usage_label_1,prelev_usage_label_2,distance_0,distance_1,distance_2,prelev_all_volume_sum
row_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2331795,,,,0.0,,,,,,,
2331796,,,,0.0,,,,,,,
2331797,,,,0.0,,,,,,,
2331798,,,,0.0,,,,,,,
2331799,,,,0.0,,,,,,,


In [507]:
train_data_filtered['prelev_usage_label_0'].value_counts()

prelev_usage_label_0
EAU POTABLE                                                           114453
EAU TURBINEE (barrage)                                                 31574
INDUSTRIE et ACTIVITES ECONOMIQUES (hors irrigation, hors énergie)     31230
IRRIGATION                                                             27497
CANAUX                                                                 16286
ENERGIE                                                                10087
Name: count, dtype: int64

In [508]:
prelev_columns = [col for col in X_prelev_cl.columns if col.startswith("prelev_")]

train_data_filtered = train_data_filtered.drop(columns=[col for col in train_data_filtered.columns if col.startswith("prelev_")])

train_data_filtered[prelev_columns] = X_prelev_cl[prelev_columns]

### Handle Insee data

In [509]:
insee_cols_modif = [
    'insee_%_agri',
    'insee_med_living_level',
    'insee_%_ind',
    'insee_%_const'
]

In [510]:
train_data_filtered[insee_cols_modif] = train_data_filtered[insee_cols_modif].apply(pd.to_numeric, errors='coerce')

In [511]:
deviation_data = {
    2020: {'temperature': 1.15, 'precipitation': 0.97},
    2021: {'temperature': -0.01, 'precipitation': 0.99},
    2022: {'temperature': 1.6, 'precipitation': 0.76},
    2023: {'temperature': 1.4, 'precipitation': 1.035},
}

train_data_filtered['mean_annual_temperature_deviation'] = train_data_filtered['year'].map(
    lambda year: deviation_data[year]['temperature'] if year in deviation_data else None
)
train_data_filtered['mean_annual_precipitation_deviation'] = train_data_filtered['year'].map(
    lambda year: deviation_data[year]['precipitation'] if year in deviation_data else None
)

## Split data

In [373]:
train_data_filtered.shape

(1761874, 70)

In [512]:
X_test = train_data_filtered
X_test = X_test.drop(columns='piezo_qualification')

In [279]:
X_test.shape

(611208, 69)

In [474]:
y_train = train_data_filtered['piezo_groundwater_level_category']
X_train = train_data_filtered.drop(columns=['piezo_groundwater_level_category'])

In [475]:
groundwater_cat_to_id = {
    'Very High': 2,
    'High': 1,
    'Average': 0,
    'Low': -1,
    'Very Low': -2
}

y_train = y_train.map(groundwater_cat_to_id) + 2

In [None]:
groundwater_id_to_cat = {
    4: 'Very High',
    3: 'High',
    2: 'Average',
    1: 'Low',
    0: 'Very Low'
}

y_test_pred_label = y_test.map(groundwater_id_to_cat)

In [482]:
categorical_columns = [
    # 'piezo_status', 
    # 'piezo_measure_nature_name', 
    'prelev_usage_label_0', 
    'prelev_usage_label_1', 
    'prelev_usage_label_2'
]

for col in categorical_columns:
    X_train[col] = X_train[col].astype('category')

In [315]:
# X_train_full, X_test, y_train_full, y_test = train_test_split(X_train, y_train, test_size=0.15, random_state=42, stratify=y_train)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)

In [179]:
xgb_reg_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=12,
    n_estimators=700,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    enable_categorical=True
)
xgb_reg_model.fit(X_train, y_train)
y_preds = np.round(xgb_reg_model.predict(X_val)).astype(int)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.8966206388887901,
 0.8966206388887901,
 0.6426049583381721,
 0.8972033187029154)

In [247]:
y_preds = np.clip(np.round(xgb_reg_model.predict(X_val)).astype(int), 0, 4)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.896710726736295, 0.896710726736295, 0.8997071040104417, 0.8972533076010903)

In [254]:
np.unique(y_preds, return_counts=True)

(array([0, 1, 2, 3, 4]),
 array([155023, 190843, 198169, 178144, 121442], dtype=int64))

In [235]:
len(y_preds), len(test_preds)

(843621, 611208)

In [513]:
for col in categorical_columns:
    X_test[col] = X_test[col].astype('category')

In [None]:
test_preds = np.round(xgb_reg_model.predict(X_test)).astype(int)

In [230]:
test_preds = np.clip(test_preds, 0, 4)

In [238]:
len(test_preds)

611208

In [None]:
y_preds_df = pd.DataFrame({
    'row_index': X_test.index,
    'piezo_groundwater_level_category': test_preds
})

groundwater_id_to_cat = {
    4: 'Very High',
    3: 'High',
    2: 'Average',
    1: 'Low',
    0: 'Very Low'
}

y_preds_df['piezo_groundwater_level_category'] = y_preds_df['piezo_groundwater_level_category'].map(groundwater_id_to_cat)

In [240]:
y_preds_df.to_csv('y_test_pred_xgb_reg.csv', index=False)

In [None]:
xgb_reg_model_1 = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=12,
    n_estimators=700,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.0,
    reg_lambda=3.0,
    random_state=42,
    enable_categorical=True
)

xgb_reg_model_1.fit(X_train, y_train)
y_preds = np.clip(np.round(xgb_reg_model_1.predict(X_val)).astype(int),0,4)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.8934082959053888,
 0.8934082959053888,
 0.8964068703676776,
 0.8939783179521726)

In [253]:
xgb_reg_model_4 = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=15,
    n_estimators=700,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.0,
    reg_lambda=1.0,
    random_state=42,
    enable_categorical=True
)

xgb_reg_model_4.fit(X_train, y_train)
y_preds = np.clip(np.round(xgb_reg_model_4.predict(X_val)).astype(int),0,4)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.920607713653406, 0.920607713653406, 0.9228668549906063, 0.9208751202988043)

In [None]:
# gooooood
xgb_reg_model_5 = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=10,
    n_estimators=700,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=2.0,
    reg_lambda=1.0,
    random_state=42,
    enable_categorical=True
)

xgb_reg_model_5.fit(X_train, y_train)
y_preds = np.clip(np.round(xgb_reg_model_5.predict(X_val)).astype(int),0,4)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.846157219889026, 0.846157219889026, 0.8502545531681169, 0.8474320628249854)

In [322]:
# to later train on everything
y_train_full = train_data_filtered['piezo_groundwater_level_category']
X_train_full = train_data_filtered.drop(columns=['piezo_groundwater_level_category'])

In [324]:
for col in categorical_columns:
    X_train_full[col] = X_train_full[col].astype('category')

In [326]:
y_train_full = y_train_full.map(groundwater_cat_to_id) + 2

In [327]:
xgb_reg_model_5.fit(X_train_full, y_train_full)
# y_preds = np.clip(np.round(xgb_reg_model_5.predict(X_val)).astype(int),0,4)
# accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

In [366]:
xgb_reg_model_5.fit(X_train, y_train)

In [414]:
xgb_reg_model_5.fit(X_train, y_train)

In [270]:
xgb_reg_model_6 = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=8,
    n_estimators=700,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=2.0,
    reg_lambda=1.0,
    random_state=42,
    enable_categorical=True
)

xgb_reg_model_6.fit(X_train, y_train)
y_preds = np.clip(np.round(xgb_reg_model_6.predict(X_val)).astype(int),0,4)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.7448486939040161, 0.744848693904016, 0.7498406900860877, 0.747551759627985)

In [368]:
X_test_not_augmented = X_test.drop(columns=['mean_annual_temperature_deviation', 'mean_annual_precipitation_deviation'])

In [None]:
y_test_preds = lgb_model.predict(X_test)

In [515]:
test_preds = np.round(lgb_model.predict(X_test)).astype(int)
test_preds = np.clip(test_preds, 0, 4)



In [516]:
y_preds_df = pd.DataFrame({
    'row_index': X_test.index,
    'piezo_groundwater_level_category': test_preds
})

y_preds_df['piezo_groundwater_level_category'] = y_preds_df['piezo_groundwater_level_category'].map(groundwater_id_to_cat)

In [517]:
y_preds_df.to_csv('y_test_pred_lgb_depth_10_augmented_and_full_exclude_winter.csv', index=False)

In [None]:
y_preds

In [None]:
# xgb_reg_model = xgb.XGBRegressor(
#     objective='reg:squarederror',
#     learning_rate=0.1,
#     max_depth=10,
#     n_estimators=500,
#     random_state=42,
#     enable_categorical=True
# )

print('XGB Reg converting piezo_status and piezo_measure_nature_name to number')
# y_preds = xgb_reg_model.predict(X_val)
y_preds = np.round(xgb_reg_model.predict(X_val)).astype(int)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

XGB Reg converting piezo_status and piezo_measure_nature_name to number


(0.8261577177429201, 0.82615771774292, 0.5934371188109461, 0.8279361012965908)

In [None]:
# objective='reg:squarederror',
# learning_rate=0.1,
# max_depth=10,
# n_estimators=500,
# random_state=42,
# enable_categorical=True

In [138]:
print('XGB Reg')
# y_preds = xgb_reg_model.predict(X_val)
y_preds = np.round(xgb_reg_model.predict(X_val)).astype(int)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

XGB Reg


(0.8277484794712318,
 0.8277484794712316,
 0.5945197883126434,
 0.8294912182681218)

In [78]:
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, enable_categorical=True)

In [79]:
xgb_model.fit(X_train, y_train)
print('With year, drop _prelev, keep all corr, keep only CORRECT')
y_preds = xgb_model.predict(X_val)
accuracy_score(y_pred=y_preds, y_true=y_val), f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

Parameters: { "use_label_encoder" } are not used.



With year, drop _prelev, keep all corr, keep only CORRECT


(0.6462205081590225,
 0.6462205081590225,
 0.6535620067752111,
 0.6456027626169899)

In [45]:
xgb_model.fit(X_train, y_train)
print('With year, drop _prelev, keep all corr')
y_preds = xgb_model.predict(X_val)
accuracy_score(y_pred=y_preds, y_true=y_val)

Parameters: { "use_label_encoder" } are not used.



With year, drop _prelev, keep all corr


0.6474933649114946

In [46]:
f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.6474933649114946, 0.6541321291660024, 0.6466243894333445)

In [44]:
f1_score(y_pred=y_preds, y_true=y_val, average='micro'), f1_score(y_pred=y_preds, y_true=y_val, average='macro'), f1_score(y_pred=y_preds, y_true=y_val, average='weighted')

(0.6474933649114946, 0.6541321291660024, 0.6466243894333445)

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

In [88]:
xgb_model.fit(X_train, y_train)
print('Drop num cols with corr < 1%')
y_preds = xgb_model.predict(X_val)
accuracy_score(y_preds, y_val)

Parameters: { "use_label_encoder" } are not used.



Drop num cols with corr < 1%


0.6288125829015636

In [45]:
print('Truly drop prelev_')
y_preds = xgb_model.predict(X_val)
accuracy_score(y_preds, y_val)

Truly drop prelev_


0.6472687379759394

In [54]:
!pip install graphviz

Collecting graphviz


[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
   ---------------------------------------- 0.0/47.1 kB ? eta -:--:--
   ---------------------------------------- 47.1/47.1 kB 2.3 MB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.20.3


In [51]:
tree_df = xgb_model.get_booster().trees_to_dataframe()

# Calculate the maximum depth of each tree
tree_depths = tree_df.groupby('Tree')['Node'].max()
print(tree_depths)

Tree
0      124
1      126
2      126
3      126
4      124
      ... 
495    126
496    124
497     94
498    120
499    112
Name: Node, Length: 500, dtype: int64


In [None]:
importance = xgb_model.get_booster().get_score(importance_type='gain')

importance_df = pd.DataFrame({
    'Feature': list(importance.keys()),
    'Importance': list(importance.values())
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
50,Alluvions,320.182220
52,Gravel,273.660919
49,year,269.156921
53,Limestones,265.247223
47,day_sin,260.928833
...,...,...
17,meteo_amplitude_tn_tx,22.345385
29,meteo_time_humidity_min,18.139111
13,meteo_time_tx,15.754864
31,meteo_time_humidity_max,13.913808


In [57]:
importance_df.head(20)

Unnamed: 0,Feature,Importance
50,Alluvions,320.18222
52,Gravel,273.660919
49,year,269.156921
53,Limestones,265.247223
47,day_sin,260.928833
55,Massif,252.686462
56,Other,250.05011
7,meteo_longitude,245.815521
2,piezo_station_longitude,244.697189
6,meteo_latitude,238.387543


In [None]:
print('Keeping year')
y_preds = xgb_model.predict(X_val)
accuracy_score(y_preds, y_val)

Keeping year


0.648771189906368

In [None]:
importance = xgb_model.get_booster().get_score(importance_type='gain')

importance_df = pd.DataFrame({
    'Feature': list(importance.keys()),
    'Importance': list(importance.values())
}).sort_values(by='Importance', ascending=False)

In [None]:
importance_df.head(30)

Unnamed: 0,Feature,Importance
48,prelev_longitude_1,380.47113
62,year,316.179932
63,Alluvions,301.569519
69,Other,263.12738
7,meteo_longitude,256.715637
60,day_sin,253.541962
65,Gravel,250.333847
45,prelev_latitude_0,244.237
66,Limestones,240.191635
67,Marls,239.629242
