## Loading modules and the dataset

In [3]:
# Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
import xgboost

In [4]:
# Dataset
original_df = pd.read_csv("data/train.csv", delimiter=",")

## Exploring the dataset
### Basics information

In [5]:
# Basic information (shape, ...)
print(f"Dataset shape : {original_df.shape}")
print(f"Number features : {original_df.shape[1]}")
print(f"Number sample : {original_df.shape[0]}")
original_df.info()

Dataset shape : (225000, 325)
Number features : 325
Number sample : 225000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225000 entries, 0 to 224999
Columns: 325 entries, _STATE to TARGET
dtypes: bool(1), float64(318), int64(6)
memory usage: 556.4 MB


In [6]:
# Showing a bit of the dataset
original_df.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENM1,...,DRNKANY6,DROCDY4_,_RFBING6,_DRNKWK2,_RFDRHV8,_FLSHOT7,_PNEUMO3,_AIDTST4,ID,TARGET
0,2.0,5.0,5132022,5,13,2022,1100.0,2022000436,2022000000.0,1.0,...,1.0,27.0,1.0,187.0,1.0,1.0,1.0,2.0,0,True
1,9.0,1.0,2282022,2,28,2022,1200.0,2022005381,2022005000.0,,...,9.0,900.0,9.0,99900.0,9.0,,,,1,False
2,19.0,1.0,1202022,1,20,2022,1100.0,2022000927,2022001000.0,1.0,...,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,9.0,2,False
3,39.0,8.0,8122022,8,12,2022,1100.0,2022014408,2022014000.0,,...,2.0,5.397605e-79,1.0,5.397605e-79,1.0,,,2.0,3,False
4,25.0,7.0,7222022,7,22,2022,1200.0,2022010910,2022011000.0,,...,1.0,17.0,2.0,467.0,1.0,,,2.0,4,False


In [7]:
# Pourcentage de valeurs manquantes par colonne
missing_percent = original_df.isna().mean() * 100

# Affichage propre, trié du plus manquant au moins manquant
missing_percent = missing_percent.sort_values(ascending=False)

print(missing_percent)

TOLDCFS     100.000000
HAVECFS     100.000000
WORKCFS     100.000000
COLGHOUS     99.994222
COLGSEX1     99.993778
               ...    
_TOTINDA      0.000000
_HCVU652      0.000000
_HLTHPLN      0.000000
_MENT14D      0.000000
TARGET        0.000000
Length: 325, dtype: float64


In [9]:
# Calcul du pourcentage de valeurs manquantes par feature
missing_pct = original_df.isna().mean() * 100

# Définition des tranches (bins) de 0 à 100 par pas de 10
bins = np.arange(0, 110, 10)  # 0,10,20,...,100
labels = [f"{i}-{i+10}%" for i in range(0, 100, 10)]

# Catégorisation des features selon leur tranche de % de manquants
categories = pd.cut(missing_pct, bins=bins, labels=labels, right=False, include_lowest=True)

# Comptage du nombre de features dans chaque tranche
summary = categories.value_counts().sort_index()

# Transformation en DataFrame pour un rendu propre
missing_tranches_df = pd.DataFrame({
    'Tranche_%_manquants': summary.index,
    'Nb_features': summary.values
})

print(missing_tranches_df)


  Tranche_%_manquants  Nb_features
0               0-10%          105
1              10-20%            9
2              20-30%           10
3              30-40%            4
4              40-50%           14
5              50-60%           13
6              60-70%           22
7              70-80%           20
8              80-90%           31
9             90-100%           94
