# Contrôle Continu

### Nom et prénom de l'étudiant

# Préparation des données

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

## 1- Charger la donnée

In [5]:
X = pd.read_csv("sup/X.csv")
y = pd.read_csv("sup/y.csv")

In [4]:
print("Dimensions X :", X.shape)
print("Dimensions y :", y.shape)


Dimensions X : (26707, 36)
Dimensions y : (26707, 3)


## 2- AFFICHER TYPES ET VALEURS MANQUANTES

In [6]:
print("\nTypes de variables :")
print(X.dtypes)


Types de variables :
respondent_id                    int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex

In [7]:
print("\nValeurs manquantes par colonne :")
print(X.isnull().sum())


Valeurs manquantes par colonne :
respondent_id                      0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
inco

## 3. TRAITEMENT DES VALEURS MANQUANTES

## a) Variables numériques

In [8]:
num_vars = X.select_dtypes(include=['int64', 'float64']).columns
num_imputer = SimpleImputer(strategy="median")
X[num_vars] = num_imputer.fit_transform(X[num_vars])

## b) Variables catégorielles

In [9]:
cat_vars = X.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy="most_frequent")
X[cat_vars] = cat_imputer.fit_transform(X[cat_vars])

## 4. ENCODAGE DES VARIABLES CATÉGORIELLES

### On encode toutes les colonnes objet en one-hot

In [1]:
X_encoded = pd.get_dummies(X, columns=cat_vars, drop_first=True)

NameError: name 'pd' is not defined

## 5. NORMALISATION DES VARIABLES NUMÉRIQUES

In [13]:
scaler = StandardScaler()
X_encoded[num_vars] = scaler.fit_transform(X_encoded[num_vars])

## 6. SÉPARATION EN TRAIN/TEST

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

In [18]:
print("\nNouvelles dimensions après encodage et split :")
print("X_train :", X_train.shape, "X_test :", X_test.shape)


Nouvelles dimensions après encodage et split :
X_train : (21365, 94) X_test : (5342, 94)


## 7. AFFICHER UN ÉCHANTILLON

In [19]:
print("\nExemple de données prêtes pour la modélisation :")
print(X_train.head())


Exemple de données prêtes pour la modélisation :
       respondent_id  h1n1_concern  h1n1_knowledge  behavioral_antiviral_meds  \
24706       1.472571      0.418262       -0.423626                  -0.226293   
5393       -1.032473      0.418262       -0.423626                   4.419056   
20898       0.978644     -1.781960       -0.423626                  -0.226293   
3429       -1.287218     -0.681849       -0.423626                  -0.226293   
8731       -0.599509     -0.681849       -2.044279                  -0.226293   

       behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
24706              0.611637             -0.272097               0.459149   
5393               0.611637              3.675158               0.459149   
20898             -1.634957             -0.272097               0.459149   
3429              -1.634957             -0.272097              -2.177944   
8731              -1.634957             -0.272097               0.459149   

      