In [1]:
from sklearn.preprocessing import OrdinalEncoder

import pandas as pd

In [2]:
# Charger les données
train_fn = '~/projects/Neurohack_Group8_Miniproject/all_subject_brainData_train.parquet'
test_fn = '~/projects/Neurohack_Group8_Miniproject/all_subject_brainData_test.parquet'

train_df = pd.read_parquet(train_fn)
test_df = pd.read_parquet(test_fn)

In [3]:
# Identifier les colonnes non numériques (catégorielles)
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Initialiser un OrdinalEncoder qui gère les valeurs inconnues
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Encoder les colonnes catégorielles
train_df[categorical_cols] = encoder.fit_transform(train_df[categorical_cols].astype(str))
test_df[categorical_cols] = encoder.transform(test_df[categorical_cols].astype(str))

In [4]:
train_df.head(10)

Unnamed: 0,subject_id,StructName,SurfArea,GrayVol,study,study_site,session_id,wave,age,sex,...,bmi,handedness,participant_education,parent_1_education,parent_2_education,p_factor,internalizing_mcelroy_harmonized_all_samples,externalizing_mcelroy_harmonized_all_samples,attention_mcelroy_harmonized_all_samples,cubids_acquisition_group
0,0.0,3145.0,1121,3493,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
1,0.0,3146.0,2236,7030,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
2,0.0,3147.0,2619,5753,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
3,0.0,3148.0,549,2714,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
4,0.0,3151.0,2822,8180,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
5,0.0,3152.0,4819,13525,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
6,0.0,3153.0,3339,11870,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
7,0.0,3155.0,1002,2545,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
8,0.0,3156.0,6271,14082,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1
9,0.0,3157.0,2920,9159,0.0,0.0,0.0,1,15.583333,1.0,...,22.15,2.0,11.0,0.0,1.0,0.589907,-0.449373,-0.63078,-1.842178,1


In [5]:
test_df.shape

(7308276, 22)

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# -------------------------------
# 2. Séparation des features et target
# -------------------------------
features = [col for col in train_df.columns if col != 'p_factor']

X_train = train_df[features].values.astype(np.float32)
y_train = train_df['p_factor'].values.astype(np.float32).reshape(-1, 1)
X_test = test_df[features].values.astype(np.float32)

In [7]:
# -------------------------------
# 3. Normalisation
# -------------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------
# 4. Création des DataLoader
# -------------------------------
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

# -------------------------------
# 5. Modèle PyTorch
# -------------------------------
class SimpleRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)

model = SimpleRegressor(input_dim=X_train.shape[1])

# -------------------------------
# 6. Entraînement du modèle
# -------------------------------
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    model.train()
    epoch_loss = 0
    for xb, yb in train_loader:
        pred = model(xb)
        loss = criterion(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader):.4f}")

# -------------------------------
# 7. Prédiction sur test_df
# -------------------------------
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test)
    y_pred_test = model(X_test_tensor).numpy().flatten()

# -------------------------------
# 8. Ajout dans le DataFrame test_df
# -------------------------------
test_df['p_factor'] = y_pred_test

Epoch 1, Loss: nan
Epoch 2, Loss: nan
Epoch 3, Loss: nan
Epoch 4, Loss: nan
Epoch 5, Loss: nan
Epoch 6, Loss: nan
Epoch 7, Loss: nan
Epoch 8, Loss: nan
Epoch 10, Loss: nan
