In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [3]:
train_features = pd.read_csv(r"C:\Playground\contest\ASIAChallenge_ShareFile\train_features.csv")
metadata = pd.read_csv(r"C:\Playground\contest\ASIAChallenge_ShareFile\metadata.csv")
train_outcome_functional = pd.read_csv(r"C:\Playground\contest\ASIAChallenge_ShareFile\train_outcomes_functional.csv")
train_outcome_ms_excel = pd.read_csv(r"C:\Playground\contest\ASIAChallenge_ShareFile\train_outcomes_MS.csv")

In [4]:
# Merge datasets on 'PID'
df = train_features.merge(metadata, on="PID", how="left")
df = df.merge(train_outcome_functional, on="PID", how="left")
df = df.merge(train_outcome_ms_excel, on=["PID", "time"], how="left")

In [5]:
categorical_cols = ["bmi_category", "age_category", "sexcd", "tx1_r", "srdecc1", "surgcd1", "spcsuc1", "scdecc1", "hemccd1"]
numeric_cols = df.drop(columns=["PID", "modben", "time"] + categorical_cols).columns.tolist()

In [6]:

# Define preprocessing steps
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [8]:
print(df.dtypes)  # Check column data types
print(df.select_dtypes(include=['object']).head())  # Preview categorical columns


PID          object
elbfll01    float64
wrextl01    float64
elbexl01    float64
finfll01    float64
             ...   
hipflr      float64
kneetr      float64
ankdor      float64
gretor      float64
ankplr      float64
Length: 576, dtype: object
       PID   ais1   ais4   ais8  ais16 bmi_category age_category tx1_r
0   PID_62    NaN    NaN     ND     ND      Healthy          <65    D1
1  PID_148  AIS A  AIS A  AIS A  AIS A      Healthy          <65     P
2  PID_508  AIS A  AIS A  AIS A  AIS B  Underweight          <65     P
3  PID_254  AIS A  AIS A  AIS A  AIS A   Overweight          <65    D2
4  PID_189  AIS A  AIS C  AIS A  AIS A   Overweight          <65     P


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Identify columns
num_cols = df.select_dtypes(include=['float64']).columns  # Numeric columns
cat_cols = ['ais1', 'ais4', 'ais8', 'ais16', 'bmi_category', 'age_category', 'tx1_r']  # Categorical

# Define transformers
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Fill missing categorical values
    ("encoder", OneHotEncoder(handle_unknown="ignore"))  # Convert categories to numeric
])

# Create column transformer
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols)
])

# Apply transformation
df_processed = preprocessor.fit_transform(df)


In [10]:
import pandas as pd

# Get transformed feature names
cat_features = preprocessor.named_transformers_["cat"].named_steps["encoder"].get_feature_names_out(cat_cols)
all_features = list(num_cols) + list(cat_features)

# Convert to DataFrame
df_processed = pd.DataFrame(df_processed, columns=all_features)

# Save preprocessed data
df_processed.to_csv("preprocessed_data.csv", index=False)


In [13]:
import torch
from torch_geometric.data import Data

# Example: Assuming PID is a node and features are the node attributes
node_features = torch.tensor(df_processed.values, dtype=torch.float)  # Node features

# Define edges (if applicable)
edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]], dtype=torch.long)  # Example edge list

# Create PyG Data object
graph_data = Data(x=node_features, edge_index=edge_index)

# Check the graph
print(graph_data)


Data(x=[675, 596], edge_index=[2, 3])


In [14]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_dim, out_channels):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x


In [16]:
print(graph_data.y if hasattr(graph_data, "y") else "No target labels found!")


None


In [17]:
import torch
from torch_geometric.data import Data

# Convert features
node_features = torch.tensor(df_processed.values, dtype=torch.float)

# Convert target (Replace 'week_26_target_column' with the actual column name)
y_target = torch.tensor(df["week_26_target_column"].values, dtype=torch.float).view(-1, 1)

# Define edges
edge_index = torch.tensor(edge_list, dtype=torch.long)  # Make sure this is correct

# Create PyG Data object
graph_data = Data(x=node_features, edge_index=edge_index, y=y_target)

# Verify y
print(graph_data)


KeyError: 'week_26_target_column'