In [None]:
# 🔁 Step 0: Factory reset first (important!)
# Runtime > Factory Reset Runtime

# 🔥 Step 1: Remove broken versions
%pip uninstall -y torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric

# ⚙️ Step 2: Clean install for PyTorch 2.1.0 + CPU wheels
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cpu
# Compiled PyG extensions: pinned so a re-run resolves the same builds, and all taken
# from the wheel index that matches the torch 2.1.0 CPU build installed above.
%pip install torch-scatter==2.1.2 torch-sparse==0.6.18 torch-cluster==1.6.3 torch-spline-conv==1.2.2 -f https://data.pyg.org/whl/torch-2.1.0+cpu.html
%pip install torch-geometric==2.6.1


Found existing installation: torch-scatter 2.1.2+pt21cpu
Uninstalling torch-scatter-2.1.2+pt21cpu:
  Successfully uninstalled torch-scatter-2.1.2+pt21cpu
Found existing installation: torch-sparse 0.6.18+pt21cpu
Uninstalling torch-sparse-0.6.18+pt21cpu:
  Successfully uninstalled torch-sparse-0.6.18+pt21cpu
Found existing installation: torch-cluster 1.6.3+pt21cpu
Uninstalling torch-cluster-1.6.3+pt21cpu:
  Successfully uninstalled torch-cluster-1.6.3+pt21cpu
Found existing installation: torch-spline-conv 1.2.2+pt21cpu
Uninstalling torch-spline-conv-1.2.2+pt21cpu:
  Successfully uninstalled torch-spline-conv-1.2.2+pt21cpu
Found existing installation: torch-geometric 2.6.1
Uninstalling torch-geometric-2.6.1:
  Successfully uninstalled torch-geometric-2.6.1
Looking in indexes: https://download.pytorch.org/whl/cpu
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-2.1.0%2Bcpu/torch_scatter-2.1.2%2Bpt21cpu-cp

In [None]:
import torch
from torch_geometric.nn import radius_graph

# Importing the symbol does not exercise the compiled backend, so run the op once
# on a tiny point cloud: points 0 and 1 are 0.5 apart, point 2 is far from both.
_probe_pos = torch.tensor([[0.0, 0.0, 0.0], [0.5, 0.0, 0.0], [5.0, 0.0, 0.0]])
_probe_edges = radius_graph(_probe_pos, r=1.0)
# With self-loops off (the default) the only neighbours are 0 -> 1 and 1 -> 0.
assert _probe_edges.size(1) == 2, f"radius_graph returned {_probe_edges.size(1)} edges, expected 2"
print("radius_graph is working ✅")


radius_graph is working ✅


In [None]:
# Extract the uploaded archive into /content/mppc (quiet mode, overwrite without prompting)
!unzip -q -o /content/mppc.zip -d /content/mppc

In [None]:
# Pin NumPy to 1.26.4 (presumably for compatibility with the torch 2.1.0 build used
# in this notebook — confirm). %pip targets the running kernel's environment.
# NOTE: if NumPy has already been imported in this session, restart the runtime
# afterwards so the pinned version is the one that actually gets loaded.
%pip install numpy==1.26.4





In [None]:
# Install dependencies.
# The -f wheel index must match the torch build this notebook installs (2.1.0, CPU);
# pointing it at a different torch/CUDA combination can pull extension binaries that
# do not load. torch is pinned so this line cannot silently replace it.
%pip install -q ase rdkit-pypi tqdm torch==2.1.0 torch_geometric torch-scatter torch-sparse e3nn torch-cluster -f https://data.pyg.org/whl/torch-2.1.0+cpu.html

# Imports
import os
import pandas as pd
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.nn.models import SchNet
from torch_geometric.data import Data
from tqdm import tqdm
from ase.io import read
from sklearn.metrics import mean_squared_error

# Dataset locations: everything lives under ROOT, the folder the archive is unpacked into
ROOT = "/content/mppc"
TRAIN_CSV, TEST_CSV, TRAIN_STRUCT_DIR, TEST_STRUCT_DIR = (
    os.path.join(ROOT, entry)
    for entry in (
        "dipole_moments_train.csv",
        "dipole_moments_test.csv",
        "structures_train",
        "structures_test",
    )
)

# Define custom dataset
class DipoleDataset:
    """In-memory collection of molecular graphs built from a CSV index and XYZ files.

    Every CSV row names a molecule through its ``molecule_name`` column (falling
    back to ``ID``); the structure is read from ``<xyz_folder>/<id>.xyz``.

    Args:
        csv_file: path of the CSV index, loaded with ``pandas.read_csv``.
        xyz_folder: directory holding one ``.xyz`` file per molecule.
        is_train: when True each graph gets a target ``y`` taken from the
            ``dipole_moment`` column; otherwise ``y`` is ``None``.

    Attributes:
        df: the CSV index as a DataFrame.
        data_list: ``Data`` objects for the rows that loaded, in CSV order.
        ids: molecule ids parallel to ``data_list`` so results can be matched
            back to CSV rows even when some rows were skipped.

    Raises:
        KeyError: if the CSV has neither id column, or lacks ``dipole_moment``
            while ``is_train`` is True.
    """

    def __init__(self, csv_file, xyz_folder, is_train=True):
        self.df = pd.read_csv(csv_file)
        self.xyz_folder = xyz_folder
        self.is_train = is_train
        self.ids = []
        self.data_list = self.process()

    def process(self):
        """Read every referenced XYZ file and return the list of graphs.

        Rows whose file is missing or unreadable are skipped; missing files are
        counted and reported once, unreadable files are reported individually.
        ``self.ids`` is refreshed to stay parallel to the returned list.
        """
        columns = self.df.columns
        # Without an id column every lookup would target "None.xyz" and the
        # dataset would silently come out empty, so fail with a clear message.
        if 'molecule_name' not in columns and 'ID' not in columns:
            raise KeyError("CSV needs a 'molecule_name' or 'ID' column to locate XYZ files")
        # Otherwise the same KeyError would be caught and printed for every row.
        if self.is_train and 'dipole_moment' not in columns:
            raise KeyError("training CSV needs a 'dipole_moment' column")

        data_list = []
        ids = []
        missing = 0
        for _, row in tqdm(self.df.iterrows(), total=len(self.df), desc="Processing XYZ"):
            mol_id = row.get('molecule_name') or row.get('ID')
            file_path = os.path.join(self.xyz_folder, f"{mol_id}.xyz")
            if not os.path.exists(file_path):
                missing += 1
                continue
            try:
                mol = read(file_path)
                pos = torch.tensor(mol.get_positions(), dtype=torch.float)
                z = torch.tensor(mol.get_atomic_numbers(), dtype=torch.long)
                y = torch.tensor([row['dipole_moment']], dtype=torch.float) if self.is_train else None
                data = Data(z=z, pos=pos, y=y)
                data_list.append(data)
                ids.append(mol_id)
            except Exception as e:
                # Best effort: one malformed file should not abort the whole load.
                print(f"⚠️ Skipped {file_path}: {e}")
        if missing:
            print(f"⚠️ {missing} of {len(self.df)} rows had no XYZ file under {self.xyz_folder} and were skipped")
        self.ids = ids
        return data_list

    def __len__(self):
        """Number of graphs that were loaded successfully."""
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the graph at position ``idx`` of ``data_list``."""
        return self.data_list[idx]

# Prepare datasets
train_dataset = DipoleDataset(TRAIN_CSV, TRAIN_STRUCT_DIR, is_train=True)
test_dataset = DipoleDataset(TEST_CSV, TEST_STRUCT_DIR, is_train=False)

# Fail before the expensive training run rather than after it: predictions are later
# written back to the test CSV by position, which only works if no test row was dropped.
if len(test_dataset) != len(test_dataset.df):
    raise ValueError(
        f"Only {len(test_dataset)} of {len(test_dataset.df)} test molecules could be loaded "
        f"from {TEST_STRUCT_DIR}; predictions could not be matched to CSV rows."
    )
if len(train_dataset) == 0:
    raise ValueError(f"No training molecules could be loaded from {TRAIN_STRUCT_DIR}")

# Create loaders (only the training data is shuffled; test order must follow the CSV)
train_loader = DataLoader(train_dataset.data_list, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset.data_list, batch_size=32, shuffle=False)

# Define model
# Seed torch's global RNG so weight initialisation (and the loader shuffling that
# draws from it) repeats from run to run; some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SchNet(hidden_channels=128, num_filters=128, num_interactions=6, cutoff=10.0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

# Fit the network: one optimizer step per mini-batch, mean batch loss reported per epoch
epochs = 10
model.train()
for epoch in range(epochs):
    total_loss = 0
    for graphs in train_loader:
        graphs = graphs.to(device)
        optimizer.zero_grad()
        outputs = model(graphs.z, graphs.pos, graphs.batch).view(-1)
        targets = graphs.y.view(-1)
        step_loss = loss_fn(outputs, targets)
        step_loss.backward()
        optimizer.step()
        total_loss += step_loss.item()
    print(f"📚 Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.6f}")

# Run inference over the test loader, collecting one scalar per molecule in loader order
model.eval()
predictions = []
with torch.no_grad():
    for graphs in tqdm(test_loader, desc="Predicting"):
        graphs = graphs.to(device)
        outputs = model(graphs.z, graphs.pos, graphs.batch)
        flat_outputs = outputs.view(-1).cpu().numpy()
        predictions += list(flat_outputs)

# Save predictions without prompt
test_df = pd.read_csv(TEST_CSV)
# Predictions are attached by position, so the counts must agree exactly; a mismatch
# most likely means some test molecules were skipped while the dataset was built.
if len(predictions) != len(test_df):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(test_df)} rows in {TEST_CSV}; "
        "cannot match predictions to rows by position."
    )
test_df['dipole_moment'] = predictions
test_df.to_csv("gnn_submission.csv", index=False, mode='w')
print("✅ Submission file saved as gnn_submission.csv")

# The browser download helper only exists on Colab; elsewhere the CSV is already on
# disk, so a missing module must not abort the rest of the cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("gnn_submission.csv")
else:
    print("google.colab not available; skipping browser download of gnn_submission.csv")


# Optional sanity check: score the fitted model on the data it was trained on
true_vals = []
pred_vals = []
with torch.no_grad():
    for graphs in train_loader:
        graphs = graphs.to(device)
        outputs = model(graphs.z, graphs.pos, graphs.batch)
        # targets and outputs are appended in the same order, so shuffling is harmless
        true_vals += list(graphs.y.view(-1).cpu().numpy())
        pred_vals += list(outputs.view(-1).cpu().numpy())

mse = mean_squared_error(true_vals, pred_vals)
print(f"📊 Train Set MSE: {mse:.6f}")


Processing XYZ: 100%|██████████| 20000/20000 [00:29<00:00, 685.18it/s]
Processing XYZ: 100%|██████████| 5000/5000 [00:08<00:00, 596.36it/s]


📚 Epoch 1/10, Loss: 4.382792
📚 Epoch 2/10, Loss: 0.949474
📚 Epoch 3/10, Loss: 0.718826
📚 Epoch 4/10, Loss: 0.541757
📚 Epoch 5/10, Loss: 0.451399
📚 Epoch 6/10, Loss: 0.372449
📚 Epoch 7/10, Loss: 0.320102
📚 Epoch 8/10, Loss: 0.286320
📚 Epoch 9/10, Loss: 0.280206
📚 Epoch 10/10, Loss: 0.208135


Predicting: 100%|██████████| 157/157 [00:45<00:00,  3.48it/s]

✅ Submission file saved as gnn_submission.csv





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📊 Train Set MSE: 0.150864
