<a href="https://colab.research.google.com/github/kavyagl2/Google-Summer-of-Code-24/blob/main/GIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
pip install datasets torch_geometric

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import GINConv, global_add_pool
import json


# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load T5 Tokenizer and Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# URLs for datasets
train_url = "https://storage.googleapis.com/gresearch/kelm-corpus/updated-2021/quadruples-train.tsv"
validation_url = "https://storage.googleapis.com/gresearch/kelm-corpus/updated-2021/quadruples-validation.tsv"
test_url = "https://storage.googleapis.com/gresearch/kelm-corpus/updated-2021/quadruples-test.tsv"

# Stream the datasets
train_dataset = load_dataset("csv", data_files=train_url, delimiter='\t')
validation_dataset = load_dataset("csv", data_files=validation_url, delimiter='\t')
test_dataset = load_dataset("csv", data_files=test_url, delimiter='\t')

# Function to parse the nested JSON structure
def parse_nested_json(examples):
    # Initialize lists to store parsed data
    all_triples = []
    all_serialized_triples = []
    all_sentences = []

    for example in examples.values():
        for item in example:
            if isinstance(item, str):
                try:
                    data = json.loads(item)
                    if isinstance(data, dict):
                        # Convert list of triples to JSON string and encode to bytes before appending
                        all_triples.append(json.dumps(data.get("triples", [])).encode('utf-8'))

                        # Check if serialized_triples is a string before encoding
                        serialized_triples = data.get("serialized_triples", "")
                        if isinstance(serialized_triples, str):
                            all_serialized_triples.append(serialized_triples.encode('utf-8'))
                        else:
                            all_serialized_triples.append(str(serialized_triples).encode('utf-8'))

                        # Similarly, check for sentence
                        sentence = data.get("sentence", "")
                        if isinstance(sentence, str):
                            all_sentences.append(sentence.encode('utf-8'))
                        else:
                            all_sentences.append(str(sentence).encode('utf-8'))

                except json.JSONDecodeError:
                    pass

    return {
        "triples": all_triples, # Now a list of JSON strings
        "serialized_triples": all_serialized_triples,
        "sentence": all_sentences
    }

# Apply parsing function to each example in the dataset
train_dataset = train_dataset.map(parse_nested_json, batched=True, batch_size=1000)
validation_dataset = validation_dataset.map(parse_nested_json, batched=True, batch_size=1000)
test_dataset = test_dataset.map(parse_nested_json, batched=True, batch_size=1000)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31G [00:00<?, ?B/s]

In [None]:
# Preprocessing function for tokenization
def preprocess_function(examples):
    # Decode the byte objects to strings
    inputs = [x.decode('utf-8') for x in examples["serialized_triples"]]
    targets = [x.decode('utf-8') for x in examples["sentence"]]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

Map:   0%|          | 0/6310060 [00:00<?, ? examples/s]



Map:   0%|          | 0/788745 [00:00<?, ? examples/s]

Map:   0%|          | 0/796981 [00:00<?, ? examples/s]

In [None]:
class GIN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes):
        super(GIN, self).__init__()
        self.conv1 = GINConv(Sequential(Linear(num_node_features, 64), ReLU(), Linear(64, 64)))
        self.conv2 = GINConv(Sequential(Linear(64, 64), ReLU(), Linear(64, 64)))
        self.conv3 = GINConv(Sequential(Linear(64, 64), ReLU(), Linear(64, 64)))
        self.conv4 = GINConv(Sequential(Linear(64, 64), ReLU(), Linear(64, 64)))
        self.conv5 = GINConv(Sequential(Linear(64, 64), ReLU(), Linear(64, 64)))
        self.fc1 = Linear(64, 64)
        self.fc2 = Linear(64, num_classes)

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))
        x = F.relu(self.conv4(x, edge_index))
        x = F.relu(self.conv5(x, edge_index))
        x = global_add_pool(x, batch)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


In [None]:
from torch_geometric.data import Data, DataLoader as GeometricDataLoader

# Example graph data preparation (this will vary based on your specific dataset)
def create_graph_data(triples):
    # Handle potential integers and decode byte strings
    triples_decoded = [json.loads(t.decode('utf-8') if isinstance(t, bytes) else str(t)) for t in triples]

    # Handle the case when triples_decoded is empty or contains invalid triples
    if not triples_decoded or not all(isinstance(t, (list, tuple)) and len(t) >= 3 for t in triples_decoded):
        # Return an empty graph or handle the invalid data appropriately
        return Data()

    nodes = list(set([triple[0] for triple in triples_decoded] + [triple[2] for triple in triples_decoded]))
    node_features = {node: i for i, node in enumerate(nodes)}

    edge_index = []
    for triple in triples_decoded:
        edge_index.append([node_features[triple[0]], node_features[triple[2]]])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    x = torch.eye(len(nodes), dtype=torch.float)  # Example: One-hot encoding for node features
    y = torch.tensor([0])  # Example: Dummy label

    return Data(x=x, edge_index=edge_index, y=y)

# Convert datasets to graph data - Access the correct splits of the datasets
train_graph_data_list = [create_graph_data(ex['triples']) for ex in train_dataset['train']]
validation_graph_data_list = [create_graph_data(ex['triples']) for ex in validation_dataset['validation']] # Access validation split
test_graph_data_list = [create_graph_data(ex['triples']) for ex in test_dataset['test']] # Access test split

# Create Geometric DataLoaders
train_graph_loader = GeometricDataLoader(train_graph_data_list, batch_size=8, shuffle=True)
validation_graph_loader = GeometricDataLoader(validation_graph_data_list, batch_size=8)
test_graph_loader = GeometricDataLoader(test_graph_data_list, batch_size=8)

In [None]:
# Initialize GIN model
num_node_features = train_graph_loader.dataset[0].num_node_features
num_classes = 2  # Example: Adjust based on your task
gin_model = GIN(num_node_features=num_node_features, num_classes=num_classes).to(device)
gin_optimizer = AdamW(gin_model.parameters(), lr=5e-5)

for epoch in range(3):
    model.train()
    gin_model.train()
    total_loss = 0
    total_gin_loss = 0

    for batch, graph_batch in zip(train_dataloader, train_graph_loader):
        # Training T5 Model
        inputs = {k: torch.tensor(v).to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

        # Training GIN Model
        graph_batch = graph_batch.to(device)
        out = gin_model(graph_batch.x, graph_batch.edge_index, graph_batch.batch)
        gin_loss = F.nll_loss(out, graph_batch.y)
        gin_loss.backward()
        gin_optimizer.step()
        gin_optimizer.zero_grad()
        total_gin_loss += gin_loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    avg_gin_train_loss = total_gin_loss / len(train_graph_loader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_train_loss}, Average GIN Training Loss: {avg_gin_train_loss}")

    # Validation for T5
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in validation_dataloader:
            inputs = {k: torch.tensor(v).to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            eval_loss += outputs.loss.item()
    avg_eval_loss = eval_loss / len(validation_dataloader)
    print(f"Epoch {epoch + 1} - Average Validation Loss: {avg_eval_loss}")

    # Validation for GIN
    gin_model.eval()
    eval_gin_loss = 0
    with torch.no_grad():
        for graph_batch in validation_graph_loader:
            graph_batch = graph_batch.to(device)
            out = gin_model(graph_batch.x, graph_batch.edge_index, graph_batch.batch)
            gin_loss = F.nll_loss(out, graph_batch.y)
            eval_gin_loss += gin_loss.item()
    avg_eval_gin_loss = eval_gin_loss / len(validation_graph_loader)
    print(f"Epoch {epoch + 1} - Average GIN Validation Loss: {avg_eval_gin_loss}")

# Testing for T5
model.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_dataloader:
        inputs = {k: torch.tensor(v).to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        test_loss += outputs.loss.item()
avg_test_loss = test_loss / len(test_dataloader)
print(f"Average Test Loss: {avg_test_loss}")

# Testing for GIN
gin_model.eval()
test_gin_loss = 0
with torch.no_grad():
    for graph_batch in test_graph_loader:
        graph_batch = graph_batch.to(device)
        out = gin_model(graph_batch.x, graph_batch.edge_index, graph_batch.batch)
        gin_loss = F.nll_loss(out, graph_batch.y)
        test_gin_loss += gin_loss.item()
avg_test_gin_loss = test_gin_loss / len(test_graph_loader)
print(f"Average GIN Test Loss: {avg_test_gin_loss}")

# Save the models
model.save_pretrained("t5_model")
tokenizer.save_pretrained("t5_tokenizer")
torch.save(gin_model.state_dict(), "gin_model.pth")




KeyError: "Invalid key: 0. Please first select a split. For example: `my_dataset_dictionary['train'][0]`. Available splits: ['train']"