In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA

from discopy.grammar import Word
from discopy.rigid import Cup, Id, Ty
import torch
import random

from sentence_transformers import SentenceTransformer

from lambeq import LossFunction, PennyLaneModel, PytorchTrainer, QuantumTrainer, SPSAOptimizer, NumpyModel, MSELoss, Dataset, AtomicType, IQPAnsatz, BobcatParser
from lambeq.pregroups import remove_cups

import jax.numpy as jnp
import jax
jax.devices()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from discopy.grammar import Word
from discopy.rigid import Cup, Id, Ty
import torch
import random
from sentence_transformers import SentenceTransformer
from lambeq import LossFunction, PennyLaneModel, PytorchTrainer, QuantumTrainer, SPSAOptimizer, NumpyModel, MSELoss, Dataset, AtomicType, IQPAnsatz, Sim14Ansatz, Sim15Ansatz, StronglyEntanglingAnsatz, BobcatParser
from lambeq.pregroups import remove_cups
import jax as jnp
from jax import numpy as jnp
import os
import joblib

In [35]:
def read_and_preprocess_data(input_path='Data/LargerSadrKartTransative.txt',
                             output_path='Data/AveragedLargerSadrKartTransative.txt'):
    """Average the annotator scores per sentence pair and normalise them.

    The space-separated file at ``input_path`` is read (its first row is
    consumed as a header and then replaced by the eight column names below),
    the rows are grouped by the six word columns, and the mean and standard
    deviation of ``score`` are computed per group. The mean is then divided
    by the largest mean so that the top score equals 1.

    Args:
        input_path: Space-separated file with eight columns (annotator,
            two subject/verb/object triples, score).
        output_path: Where the averaged table is written as CSV. The frame's
            index is written too, so it reads back as an ``Unnamed: 0`` column.

    Returns:
        ``(grouped_data, unique_word_list)`` where ``grouped_data`` has the
        six word columns plus ``score`` (normalised mean) and ``range``
        (standard deviation), and ``unique_word_list`` is the sorted list of
        distinct words appearing in any word column.
    """
    word_columns = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']

    df = pd.read_csv(input_path, sep=' ')
    df.columns = ['annotator'] + word_columns + ['score']

    # String aliases select the same pandas mean / std (ddof=1) reductions
    # that np.mean / np.std were mapped to, without the pandas 2.x
    # FutureWarning about callables changing meaning.
    grouped_data = df.groupby(word_columns).agg({'score': ['mean', 'std']}).reset_index()
    # Flatten the two-level column names ("score mean", "score std", ...).
    grouped_data.columns = [' '.join(col).strip() for col in grouped_data.columns.values]
    grouped_data = grouped_data.rename(columns={'score mean': 'score', 'score std': 'range'})
    grouped_data['score'] = grouped_data['score'] / grouped_data['score'].max()

    # Sorted so the vocabulary order is reproducible across kernel restarts
    # (plain set iteration order depends on string hash randomisation).
    unique_word_list = sorted(set().union(*(grouped_data[col] for col in word_columns)))

    grouped_data.to_csv(output_path)
    return grouped_data, unique_word_list

def diagram_to_sentence(word_list):
    """Build the pregroup diagram of a subject-verb-object sentence.

    Only the first three entries of ``word_list`` are used: entries 0 and 2
    become words of type ``n`` and entry 1 becomes a word of type
    ``n.r @ s @ n.l``. The words are tensored together and composed with
    cups on both noun pairs, leaving the ``s`` wire open.

    Args:
        word_list: Sequence whose first three items are the subject, verb
            and object strings.

    Returns:
        The composed discopy diagram.
    """
    noun, sentence = Ty('n'), Ty('s')
    verb_type = noun.r @ sentence @ noun.l

    subject = Word(word_list[0], noun)
    verb = Word(word_list[1], verb_type)
    obj = Word(word_list[2], noun)
    words = [subject, verb, obj]

    contraction = Cup(noun, noun.r) @ Id(sentence) @ Cup(noun.l, noun)

    # Sanity checks: n-ary tensor must agree with chained `@`.
    assert Id().tensor(*words) == subject @ verb @ obj
    assert Ty().tensor(*[noun.r, sentence, noun.l]) == verb_type

    return Id().tensor(*words) >> contraction

def retrive_nth_rows_sentences(data, row1, row2=None):
    """Return the two sentences of a dataset row as space-joined strings.

    Args:
        data: Mapping/frame indexable as ``data[column][row]`` with the
            columns ``subject1``, ``verb1``, ``object1``, ``subject2``,
            ``verb2`` and ``object2``.
        row1: Row used for the first sentence (the ``*1`` columns).
        row2: Row used for the second sentence (the ``*2`` columns).
            Defaults to ``row1`` when omitted.

    Returns:
        ``(sentence1, sentence2)`` as ``"subject verb object"`` strings.
    """
    # Compare against None explicitly: row index 0 is falsy but valid, and
    # must not be silently replaced by row1.
    if row2 is None:
        row2 = row1
    parts = ('subject', 'verb', 'object')
    sentence1 = " ".join(data[f'{part}1'][row1] for part in parts)
    sentence2 = " ".join(data[f'{part}2'][row2] for part in parts)
    return sentence1, sentence2

def make_circuit_from_df_row(data, row_number, ansatz):
    """Build the circuits for the sentence pair stored in one data row.

    NOTE: this function is re-defined with the same body further down in
    this cell; the last definition is the one in effect after the cell runs.

    Args:
        data: Table passed on to ``make_diagrams``.
        row_number: Row whose two sentences are turned into diagrams.
        ansatz: Callable applied to each diagram to obtain a circuit.

    Returns:
        The ``(pennylane_circuit, discopy_circuit)`` pair produced by
        ``make_circuit_from_diagrams`` (drawing disabled).
    """
    diagram_pair = make_diagrams(data, row_number)
    return make_circuit_from_diagrams(*diagram_pair, ansatz, False)

def make_sentence_a_state(sentence):
    """Turn a space-separated sentence string into a cup-free diagram.

    The sentence is split on single spaces, converted with
    ``diagram_to_sentence`` and then passed through ``remove_cups``.

    Args:
        sentence: String of the form ``"subject verb object"``.

    Returns:
        The diagram returned by ``remove_cups``.
    """
    tokens = sentence.split(" ")
    return remove_cups(diagram_to_sentence(tokens))

def make_diagram_a_circuit(diagram, ansatz, dagger=False):
    """Apply ``ansatz`` to ``diagram``, optionally taking the dagger.

    Args:
        diagram: Diagram handed to the ansatz.
        ansatz: Callable mapping a diagram to a circuit.
        dagger: When truthy, return ``circuit.dagger()`` instead of the
            circuit itself.

    Returns:
        The circuit produced by the ansatz, or its dagger.
    """
    circuit = ansatz(diagram)
    return circuit.dagger() if dagger else circuit

def concat_circuits_into_inner_product(circuit1, circuit2):
    """Compose two circuits sequentially with ``>>``.

    Args:
        circuit1: Circuit applied first.
        circuit2: Circuit applied second.

    Returns:
        ``circuit1 >> circuit2``.
    """
    return circuit1 >> circuit2

def make_diagrams(data, sentence1, sentence2=None):
    """Build the diagrams for a pair of sentences.

    Args:
        data: Table used to look sentences up when ``sentence1`` is a row
            index; ignored otherwise.
        sentence1: Either a row index (Python or NumPy integer) or the first
            sentence as a string.
        sentence2: Row index for the second sentence (defaults to the same
            row inside ``retrive_nth_rows_sentences``) or the second
            sentence as a string.

    Returns:
        ``(diagram1, diagram2)`` as produced by ``make_sentence_a_state``.
    """
    # isinstance also accepts NumPy integers (e.g. values taken from a pandas
    # index), which the exact `type(...) == int` comparison rejected and then
    # tried to parse as a sentence string.
    if isinstance(sentence1, (int, np.integer)):
        sentence1, sentence2 = retrive_nth_rows_sentences(data, sentence1, sentence2)
    diagram1 = make_sentence_a_state(sentence1)
    diagram2 = make_sentence_a_state(sentence2)
    return diagram1, diagram2

def make_circuit_from_diagrams(diagram1, diagram2, ansatz, drawing=False):
    """Compose the circuit of ``diagram1`` with the daggered circuit of ``diagram2``.

    Args:
        diagram1: Diagram converted to a circuit as-is.
        diagram2: Diagram converted to a circuit and then daggered.
        ansatz: Callable mapping a diagram to a circuit.
        drawing: When truthy, draw both component circuits and the
            composed circuit.

    Returns:
        ``(pennylane_circuit, discopy_circuit)`` where the first item is
        ``discopy_circuit.to_pennylane()``.
    """
    forward_circuit = make_diagram_a_circuit(diagram1, ansatz)
    daggered_circuit = make_diagram_a_circuit(diagram2, ansatz, dagger=True)
    composed = concat_circuits_into_inner_product(forward_circuit, daggered_circuit)

    if drawing:
        figures = (
            (forward_circuit, (5, 5)),
            (daggered_circuit, (5, 5)),
            (composed, (5, 10)),
        )
        for circuit, size in figures:
            circuit.draw(figsize=size)

    return composed.to_pennylane(), composed

def make_circuit_from_df_row(data, row_number, ansatz):
    """Build the circuits for the sentence pair stored in one data row.

    Args:
        data: Table passed on to ``make_diagrams``.
        row_number: Row whose two sentences are turned into diagrams.
        ansatz: Callable applied to each diagram to obtain a circuit.

    Returns:
        ``(qml_circuit, discopy_circuit)`` as returned by
        ``make_circuit_from_diagrams`` with drawing disabled.
    """
    diagram1, diagram2 = make_diagrams(data, row_number)
    qml_circuit, discopy_circuit = make_circuit_from_diagrams(diagram1, diagram2, ansatz, False)
    return qml_circuit, discopy_circuit

# Load the averaged data once at cell level. `dataset` is read as a global by
# get_datasets() for its 'score' labels; the call also writes the averaged
# CSV to disk as a side effect.
dataset, unique_word_list = read_and_preprocess_data()

In [36]:
def get_datasets(ansatz, BATCH_SIZE):
    """Build training and validation datasets of circuits and scores.

    Reads ``Data/TrainingData.txt`` and ``Data/TestData.txt``, turns every
    row into a discopy circuit, and looks the targets up in the module-level
    ``dataset['score']`` using each file's ``Unnamed: 0`` column.

    Args:
        ansatz: Callable mapping a diagram to a circuit.
        BATCH_SIZE: Batch size passed to both ``Dataset`` objects.

    Returns:
        ``(diagrams, train_dataset, val_dataset)`` where ``diagrams`` is the
        training circuits followed by the validation circuits.
    """
    def _row_circuits(frame):
        # Keep only the discopy circuit (second item) for each row.
        return [make_circuit_from_df_row(frame, row, ansatz)[1] for row in range(len(frame))]

    scores = dataset['score']

    train_frame = pd.read_csv("Data/TrainingData.txt")
    test_frame = pd.read_csv("Data/TestData.txt")

    train_circuits = _row_circuits(train_frame)
    train_targets = scores[train_frame['Unnamed: 0'].values]
    val_circuits = _row_circuits(test_frame)
    val_targets = scores[test_frame['Unnamed: 0'].values]

    all_circuits = train_circuits + val_circuits
    train_dataset = Dataset(train_circuits, train_targets, batch_size=BATCH_SIZE, shuffle=False)
    val_dataset = Dataset(val_circuits, val_targets, batch_size=BATCH_SIZE, shuffle=False)
    return all_circuits, train_dataset, val_dataset

In [37]:
class EncodedNumpyModel(NumpyModel):
    """NumpyModel variant whose weights come from ``param_initialise_method``."""

    def initialise_weights(self) -> None:
        """Set ``self.weights`` from ``self.param_initialise_method(self.symbols)``.

        NOTE(review): ``param_initialise_method`` is not defined in this
        class; presumably it is attached elsewhere — confirm before use.

        Raises:
            ValueError: If ``self.symbols`` is empty or unset.
        """
        if self.symbols:
            self.weights = self.param_initialise_method(self.symbols)
            return
        raise ValueError('Symbols not initialised. Instantiate through `from_diagrams()`.')

def model_with_parameters(parameters):
    """Train one model configuration unless its results folder already exists.

    A folder name is derived from the configuration; if ``models/<name>``
    exists the run is skipped. Otherwise the torch, ``random`` and NumPy
    seeds are set, the datasets and model are built, a ``QuantumTrainer`` is
    fitted, and the parameters plus loss curves are saved to that folder.

    Args:
        parameters: Dict with the keys ``model_type``, ``ansatz``, ``loss``,
            ``optimiser``, ``optimiser_parameters``, ``epochs``, ``batch``
            and ``seed``.

    Returns:
        ``0`` when the run folder already exists, otherwise ``None``.
    """
    def _tagged(mapping):
        # "<key><value>" pairs joined by underscores, in dict order.
        return '_'.join([f"{key}{val}" for key, val in mapping.items()])

    model_type = parameters['model_type']
    ansatz = parameters['ansatz']
    noun_count = ansatz.ob_map[Ty('n')]
    sentence_count = ansatz.ob_map[Ty('s')]
    ansatz_settings = {'n': noun_count, 's': sentence_count, 'layers': ansatz.n_layers}
    ansatz_name = ansatz.__class__.__name__.lower()

    loss_function = parameters['loss']
    loss_name = loss_function.__class__.__name__.lower()
    optimiser_cls = parameters['optimiser']
    optimizer_name = optimiser_cls.__name__.lower()
    optimiser_settings = parameters['optimiser_parameters']
    optimiser_tag = _tagged(optimiser_settings)
    ansatz_tag = _tagged(ansatz_settings)

    num_epochs = parameters['epochs']
    batch_size = parameters['batch']
    seed = parameters['seed']

    run_name = f"{model_type}_{ansatz_name}_{ansatz_tag}_{loss_name}_{optimizer_name}_{optimiser_tag}_epochs{num_epochs}_batch{batch_size}_seed{seed}"
    run_folder = os.path.join("models", run_name)

    # An existing folder means this configuration was already trained.
    if os.path.exists(run_folder):
        return 0

    # Seed every RNG in use before any data or model is built.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    diagrams, train_dataset, val_dataset = get_datasets(ansatz, batch_size)
    model = make_and_train_model(
        diagrams, train_dataset, val_dataset, model_type, noun_count, ansatz,
        sentence_count, ansatz_settings, ansatz_name, loss_name, optimizer_name,
        optimiser_tag, num_epochs, batch_size, seed,
    )

    trainer = QuantumTrainer(
        model,
        loss_function=loss_function,
        epochs=num_epochs,
        optimizer=optimiser_cls,
        optim_hyperparams=optimiser_settings,
        evaluate_on_train=True,
        verbose="text",
        seed=seed,
    )
    trainer.fit(train_dataset, val_dataset, logging_step=100)

    save_model_training(run_folder, parameters, trainer.train_costs, trainer.val_costs)
    return

def save_model_training(model_folder_path, model_params, training_losses, validation_losses):
    """Persist a run's parameters and loss curves into ``model_folder_path``.

    The folder is created if needed. ``model_params`` is written with
    ``joblib.dump`` to ``model_params.joblib``; the two loss sequences are
    written with ``np.save`` to ``training_losses.npy`` and
    ``validation_losses.npy``.

    Args:
        model_folder_path: Destination directory.
        model_params: Object to dump with joblib.
        training_losses: Training loss values.
        validation_losses: Validation loss values.
    """
    os.makedirs(model_folder_path, exist_ok=True)

    joblib.dump(model_params, os.path.join(model_folder_path, "model_params.joblib"))

    loss_files = (
        ("training_losses.npy", training_losses),
        ("validation_losses.npy", validation_losses),
    )
    for filename, losses in loss_files:
        np.save(os.path.join(model_folder_path, filename), losses)

def make_and_train_model(diagrams, train_dataset, val_dataset, model_type, noun_count, ansatz, sentence_count, asnatz_hyperparams, ansatz_name, loss_name, optimizer_name, optimizer_hyperparams_str, num_epochs, batch_size, seed):
    """Instantiate the model selected by ``model_type``.

    Despite the name, no training happens here; the caller fits the
    returned model. Only ``diagrams`` and ``model_type`` are used; the
    remaining parameters are accepted for call compatibility.

    Args:
        diagrams: Circuits passed to ``NumpyModel.from_diagrams``.
        model_type: Model selector; only ``'random'`` is supported.

    Returns:
        A ``NumpyModel`` built from ``diagrams`` with ``use_jit=True``.

    Raises:
        ValueError: If ``model_type`` is not a supported value.
    """
    if model_type == 'random':
        return NumpyModel.from_diagrams(diagrams, use_jit=True)
    # The branch is selected by model_type, so report that value rather than
    # the (valid) ansatz name.
    raise ValueError(f"Invalid model type: {model_type}")

In [41]:
# Run: IQPAnsatz mapping NOUN and SENTENCE to 1 with n_layers=3, MSE loss,
# SPSA optimiser, 200 epochs, batch size 2, seed 42.
# NOTE: model_with_parameters() returns 0 without training when this
# configuration's folder under "models" already exists.
model_with_parameters({
    "model_type":'random',
    "ansatz":IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}, n_layers=3),
    "loss":MSELoss(),
    "optimiser":SPSAOptimizer,
    # 'A' is written as 0.01 * <epoch count> (200 here, matching "epochs").
    "optimiser_parameters":{'a': 0.75, 'c': 0.1, 'A':0.01*200},
    "seed":42,
    "batch":2,
    "epochs":200
})

Epoch 1:    train/loss: 0.1864   valid/loss: 0.1205
Epoch 100:  train/loss: 0.0633   valid/loss: 0.0981
Epoch 200:  train/loss: 0.0440   valid/loss: 0.1016

Training completed!


In [42]:
# Short run: same ansatz, loss, optimiser class, seed and batch size as the
# 200-epoch run, but only 2 epochs and a larger SPSA 'a' (1.75 instead of 0.75).
# The different hyperparameters give a different folder name under "models",
# so this is stored as a separate run.
model_with_parameters({
    "model_type":'random',
    "ansatz":IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1}, n_layers=3),
    "loss":MSELoss(),
    "optimiser":SPSAOptimizer,
    # 'A' is written as 0.01 * <epoch count> (2 here, matching "epochs").
    "optimiser_parameters":{'a': 1.75, 'c': 0.1, 'A':0.01*2},
    "seed":42,
    "batch":2,
    "epochs":2
})

Epoch 1:  train/loss: 0.1882   valid/loss: 0.1259

Training completed!
