<a href="https://colab.research.google.com/github/joyennn/scalar-implicature/blob/main/scalar_implicature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers torch numpy scikit-learn

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
### Experiment1 ###

#import data

import csv

def load_csv_to_list(file_path):
    """Read a UTF-8 CSV file and return its rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        that record's fields as strings. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line endings to the csv module untouched, so a line
    # break inside a quoted field is kept exactly as it was written.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file))

# Experiment 1 input. The path is relative, so it is resolved against the
# current working directory; every row of the file is kept as a list of strings.
file_path = 'ex1_data.csv'
data = load_csv_to_list(file_path)

In [None]:
import torch
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch.nn.functional as F

#embeddings for BERT
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once instead of on every call.
_BERT_EMBEDDING_CACHE = {}


def _load_bert_encoder(checkpoint='bert-base-uncased'):
    """Return the (tokenizer, model) pair for `checkpoint`, loading it on first use only."""
    if checkpoint not in _BERT_EMBEDDING_CACHE:
        _BERT_EMBEDDING_CACHE[checkpoint] = (
            BertTokenizer.from_pretrained(checkpoint),
            BertModel.from_pretrained(checkpoint),
        )
    return _BERT_EMBEDDING_CACHE[checkpoint]


def bert_embeddings(sentences):
    """Embed each sentence with 'bert-base-uncased' using its first-token vector.

    Args:
        sentences: A list of strings; they are tokenized together as one
            padded, truncated batch.

    Returns:
        A 2-D tensor with one row per sentence, taken from position 0 of the
        model's last hidden state (the [CLS] position).
    """
    tokenizer, model = _load_bert_encoder()
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of every sequence is the CLS embedding.
    return outputs.last_hidden_state[:, 0, :]

#embeddings for GPT-2
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once instead of on every call.
_GPT2_EMBEDDING_CACHE = {}


def _load_gpt2_encoder(checkpoint='gpt2'):
    """Return the (tokenizer, model) pair for `checkpoint`, loading it on first use only."""
    if checkpoint not in _GPT2_EMBEDDING_CACHE:
        tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
        # GPT-2 ships without a padding token; reuse EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
        _GPT2_EMBEDDING_CACHE[checkpoint] = (tokenizer, GPT2Model.from_pretrained(checkpoint))
    return _GPT2_EMBEDDING_CACHE[checkpoint]


def gpt2_embeddings(sentences):
    """Embed each sentence with 'gpt2' as the mean of its token vectors.

    Padding positions are excluded from the mean, so a sentence gets the same
    embedding whether it is passed alone or in a batch with longer sentences.

    Args:
        sentences: A list of strings; they are tokenized together as one
            padded, truncated batch.

    Returns:
        A 2-D tensor with one row per sentence: the attention-mask-weighted
        average of the last hidden state over the sequence dimension.
    """
    tokenizer, model = _load_gpt2_encoder()
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # clamp keeps the division defined even if a row had no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts

# Cosine similarity and transformation
def cosine_similarity(embeddings1, embeddings2):
    """Score how similar two single-row embeddings are.

    The cosine similarity is first rescaled from [-1, 1] to [0, 1] and the
    rescaled value is then passed through a logistic sigmoid.

    Args:
        embeddings1: Tensor holding one embedding row.
        embeddings2: Tensor holding one embedding row of the same width.

    Returns:
        The sigmoid of the rescaled cosine similarity, as a scalar.
    """
    raw_cosine = F.cosine_similarity(embeddings1, embeddings2).item()
    # Map [-1, 1] onto [0, 1] before squashing.
    rescaled = (raw_cosine + 1) / 2
    return 1 / (1 + np.exp(-rescaled))

In [None]:
# Score every sentence pair with both models; one output row per input row.
result = []
for row in data:
    sentence1, sentence2, interpretation = row[0], row[1], row[2]

    # Each sentence is embedded on its own (batch of one) before comparison.
    bert_similarity = cosine_similarity(
        bert_embeddings([sentence1]),
        bert_embeddings([sentence2]),
    )
    gpt2_similarity = cosine_similarity(
        gpt2_embeddings([sentence1]),
        gpt2_embeddings([sentence2]),
    )

    result.append([sentence1, sentence2, interpretation, bert_similarity, gpt2_similarity])

In [None]:
#save result to .csv

def save_list_to_csv(data_list, file_path):
    """Write the rows of `data_list` to `file_path` as UTF-8 CSV.

    An existing file at `file_path` is overwritten.

    Args:
        data_list: Iterable of rows, each row an iterable of field values.
        file_path: Destination path of the CSV file.
    """
    with open(file_path, mode='w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(data_list)

# Experiment 1 output: write the collected `result` rows to a CSV file.
# The file is opened in 'w' mode, so a previous result file is replaced.
file_path = 'ex1_result.csv'
save_list_to_csv(result, file_path)

In [None]:
### Experiment2 ###

#import data

import csv

def load_csv_to_list(file_path):
    """Read a UTF-8 CSV file and return its rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of
        that record's fields as strings. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line endings to the csv module untouched, so a line
    # break inside a quoted field is kept exactly as it was written.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file))

# Experiment 2 input. The path is relative, so it is resolved against the
# current working directory; this rebinds `file_path` and `data`.
file_path = 'ex2_data.csv'
data = load_csv_to_list(file_path)

In [None]:
import torch
from transformers import BertTokenizer, BertForNextSentencePrediction, GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

# probability for BERT
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once instead of on every call.
_BERT_NSP_CACHE = {}


def _load_bert_nsp(checkpoint='bert-base-uncased'):
    """Return the (tokenizer, NSP model) pair for `checkpoint`, loading it on first use only."""
    if checkpoint not in _BERT_NSP_CACHE:
        _BERT_NSP_CACHE[checkpoint] = (
            BertTokenizer.from_pretrained(checkpoint),
            BertForNextSentencePrediction.from_pretrained(checkpoint),
        )
    return _BERT_NSP_CACHE[checkpoint]


def bert_nsp_probability(question, answer):
    """Return BERT's next-sentence-prediction probability for a question/answer pair.

    Args:
        question: First segment of the pair.
        answer: Second segment of the pair.

    Returns:
        A Python float: the softmax probability of class 0 of the
        next-sentence-prediction head for the encoded pair.
    """
    tokenizer, model = _load_bert_nsp()
    # Calling the tokenizer directly encodes the two segments as one pair;
    # it replaces the deprecated encode_plus.
    inputs = tokenizer(question, answer, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    # Per the Transformers documentation, class 0 means "the second segment
    # continues the first".
    return probs[0, 0].item()

# probability for GPT-2
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once instead of on every call.
_GPT2_LM_CACHE = {}


def _load_gpt2_lm(checkpoint='gpt2'):
    """Return the (tokenizer, LM-head model) pair for `checkpoint`, loading it on first use only."""
    if checkpoint not in _GPT2_LM_CACHE:
        tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
        tokenizer.pad_token = tokenizer.eos_token
        _GPT2_LM_CACHE[checkpoint] = (tokenizer, GPT2LMHeadModel.from_pretrained(checkpoint))
    return _GPT2_LM_CACHE[checkpoint]


def gpt2_next_token_probability(question, answer):
    """Return exp(-loss) of GPT-2 on the string "<question> <answer>".

    The loss is the one the model returns when the input ids are also passed
    as labels, so the score covers the whole concatenated string (question
    tokens included), not the answer alone.
    NOTE(review): presumably this loss is the mean per-token cross-entropy,
    which would make the result a per-token geometric-mean probability;
    confirm against the Transformers documentation.

    Args:
        question: Text placed first in the sequence.
        answer: Text appended after a single space.

    Returns:
        exp(-loss) as a NumPy float.
    """
    tokenizer, model = _load_gpt2_lm()
    sequence = f"{question} {answer}"
    inputs = tokenizer(sequence, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
    return np.exp(-outputs.loss.item())

# Surprisal
def compute_surprisal(probability):
    """Return the surprisal of `probability` in bits (negated base-2 logarithm)."""
    log2_probability = np.log2(probability)
    return -log2_probability

In [None]:
# Turn each question/answer pair into one surprisal value per model.
result = []
for row in data:
    question, answer, qud = row[0], row[1], row[2]

    bert_surprisal = compute_surprisal(bert_nsp_probability(question, answer))
    gpt2_surprisal = compute_surprisal(gpt2_next_token_probability(question, answer))

    result.append([question, answer, qud, bert_surprisal, gpt2_surprisal])

In [None]:
#save result to .csv

def save_list_to_csv(data_list, file_path):
    """Write the rows of `data_list` to `file_path` as UTF-8 CSV.

    An existing file at `file_path` is overwritten.

    Args:
        data_list: Iterable of rows, each row an iterable of field values.
        file_path: Destination path of the CSV file.
    """
    with open(file_path, mode='w', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file).writerows(data_list)

# Experiment 2 output: write the collected `result` rows to a CSV file.
# The file is opened in 'w' mode, so a previous result file is replaced.
file_path = 'ex2_result.csv'
save_list_to_csv(result, file_path)