# Sentence-BERT Local Coherence

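Compute local coherence scores for cw responses using Sentence-BERT embeddings: each passage is scored by the average cosine similarity between the embeddings of each sentence and the sentence that follows it.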

Run on Google Colab.

In [None]:
!pip install sentence-transformers


In [None]:
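# Unused alternative: download the workbook with wget instead of reading it by URL.
# `?raw=true` is needed because the plain /blob/ URL returns GitHub's HTML page, not the file.
#!wget -O actual_cw_passage_content.xlsx "https://github.com/ijyliu/anlp23-project/blob/main/Data/actual_cw_passage_content.xlsx?raw=true"
# Permalink to the same file at a pinned commit:
#https://github.com/ijyliu/anlp23-project/blob/947bc984cb7ead83eca0943f87ac3db44669a2fb/Data/actual_cw_passage_content.xlsx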


In [None]:
# Get passage content

import pandas as pd

# Direct-download URL of the Excel workbook on GitHub.
# The `/raw/` path serves the file bytes; the `/blob/` path serves an HTML
# viewer page, which pd.read_excel cannot parse as a workbook.
url = 'https://github.com/ijyliu/anlp23-project/raw/main/Data/actual_cw_passage_content.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(url)

# Fail early with a clear message: the scoring step below reads df['passage'].
assert 'passage' in df.columns, "expected a 'passage' column in the workbook"

# Display the first few rows of the dataframe as a quick check of the load
print(df.head())


In [None]:
# Packages
import re
from math import sqrt

import numpy as np
from sentence_transformers import SentenceTransformer


In [None]:
# Instantiate the Sentence-BERT encoder that the cells below use to embed text
model_name = 'sentence-transformers/all-distilroberta-v1'
sentence_model = SentenceTransformer(model_name)


In [None]:
# Sanity check: embed one short sentence and print the shape of the result
sample_sentence = "this is a sentence"
embedding = sentence_model.encode(sample_sentence)
print(embedding.shape)


In [None]:
# Cosine similarity between embeddings
def cosine(one, two):
  """Return the cosine similarity of two vectors.

  Args:
    one: First vector (any 1-D sequence accepted by np.dot).
    two: Second vector, same length as `one`.

  Returns:
    The dot product of the vectors divided by the product of their
    Euclidean norms.
  """
  dot_product = np.dot(one, two)
  norm_one = sqrt(np.dot(one, one))
  norm_two = sqrt(np.dot(two, two))
  return dot_product / (norm_one * norm_two)


In [None]:
# Function to get passage average cosine similarity between sentence and the following sentence
def get_passage_avg_cosine_sim(passage):
  """Return the mean cosine similarity between consecutive sentences of a passage.

  Args:
    passage: The passage text as a single string (split into sentences here),
      or an iterable of sentence strings that is already split.

  Returns:
    The average cosine similarity over all adjacent sentence pairs, or NaN
    when the passage is missing (None / NaN cell) or has fewer than two
    sentences, since no pair exists to score.
  """
  # Empty spreadsheet cells reach us as None or as a float NaN, never as text.
  if passage is None or isinstance(passage, float):
    return float('nan')
  if isinstance(passage, str):
    # Encoding the whole string would produce ONE vector for the passage, so the
    # text has to be split into sentences first. This splits after ., ! or ?
    # followed by whitespace; it is a heuristic and will also split after
    # abbreviations such as "Dr.".
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', passage.strip()) if s]
  else:
    sentences = list(passage)
  # With fewer than two sentences there is no adjacent pair to compare.
  if len(sentences) < 2:
    return float('nan')
  # One embedding per sentence
  sentence_embeddings = sentence_model.encode(sentences)
  # Cosine similarity between each sentence and the one after it
  cosine_sim = [
    cosine(current, following)
    for current, following in zip(sentence_embeddings[:-1], sentence_embeddings[1:])
  ]
  # Return average cosine similarity
  return sum(cosine_sim) / len(cosine_sim)


In [None]:
# Score every passage and store the result in a new 'avg_cosine_sim' column
passage_column = df['passage']
df['avg_cosine_sim'] = passage_column.apply(get_passage_avg_cosine_sim)


In [None]:
# Save similarities
# Mount Google Drive so the results can be written to it (google.colab is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')
# Destination workbook inside the mounted Drive.
# NOTE(review): assumes the ANLP23_Project_Data folder already exists in Drive — confirm.
filename_to_save = '/content/drive/My Drive/ANLP23_Project_Data/actual_cw_passage_content_with_sim.xlsx'

# Write df to Excel
# NOTE(review): to_excel is called with its defaults, which presumably also writes the index as a column — confirm that is wanted.
df.to_excel(filename_to_save)
