# Sentence-BERT Inter-Paragraph Coherence

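Compute inter-paragraph coherence scores for cw responses using Sentence-BERT embeddings. Each response is split into paragraphs, each paragraph is embedded as the mean of its sentence embeddings, and the response's score is the average cosine similarity between each paragraph and the paragraph that follows it.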

Run on Google Colab.

In [13]:
# sentence-transformers package setup
!pip install sentence-transformers




In [14]:
# Mount Google Drive and declare the input/output workbook paths
from google.colab import drive

# Workbook that is read into the dataframe
filename_to_load = '/content/drive/My Drive/ANLP23_Project_Data/actual_cw_passage_content.xlsx'
# Workbook that the dataframe is written to at the end of the notebook
filename_to_save = '/content/drive/My Drive/ANLP23_Project_Data/actual_cw_passage_content_with_inter_paragraph_sim.xlsx'

drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Load the passage content

import pandas as pd

# Parse the Excel workbook into a dataframe
df = pd.read_excel(io=filename_to_load)

# Preview the first five rows
print(df.head(n=5))


                     model_task_method  conversation_number  \
0  gpt4_cw_ape_zero_shot_cot_responses                    1   
1  gpt4_cw_ape_zero_shot_cot_responses                    2   
2  gpt4_cw_ape_zero_shot_cot_responses                    3   
3  gpt4_cw_ape_zero_shot_cot_responses                    4   
4  gpt4_cw_ape_zero_shot_cot_responses                    5   

                                            response  
0  Joe was an astronaut with a rather unusual hob...  
1  There lived a hawk on the hill, tall and proud...  
2  Being an outdoors enthusiast, one might consid...  
3  Edward was a corporate executive, laden with p...  
4  Joe was a curious and inventive character, alw...  


In [16]:
# Packages
from math import sqrt

import nltk
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Tokenizer data used by nltk.sent_tokenize. Older NLTK releases load 'punkt',
# while recent releases look up 'punkt_tab' instead, so fetch both to keep the
# notebook runnable on whichever NLTK version the runtime ships.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Registers the progress_apply methods on pandas objects
tqdm.pandas()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Load the Sentence-BERT model used for every embedding in this notebook
sentence_model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-distilroberta-v1'
)


In [18]:
# Sanity check: embed one sentence and show the shape of the resulting vector
embedding = sentence_model.encode(sentences="this is a sentence")
print(embedding.shape)


(768,)


In [19]:
# Cosine similarity between embeddings
def cosine(one, two):
    """Return the cosine similarity of two 1-D vectors.

    Computed as dot(one, two) / (|one| * |two|), where each norm is the
    square root of the vector's dot product with itself.

    Args:
        one: first vector (anything ``np.dot`` accepts).
        two: second vector, same length as ``one``.

    Returns:
        The similarity as a scalar. No guard is applied for zero-length
        vectors, whose norm is 0.
    """
    dot_product = np.dot(one, two)
    norm_one = sqrt(np.dot(one, one))
    norm_two = sqrt(np.dot(two, two))
    return dot_product / (norm_one * norm_two)


In [None]:
# Split responses into paragraphs

import random

# Split on newlines and drop blank lines. Paragraphs separated by an empty line
# ('\n\n') would otherwise produce empty strings, which contain no sentences to
# embed. Missing responses are treated as empty text so that .split cannot fail
# on a non-string cell.
paragraph_lists = [
    [paragraph.strip() for paragraph in response.split('\n') if paragraph.strip()]
    for response in df['response'].fillna('').astype(str)
]

# Spot-check a handful of random responses. The sample size is capped by the
# number of responses so random.sample cannot raise on a small dataset, and it
# is kept small so the cell does not dump whole passages by the hundred.
random.seed(42)
for sampled_paragraphs in random.sample(paragraph_lists, min(5, len(paragraph_lists))):
    print(sampled_paragraphs)


In [None]:
# Function to get average cosine similarity between paragraph embeddings - averages of sentence embeddings
def get_passage_interparagraph_cosine_sim(paragraph_list):
    """Average the cosine similarity between consecutive paragraphs of a passage.

    Each paragraph is split into sentences with ``nltk.sent_tokenize``, the
    sentences are embedded with the module-level ``sentence_model``, and the
    paragraph embedding is the mean of its sentence embeddings. The score is
    the mean of ``cosine(paragraph[i], paragraph[i + 1])`` over all adjacent
    pairs.

    Paragraphs that yield no sentences (empty or whitespace-only strings, e.g.
    the blank lines left by splitting on newlines) are skipped: they have no
    embedding, and averaging an empty list would inject NaN into the score.

    Args:
        paragraph_list: iterable of paragraph strings.

    Returns:
        A tuple ``(avg_cosine_sim, num_paragraphs, paragraph_num_sentences)``:
        the average similarity, the number of paragraphs that were embedded,
        and a list with the sentence count of each embedded paragraph. When
        fewer than two paragraphs were embedded there is no pair to compare
        and ``avg_cosine_sim`` is the placeholder value 1.
    """
    # One mean embedding and one sentence count per non-empty paragraph
    paragraph_embeddings = []
    paragraph_num_sentences = []
    for paragraph in paragraph_list:
        sentences = [s for s in nltk.sent_tokenize(paragraph) if s.strip()]
        if not sentences:
            # Nothing to embed; do not count this as a paragraph
            continue
        # Encode all sentences of the paragraph in one batched call instead of
        # one model call per sentence
        sentence_embeddings = sentence_model.encode(sentences)
        paragraph_embeddings.append(np.mean(sentence_embeddings, axis=0))
        paragraph_num_sentences.append(len(sentences))

    num_paragraphs = len(paragraph_embeddings)
    if num_paragraphs > 1:
        # Similarity between each paragraph and the one after it
        cosine_sim = [
            cosine(current, following)
            for current, following in zip(paragraph_embeddings, paragraph_embeddings[1:])
        ]
        avg_cosine_sim = np.mean(cosine_sim)
    else:
        # Placeholder for "no adjacent pair"
        avg_cosine_sim = 1
    return avg_cosine_sim, num_paragraphs, paragraph_num_sentences


In [21]:
# Smoke-test the function: first a two-paragraph passage, then a one-paragraph passage
test_1_avg, test_1_num_para, test_1_num_sent = get_passage_interparagraph_cosine_sim(
    [
        "My life has been filled with hardships, but I continue to power forward. I'm a firm believer that every difficulty I face can be used to strengthen my character. I'm not the same person I was at the start, and for that I'm thankful. I'm a living furnace, forging my character so that I may become a stronger, more successful person.",
        "In some ways, life is like a rose - beautiful, but with thorns. The thorns represent those moments of hardship, challenge, and heartache we face. Yet despite the difficulties we might endure, there is still beauty to be found in life. There's a reason that roses have thorns, despite their beauty. They provide the strength and courage to help us overcome any obstacle we may face.",
    ]
)
# One value per line: label, average similarity, paragraph count, sentence counts
print('test 1', test_1_avg, test_1_num_para, test_1_num_sent, sep='\n')

# The whole passage is passed as a single list element here
test_2_avg, test_2_num_para, test_2_num_sent = get_passage_interparagraph_cosine_sim(
    ['The boat sailed out of the harbor. Paul joined the other construction workers on the job site.']
)
print('test 2', test_2_avg, test_2_num_para, test_2_num_sent, sep='\n')


test 1
0.4240155942444682
2
test 2
0.07097828921740414
2


In [None]:
# Score every response, collecting the three outputs of the function in parallel lists
average_cosine_sims_responses = []
passage_num_paragraphs_responses = []
passage_num_sentences_responses = []

# Same order as the tuple returned by get_passage_interparagraph_cosine_sim
collectors = (
    average_cosine_sims_responses,
    passage_num_paragraphs_responses,
    passage_num_sentences_responses,
)
for plist in tqdm(paragraph_lists):
    outputs = get_passage_interparagraph_cosine_sim(plist)
    for collector, value in zip(collectors, outputs):
        collector.append(value)


 27%|██▋       | 436/1600 [07:33<15:51,  1.22it/s]

In [None]:
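# Create column for average cosine similarity
# (disabled alternative: a row-wise progress_apply; it calls get_passage_avg_cosine_sim, which the cells shown here do not define)
#df[['avg_cosine_sim', 'num_sentences']] = df.progress_apply(lambda row: get_passage_avg_cosine_sim(row['response']), axis=1)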


In [None]:
# Attach the per-response results to the dataframe, one column per result list
result_columns = {
    'avg_inter_paragraph_cosine_sim': average_cosine_sims_responses,
    'num_paragraphs': passage_num_paragraphs_responses,
    # Each entry is a list holding the sentence count of every paragraph
    'num_sentences': passage_num_sentences_responses,
}
for column_name, column_values in result_columns.items():
    df[column_name] = column_values


In [None]:
# Responses with fewer than two paragraphs have no adjacent pair to compare, so
# their score is undefined: set it to NaN. The mask is built from the paragraph
# count rather than from the placeholder value 1, because replacing every 1
# would also erase a genuine similarity of exactly 1. The cast to float keeps
# the column numeric even when every row holds the integer placeholder.
df['avg_inter_paragraph_cosine_sim'] = (
    df['avg_inter_paragraph_cosine_sim']
    .astype(float)
    .mask(df['num_paragraphs'] < 2)
)


In [None]:
# Save the dataframe, including the new columns, to the output workbook
df.to_excel(excel_writer=filename_to_save)
