# Sentence-BERT Inter-Paragraph Coherence

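Compute inter-paragraph coherence scores for cw responses. Each paragraph is embedded as the mean of the Sentence-BERT embeddings of its sentences, and a response's score is the average cosine similarity between each paragraph and the one that follows it.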

Run on Google Colab.

In [1]:
# sentence-transformers package setup
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=b18d3677fb02384dc54ff6bfac2a77605cb952a55ea1a64480b0a63f61519112
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tra

In [2]:
# Google Drive setup
# Input spreadsheet with the passages, and the output spreadsheet that will
# additionally carry the inter-paragraph similarity columns.
filename_to_load = '/content/drive/My Drive/ANLP23_Project_Data/actual_cw_passage_content.xlsx'
filename_to_save = '/content/drive/My Drive/ANLP23_Project_Data/actual_cw_passage_content_with_inter_paragraph_sim.xlsx'

# Mount Drive so that the two paths above resolve under /content/drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Get passage content

import pandas as pd

# Load the passage spreadsheet into a DataFrame
df = pd.read_excel(filename_to_load)

# Preview the first rows to sanity-check the columns that were read
first_rows = df.head()
print(first_rows)


                     model_task_method  conversation_number  \
0  gpt4_cw_ape_zero_shot_cot_responses                    1   
1  gpt4_cw_ape_zero_shot_cot_responses                    2   
2  gpt4_cw_ape_zero_shot_cot_responses                    3   
3  gpt4_cw_ape_zero_shot_cot_responses                    4   
4  gpt4_cw_ape_zero_shot_cot_responses                    5   

                                            response  
0  Joe was an astronaut with a rather unusual hob...  
1  There lived a hawk on the hill, tall and proud...  
2  Being an outdoors enthusiast, one might consid...  
3  Edward was a corporate executive, laden with p...  
4  Joe was a curious and inventive character, alw...  


In [4]:
# Packages
from math import sqrt
from sentence_transformers import SentenceTransformer
import numpy as np
import nltk
# Download the Punkt tokenizer data; nltk.sent_tokenize is used later in this
# notebook to split paragraphs into sentences.
nltk.download('punkt')
from tqdm import tqdm
# Register tqdm with pandas (this is what provides DataFrame.progress_apply)
tqdm.pandas()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Load model
# Identifier of the pretrained sentence-embedding model used by every cell below
model_name = 'sentence-transformers/all-distilroberta-v1'
sentence_model = SentenceTransformer(model_name)


.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
# Testing embedding creation
# Encode one short sentence and show the shape of the resulting vector
sample_sentence = "this is a sentence"
embedding = sentence_model.encode(sample_sentence)
print(embedding.shape)


(768,)


In [7]:
# Cosine similarity between embeddings
def cosine(one, two):
    """Return the cosine similarity of two 1-D vectors.

    The value is the dot product of ``one`` and ``two`` divided by the product
    of their Euclidean norms. No special handling is done for zero vectors.
    """
    dot_product = np.dot(one, two)
    norm_one = sqrt(np.dot(one, one))
    norm_two = sqrt(np.dot(two, two))
    return dot_product / (norm_one * norm_two)


In [8]:
# Trying to split responses into paragraphs

# Split on newlines. Pieces that are empty or whitespace-only (produced by blank
# lines between paragraphs or by a trailing newline) are dropped, because they
# contain no sentences and therefore cannot be embedded as a paragraph.
paragraph_lists = [
    [paragraph for paragraph in response.split('\n') if paragraph.strip()]
    for response in df['response']
]

# Print up to 100 random elements of paragraph_lists
import random
random.seed(42)
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds.
print(random.sample(paragraph_lists, min(100, len(paragraph_lists))))




In [13]:
# Statistics of paragraph length
# Number of paragraphs found in each passage
passage_para_lengths = list(map(len, paragraph_lists))
for stat_label, stat_function in (('mean length', np.mean),
                                  ('min length', np.min),
                                  ('max length', np.max)):
    print(stat_label)
    print(stat_function(passage_para_lengths))
# Print out non-conforming paragraphs (passages that do not have exactly 2 paragraphs)
anomaly_counter = 0
for passage_paragraphs, paragraph_count in zip(paragraph_lists, passage_para_lengths):
    if paragraph_count == 2:
        continue
    print('anomaly')
    print(passage_paragraphs)
    print(paragraph_count)
    anomaly_counter += 1
print('num anomalies')
print(anomaly_counter)

mean length
1.44625
min length
1
max length
4


In [9]:
# Function to get average cosine similarity between paragraph embeddings - averages of sentence embeddings
def get_passage_interparagraph_cosine_sim(paragraph_list):
    """Score a passage by the mean cosine similarity of consecutive paragraphs.

    Each paragraph is split into sentences with ``nltk.sent_tokenize``; every
    sentence is encoded with the notebook-level ``sentence_model`` and the
    sentence embeddings are averaged into a single paragraph embedding.
    Adjacent paragraph embeddings are then compared with ``cosine``.

    Args:
        paragraph_list: Paragraph strings of one passage, in reading order.

    Returns:
        A tuple ``(avg_cosine_sim, num_paragraphs, paragraph_num_sentences)``:
          * ``avg_cosine_sim`` - mean cosine similarity over all pairs of
            consecutive paragraphs, or ``nan`` when the passage has fewer than
            two embeddable paragraphs (there is no pair to compare).
          * ``num_paragraphs`` - number of paragraphs that were embedded.
          * ``paragraph_num_sentences`` - list with the sentence count of each
            embedded paragraph.
    """
    # List of paragraph embeddings
    paragraph_embeddings = []
    # List of paragraph number of sentences
    paragraph_num_sentences = []
    # Loop through paragraphs
    for paragraph in paragraph_list:
        # Split into sentences
        sentences = nltk.sent_tokenize(paragraph)
        if not sentences:
            # A paragraph with no sentences (e.g. an empty string) has nothing to
            # average; np.mean over an empty list would yield a NaN embedding
            # and poison the passage score, so it is skipped and not counted.
            continue
        # Get sentence embeddings
        sentence_embeddings = [sentence_model.encode(sentence) for sentence in sentences]
        # Average sentence embeddings
        paragraph_embeddings.append(np.mean(sentence_embeddings, axis=0))
        # Saving number of sentences there were embeddings for
        paragraph_num_sentences.append(len(sentence_embeddings))
    # Number of paragraphs
    num_paragraphs = len(paragraph_embeddings)
    # With fewer than two paragraphs there is no consecutive pair; report the
    # score as undefined instead of dividing by zero below.
    if num_paragraphs < 2:
        return float('nan'), num_paragraphs, paragraph_num_sentences
    # Get cosine similarity between paragraph and the one after it
    cosine_sim = [
        cosine(current_embedding, next_embedding)
        for current_embedding, next_embedding in zip(paragraph_embeddings, paragraph_embeddings[1:])
    ]
    # Return average cosine similarity
    return sum(cosine_sim)/len(cosine_sim), num_paragraphs, paragraph_num_sentences


In [12]:
# Test function
# Test 1: two thematically related paragraphs
test_1_paragraphs = [
    "My life has been filled with hardships, but I continue to power forward. I'm a firm believer that every difficulty I face can be used to strengthen my character. I'm not the same person I was at the start, and for that I'm thankful. I'm a living furnace, forging my character so that I may become a stronger, more successful person.",
    "In some ways, life is like a rose - beautiful, but with thorns. The thorns represent those moments of hardship, challenge, and heartache we face. Yet despite the difficulties we might endure, there is still beauty to be found in life. There's a reason that roses have thorns, despite their beauty. They provide the strength and courage to help us overcome any obstacle we may face.",
]
test_1_avg, test_1_num_para, test_1_num_sent = get_passage_interparagraph_cosine_sim(test_1_paragraphs)
for test_1_item in ('test 1', test_1_avg, test_1_num_para, test_1_num_sent):
    print(test_1_item)

# Test 2: three short, unrelated paragraphs
test_2_paragraphs = [
    'The boat sailed out of the harbor. Paul joined the other construction workers on the job site.',
    'Hi, my name is Bob.',
    'Ouch, that hurt.',
]
test_2_avg, test_2_num_para, test_2_num_sent = get_passage_interparagraph_cosine_sim(test_2_paragraphs)
for test_2_item in ('test 2', test_2_avg, test_2_num_para, test_2_num_sent):
    print(test_2_item)


test 1
0.4835480511380878
2
[4, 5]
test 2
0.1155728341639507
3
[2, 1, 1]


In [None]:
# Create lists of average_cosine_sim, passage_num_paragraphs, passage_num_sentences
average_cosine_sims_responses = []
passage_num_paragraphs_responses = []
passage_num_sentences_responses = []
# Collectors listed in the same order as the tuple returned by the scoring function
result_collectors = (
    average_cosine_sims_responses,
    passage_num_paragraphs_responses,
    passage_num_sentences_responses,
)
for passage_paragraphs in tqdm(paragraph_lists):
    passage_result = get_passage_interparagraph_cosine_sim(passage_paragraphs)
    for collector, value in zip(result_collectors, passage_result):
        collector.append(value)


In [None]:
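# Create column for average cosine similarity
# NOTE: unused alternative to the list-building loop in the previous cell, kept for reference.
# `get_passage_avg_cosine_sim` is not defined in any cell shown in this notebook,
# so this line cannot be re-enabled as written.
#df[['avg_cosine_sim', 'num_sentences']] = df.progress_apply(lambda row: get_passage_avg_cosine_sim(row['response']), axis=1)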


In [None]:
# Create dataframe columns
# One entry per new column, in the order they are added to df.
# 'num_sentences' receives, for every passage, the list of per-paragraph
# sentence counts collected above.
new_columns = {
    'avg_inter_paragraph_cosine_sim': average_cosine_sims_responses,
    'num_paragraphs': passage_num_paragraphs_responses,
    'num_sentences': passage_num_sentences_responses,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values


In [None]:
# Write df to Excel
# Save the passages together with the newly added similarity columns
df.to_excel(excel_writer=filename_to_save)
