In [3]:
import pandas as pd
import PyPDF2
from PyPDF2 import PdfReader
import bertopic
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from matplotlib import pyplot as plt
import warnings
from transformers import pipeline
import re
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 500)


In [4]:
def extract_text_from_pdfs(pdf_files):
    # Create an empty data frame
    df = pd.DataFrame(columns=['file', 'text'])

    # Iterate over the PDF files
    for pdf_file in pdf_files:
        # Open the PDF file
        with open(pdf_file, 'rb') as f:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(f)

            # Get the number of pages in the PDF
            num_pages = len(pdf_reader.pages) 

            # Initialize a string to store the text from the PDF
            text = ""

            # Iterate over all the pages
            for page_num in range(num_pages):
                if page_num < 48:
                # Get the page object
                    page = pdf_reader.pages[page_num]

                # Extract the text from the page
                    page_text = page.extract_text()

                # Add the page text to the overall text
                    text += page_text

            # Add the file name and the text to the data frame
            df = df.append({'file': pdf_file.name, 'text': text}, ignore_index=True)

    # Return the data frame
    return df


In [5]:
from pathlib import Path
path='data/'
files = Path(path).glob("WHR+22.pdf")
df = extract_text_from_pdfs(files)
df

Unnamed: 0,file,text
0,WHR+22.pdf,"John F. Helliwell, Richard Layard, Jeffrey D. Sachs, \nJan-Emmanuel De Neve, Lara B. Aknin, and Shun Wang2022\nThe World Happiness Report was written by a group of independent experts acting in \ntheir personal capacities. Any views expressed in this report do not necessarily reflect the views of any organization, agency or programme of the United Nations.Table of Contents\nWorld Happiness Report \n2022\n Foreword . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ...."


In [6]:
# Import the re module for regular expressions
import re

def preprocess_text(text_list):
    # Initialize a list to store the pre-processed text
    processed_text = []

    # Iterate over the text in the list
    for text in text_list:
      num_words = len(text.split(" "))
      if num_words > 15:
        processed_text.append(text)

    # Return the pre-processed text
    return processed_text


def remove_short_sentences(df):
  df['sentences'] = df['sentences'].apply(preprocess_text)
  return df

In [7]:
df['sentences'] = df['text'].apply(lambda long_str: long_str.replace("\n", " ").split("."))
df = remove_short_sentences(df)
df

Unnamed: 0,file,text,sentences
0,WHR+22.pdf,"John F. Helliwell, Richard Layard, Jeffrey D. Sachs, \nJan-Emmanuel De Neve, Lara B. Aknin, and Shun Wang2022\nThe World Happiness Report was written by a group of independent experts acting in \ntheir personal capacities. Any views expressed in this report do not necessarily reflect the views of any organization, agency or programme of the United Nations.Table of Contents\nWorld Happiness Report \n2022\n Foreword . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ....","[ Aknin, and Shun Wang2022 The World Happiness Report was written by a group of independent experts acting in their personal capacities, Any views expressed in this report do not necessarily reflect the views of any organization, agency or programme of the United Nations, 5 Helliwell, Layard, Sachs, De Neve, Aknin, & Wang 2 Happiness, Benevolence, and Trust During COVID-19 and Beyond , 13 Helliwell, Wang, Huang, & Norton 3 Trends in Conceptions of Progress and Well-being , 53 Ba..."


In [8]:
"""
We have a corpus with various sentences. Then, for a given search query,
we want to find the most similar sentence in the document.
This script outputs the similarity score for all sentences in the document.
"""

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2') 
from sklearn.metrics.pairwise import cosine_similarity
cosine_threshold = 0.3 # set threshold for cosine similarity value

queries = ['country ranking of happiness'] #search query

print("\nSemantic Search Results")

results = []
for i, document in enumerate(df['sentences']):
  sentence_embeddings = model.encode(document)
  query_embedding    = model.encode(queries)
  for j, sentence_embedding in enumerate(sentence_embeddings):
    distance = cosine_similarity(sentence_embedding.reshape((1,-1)), query_embedding.reshape((1,-1)))[0][0]
    
    sentence = df['sentences'].iloc[i][j]
    results += [(i, sentence, distance)]
results = sorted(results, key=lambda x: x[2], reverse=True)


Semantic Search Results


In [9]:
print(f"Query: {queries}")
print(f"Order by most relevant sentences in corpus:\n")

for idx, sentence, distance in results:
        if (distance > cosine_threshold):
                print(f"{sentence.strip()}, \n{df['file'].iloc[idx]}\nCosine Score: {distance:.4f})")
                print('-----------------------')
                df = df.append({'file': 'WHR+22.pdf', 'query': 'country ranking of happiness', 'sentence':sentence,
                 'cosine_score': (distance)}, ignore_index=True)

Query: ['country ranking of happiness']
Order by most relevant sentences in corpus:

In World Happiness Report 2019, we  presented comparable rankings for all three  subjective well-being measures that we track:   the Cantril ladder (and its standard deviation,  which provides a measure of happiness inequality19),  positive affect and negative affect, along with  country rankings for the six variables we use in World Happiness Report 2022 26Table 2, 
WHR+22.pdf
Cosine Score: 0.7820)
-----------------------
Our first  section presents our usual ranking and modelling  of national happiness based on data covering  2019 through 2021, 
WHR+22.pdf
Cosine Score: 0.7580)
-----------------------
What do the data show for the 2019-2021   country rankings? Two features carry over from previous editions of  the World Happiness Report, 
WHR+22.pdf
Cosine Score: 0.7570)
-----------------------
We first split each country’s respondents (see Table  10 of Statistical Appendix 1 of World Happiness  Repo

In [10]:
df[~df['sentence'].isnull()][['file', 'query', 'sentence', 'cosine_score']].sort_values(by = 'cosine_score', ascending = False)

Unnamed: 0,file,query,sentence,cosine_score
1,WHR+22.pdf,country ranking of happiness,"In World Happiness Report 2019, we presented comparable rankings for all three subjective well-being measures that we track: the Cantril ladder (and its standard deviation, which provides a measure of happiness inequality19), positive affect and negative affect, along with country rankings for the six variables we use in World Happiness Report 2022 26Table 2",0.782014
2,WHR+22.pdf,country ranking of happiness,Our first section presents our usual ranking and modelling of national happiness based on data covering 2019 through 2021,0.758043
3,WHR+22.pdf,country ranking of happiness,What do the data show for the 2019-2021 country rankings? Two features carry over from previous editions of the World Happiness Report,0.757050
4,WHR+22.pdf,country ranking of happiness,We first split each country’s respondents (see Table 10 of Statistical Appendix 1 of World Happiness Report 2018 for more detail) randomly into two groups,0.753159
5,WHR+22.pdf,country ranking of happiness,This has been to a significant extent enabled by the data available in the Gallup World Poll since 2005-2006 and analysed in the World Happiness Report since 2012,0.742547
...,...,...,...,...
318,WHR+22.pdf,country ranking of happiness,In 2017-2019 the percentage of the population involved in the selected prosocial acts was 40% in the western industrial countries62 and 30% in the rest of the world,0.301065
319,WHR+22.pdf,country ranking of happiness,"Negative affect is given by the average of individual yes or no answers about three emotions experienced or the previous day: worry, sadness, and anger",0.300968
320,WHR+22.pdf,country ranking of happiness,It is too early to tell whether the increased benevolence in 2021 will carry forward as a welcome addition to global well-being,0.300359
321,WHR+22.pdf,country ranking of happiness,"The largest trend increases were in Central and Eastern Europe, East Asia and the CIS",0.300284


In [19]:
# extract all the sentences from results that have a cosine similarity score larger than the threshold 
# and put in a list
texts = []
for idx, sentence, distance in results:
    if distance > cosine_threshold:
        text = sentence
        texts.append(text)
#turn the list to string
final_text = "".join(texts)

In [20]:
from transformers import BertForQuestionAnswering, AutoTokenizer
modelname = 'deepset/bert-base-cased-squad2'
model_qa = BertForQuestionAnswering.from_pretrained(modelname)
#initalized the token, and convert the list of strings (tokens) into a list of integers (token IDs).  This is done using an internal dictionary that contains every token understood by Bert.
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [21]:
nlp = pipeline('question-answering', model=model_qa, tokenizer=tokenizer)

context = final_text

nlp({
    'question': 'which country occupies the top spot in life evaluation?',
    'context': context
})

{'score': 0.8920966386795044,
 'start': 40071,
 'end': 40078,
 'answer': 'Finland'}