In [228]:
# Required packages
!pip install vega_datasets
!pip install keybert
!pip install sentence-transformers
!pip install spacy

In [229]:
import http.client
import json
import re

import pandas as pd

from collections import defaultdict
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

import numpy as np
import spacy

In [205]:
# inputs
# Everything defined in this cell is read as a notebook-level variable by the
# cells and helper functions below.

# Identifier of the volume whose pages are requested from the API
volume_id = 'osu.32436000578904'
# Connect to the HTRC API
# (only the connection object is created here; the request itself is issued in retrieve_data)
conn = http.client.HTTPSConnection("tools.htrc.illinois.edu")

# Set the headers for the API request
headers = { 'Content-Type': "application/json" }

# Keyword-extraction model, used by get_keywords
key_word_model = KeyBERT('distilbert-base-nli-mean-tokens')
# Sentence-embedding model, used to encode both the query and the page texts
sentences_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [206]:
# Main functions

# Tags that disqualify a token: generate_plain_text compares every tag key of a
# token against this list and skips the tags found here.
# First the punctuation / symbol entries ...
_excluded_symbols = [
    '<', '>', '(', ')', '[', ']', '{', '}',
    '!', '@', '#', '$', '%', '^', '&', '*',
    '_', '+', '=', '|', '\\',
    '/', '?', ',', '.', ';', ':', '"', "'", ' ',
]
# ... then the part-of-speech labels (presumably Penn Treebank tags — TODO confirm)
_excluded_pos_tags = ['IN', 'CC', 'CD', 'SYM']

filter_out = _excluded_symbols + _excluded_pos_tags

def retrieve_data(conn, headers, volume_id):
    '''Request the pages of a volume from the API and return the parsed JSON.

    Args:
        conn: Connection object exposing ``request`` and ``getresponse``
            (an ``http.client.HTTPSConnection`` in this notebook).
        headers: Dict of HTTP headers sent with the request.
        volume_id: Identifier interpolated into ``/ef-api/volumes/{volume_id}/pages``.

    Returns:
        The decoded JSON payload, i.e. whatever ``json.loads`` yields for the body.

    Raises:
        http.client.HTTPException: If the server answers with a non-2xx status.
        json.JSONDecodeError: If the response body is not valid JSON.
    '''
    # Ask the API for the page-level data of the volume
    conn.request("GET", f"/ef-api/volumes/{volume_id}/pages", headers=headers)

    res = conn.getresponse()
    # Read the body before checking the status so the response is fully
    # consumed and the connection can be reused for another request.
    body = res.read()

    # Fail with an explicit message instead of a confusing JSON or KeyError
    # further down when the request did not succeed.
    if not 200 <= res.status < 300:
        raise http.client.HTTPException(
            f"Request for volume {volume_id!r} failed with HTTP status {res.status}"
        )

    # Parse the UTF-8 encoded JSON document
    return json.loads(body.decode("utf-8"))

def generate_plain_text(token_pos_count):
    '''Collect the tokens whose tags are not listed in ``filter_out``.

    ``token_pos_count`` maps every token to a mapping keyed by tag. A token is
    emitted once for each of its tags that is absent from ``filter_out``, so a
    token carrying several accepted tags appears several times in the result.

    Returns:
        List of tokens, in the iteration order of ``token_pos_count``.
    '''
    return [
        token
        for token in token_pos_count
        for tag in token_pos_count[token]
        if tag not in filter_out
    ]

# Retrieving Data
def get_text_per_pages(record):
    '''Build one space-joined string of tokens for every usable page.

    A page is kept only when it has a non-empty ``body``, its ``tokenCount`` is
    above 15 and more than 15 tokens remain after ``generate_plain_text``. The
    tokens are joined in the order ``generate_plain_text`` returns them.

    Args:
        record: Parsed API response; pages are read from ``record['data']['pages']``.

    Returns:
        Dict mapping the page ``seq`` value to the joined text of that page.
    '''
    text_by_seq = {}

    for page in record['data']['pages']:
        body = page['body']
        # Skip pages without a body or with too few tokens to be worth analysing
        if not body or body['tokenCount'] <= 15:
            continue

        kept_tokens = generate_plain_text(body['tokenPosCount'])
        # Skip pages that become too short once the filtered tags are removed
        if len(kept_tokens) <= 15:
            continue

        # The page is used for the similarity analysis
        text_by_seq[page['seq']] = ' '.join(kept_tokens)

    return text_by_seq

def tokens_per_pages_count(record):
    '''List every token of every page together with its number of occurrences.

    Args:
        record: Parsed API response; pages are read from ``record['data']['pages']``.
            Pages whose ``body`` is empty are ignored.

    Returns:
        List of dicts with the keys ``token``, ``page_no`` (the page ``seq``
        without its leading zeros, as a string) and ``count`` (occurrences of
        the token on that page, summed over all of its tags).
    '''
    word_key_count = []

    for page in record['data']['pages']:
        body = page['body']
        if not body:
            continue

        # Only LEADING zeros are padding: strip("0") would also eat trailing
        # zeros and turn page "00000010" into "1", colliding with page 1.
        page_no = page['seq'].lstrip("0") or "0"

        for token, pos_count in body['tokenPosCount'].items():
            # A token can carry several tags on the same page; its total count
            # is the sum over all of them, not just the first one.
            word_key_count.append({'token': token, 'page_no': page_no, 'count': sum(pos_count.values())})

    return word_key_count

def get_keywords(text, top_n=10):
    '''Extract the highest-ranked keywords of ``text`` with the notebook's KeyBERT model.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords requested from the model
            (defaults to 10, the value previously hard-coded here).

    Returns:
        Whatever ``key_word_model.extract_keywords`` returns for the text.
    '''
    return key_word_model.extract_keywords(text, top_n=top_n)


def get_key_words_by_page(data: dict) -> dict:
    '''Run the keyword extraction on the text of every page.

    Args:
        data: Dict mapping a page key to the text of that page.

    Returns:
        Dict with the same keys, each mapped to the result of ``get_keywords``
        for the corresponding text.
    '''
    return {page: get_keywords(text) for page, text in data.items()}

def extract_relevant_pages(data, query_embedding, page_embeddings=None, keywords_by_page=None):
    '''Score every page against the query and bundle the per-page attributes.

    Args:
        data: Dict mapping the page ``seq`` key to the page text. Its iteration
            order must match the row order of ``page_embeddings``.
        query_embedding: Embedding of the query; only its first row is used.
        page_embeddings: One embedding per page, indexable by position. When
            omitted, the notebook-level ``list_embeddings`` is used.
        keywords_by_page: Dict mapping the page ``seq`` key to its key terms.
            When omitted, the notebook-level ``pages_keywords`` is used.

    Returns:
        List of dicts with the keys ``page_no`` (``seq`` without leading zeros,
        as a string), ``text``, ``key_terms`` and ``score``, sorted by page
        number in descending numeric order.

    Note:
        ``score`` is the plain dot product of the two embeddings. It equals the
        cosine similarity only if the embeddings are unit-length — TODO confirm
        that the encoder normalises its output.
    '''
    # Keep the original two-argument call working by falling back to the
    # variables the notebook defines at top level.
    if page_embeddings is None:
        page_embeddings = list_embeddings
    if keywords_by_page is None:
        keywords_by_page = pages_keywords

    query_vector = query_embedding[0]

    document_attributes = []
    for idx, (seq, text) in enumerate(data.items()):
        document_attributes.append({
            # Only leading zeros are padding; strip("0") would turn "00000010" into "1"
            'page_no': seq.lstrip("0") or "0",
            'text': text,
            'key_terms': keywords_by_page[seq],
            'score': np.dot(page_embeddings[idx], query_vector),
        })

    # Compare page numbers as integers: as strings, "9" would sort after "10"
    # (assumes ``seq`` values are made of digits only)
    return sorted(document_attributes, key=lambda row: int(row['page_no']), reverse=True)

# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# only on the first call instead of on every call.
_SPACY_PIPELINES = {}


def text_lemmatizer(text, model_name="en_core_web_sm"):
    '''Return the lemma of every token of ``text``.

    Args:
        text: Text to lemmatize.
        model_name: Name of the spaCy pipeline to use (defaults to the
            ``en_core_web_sm`` model previously hard-coded here). The model
            must already be installed; it is not one of the packages installed
            at the top of this notebook.

    Returns:
        List with one lemma string per token, e.g. a result such as
        ``['I', 'be', 'read', 'the', 'paper', '.']``.
    '''
    nlp = _SPACY_PIPELINES.get(model_name)
    if nlp is None:
        # English pipelines include a rule-based lemmatizer
        nlp = _SPACY_PIPELINES[model_name] = spacy.load(model_name)

    doc = nlp(text)

    return [token.lemma_ for token in doc]


In [207]:
# Raw data from the API
data = retrieve_data(conn, headers, volume_id)

# Dictionary that reconstructs the text at page level
page_dataset = get_text_per_pages(data)

# List of {'token', 'page_no', 'count'} records, one per token and page
count_words = tokens_per_pages_count(data)

# I want to find mentions related to this query
# (spelling fixed from "childrem": the misspelled word changed the embedding of
# the query; the score threshold applied later may need re-tuning)
query_term = "how the children learn?"

# Create the embedding for the query
query_embedding = sentences_model.encode([query_term])

# Create the embeddings for each page. This is done once only: the page texts
# do not change, and encoding every page a second time just repeats the work.
list_texts = list(page_dataset.values())
list_embeddings = sentences_model.encode(list_texts)

# Extract the top 10 key words of each page
pages_keywords = get_key_words_by_page(page_dataset)

# Preview the key words of a few pages instead of printing the whole volume
dict(list(pages_keywords.items())[:5])

In [208]:
# Create a dataset with the key words and the relevance score of every page
relevant_pages_dataset = extract_relevant_pages(page_dataset, query_embedding)

# This cell and the next one read the result as `relevant_pages`; bind that
# name here so the notebook runs on a fresh kernel instead of raising NameError.
relevant_pages = relevant_pages_dataset

relevant_pages

In [209]:
# Minimum similarity score a page needs to be considered relevant to the query
SCORE_THRESHOLD = 0.83

# One row per page: page_no, text, key_terms and score
df_relevant_pages = pd.DataFrame.from_records(relevant_pages)

# Filter out pages with lower score
df_relevant_pages = df_relevant_pages[df_relevant_pages['score'] > SCORE_THRESHOLD]

# Create a list with the relevant pages. 'page_no' was already normalised when
# the records were built; stripping "0" from both ends again would corrupt any
# page number ending in zero ("10" -> "1").
list_relevant_pages = df_relevant_pages['page_no'].to_list()

In [218]:
# Token counts for every page of the volume
df = pd.DataFrame.from_records(count_words)

# Keep only the tokens found on the pages that are relevant to the query
relevant_count_words = df[df['page_no'].isin(list_relevant_pages)]

# Number of token rows on the relevant pages. (A bare expression on a
# non-final line of a cell is never displayed, so only this value is shown.)
len(relevant_count_words)

In [223]:
# Inspect the key terms extracted for each relevant page
list(df_relevant_pages['key_terms'])

In [226]:
# Top-ranked key term of every relevant page. Each 'key_terms' entry is indexed
# as a list of tuples whose first element is the term (presumably the
# (term, score) pairs produced by the keyword model). Pages for which no key
# term was extracted are skipped instead of raising IndexError.
list_key_term = {
    key_terms[0][0]
    for key_terms in df_relevant_pages['key_terms'].to_list()
    if key_terms
}

# Rows of the per-page token counts whose token is one of those key terms.
# NOTE(review): the match is exact and case-sensitive; if the keyword model
# lower-cases its terms, capitalised tokens will not match — confirm.
relevant_count_words = df[df['token'].isin(list_key_term)]
relevant_count_words

In [181]:
# Per-page token counts as a DataFrame, built from the `count_words` records
df = pd.DataFrame.from_records(data=count_words)
df

In [227]:
# This visualization needs a table with the columns token, page_no and count
# (how many times the token appears on the page)

import altair as alt
# NOTE(review): nothing below uses this import, and it rebinds the name `data`
# that an earlier cell assigns the raw API response to. It is kept in case
# other cells rely on it; consider removing it.
from vega_datasets import data

# Upper bound of the x axis: the largest page number of the volume. The number
# of distinct pages is not a valid bound, because pages without a body are
# skipped and the remaining page numbers can therefore be larger than it.
page_numbers = pd.to_numeric(df['page_no'], errors='coerce').dropna()
max_page_no = int(page_numbers.max()) if not page_numbers.empty else 0

alt.Chart(relevant_count_words).mark_circle(
    opacity=0.8,
    stroke='black',
    strokeWidth=1,
    strokeOpacity=0.4
).encode(
    alt.X('page_no:Q') # quantitative data
        .title(None)
        .scale(domain=[0, max_page_no]),
    alt.Y('token:N') # nominal data
        .title(None),
    alt.Size('count:Q')
        .scale(range=[0, 50])
        .title('count'),
    alt.Color('token:N').legend(None),
    tooltip=[
        alt.Tooltip("page_no:Q"),
        alt.Tooltip("count:Q")
    ],
).properties(
    width=450,
    height=320,
    title=alt.Title(
        text="Key term mentions per page",
        subtitle="The size of the bubble represents how many times the key term appears on the page",
        anchor='start'
    )
).configure_axisY(
    domain=False,
    ticks=False,
    offset=10
).configure_axisX(
    grid=False,
).configure_view(
    stroke=None
)