# Keyword Analysis with KeyBERT and Taipy

## 01 - Extraction of arXiv Abstracts with API
- https://github.com/lukasschwab/arxiv.py

In [1]:
import arxiv
import sqlite3
import pandas as pd
from keybert import KeyBERT

In [2]:
search = arxiv.Search(
            query = 'artificial intelligence',
            max_results = 2,
            sort_by = arxiv.SortCriterion.SubmittedDate,
            sort_order = arxiv.SortOrder.Descending)

In [3]:
type(search)

arxiv.arxiv.Search

In [4]:
for result in search.results():
    print(result.entry_id)
    print(result.published)
    print(result.title)
    print(result.summary)

http://arxiv.org/abs/2303.05512v1
2023-03-09 18:59:50+00:00
PAC-NeRF: Physics Augmented Continuum Neural Radiance Fields for Geometry-Agnostic System Identification
Existing approaches to system identification (estimating the physical
parameters of an object) from videos assume known object geometries. This
precludes their applicability in a vast majority of scenes where object
geometries are complex or unknown. In this work, we aim to identify parameters
characterizing a physical system from a set of multi-view videos without any
assumption on object geometry or topology. To this end, we propose "Physics
Augmented Continuum Neural Radiance Fields" (PAC-NeRF), to estimate both the
unknown geometry and physical parameters of highly dynamic objects from
multi-view videos. We design PAC-NeRF to only ever produce physically plausible
states by enforcing the neural radiance field to follow the conservation laws
of continuum mechanics. For this, we design a hybrid Eulerian-Lagrangian
represe

___
## 02 - SQLite Database Setup
- https://www.digitalocean.com/community/tutorials/how-to-use-the-sqlite3-module-in-python-3

In [4]:
connection = sqlite3.connect("../data/abstracts.db")
cursor = connection.cursor()

In [5]:
# Create new table in database
cursor.execute("CREATE TABLE IF NOT EXISTS abstracts_ai (id TEXT PRIMARY KEY, \
                                                         title TEXT, \
                                                         date_published TEXT, \
                                                         abstract TEXT)"
              )

<sqlite3.Cursor at 0x2183b230b90>

In [6]:
# Insert dummy row
cursor.execute("INSERT INTO abstracts_ai VALUES ('a1', \
                                                 'test_title', \
                                                 '2023-02-16 18:16:09+00:00', \
                                                 'test abstract text')"
              )

<sqlite3.Cursor at 0x2183b230b90>

In [7]:
# Fetch all rows
query = "SELECT * FROM abstracts_ai"
df = pd.read_sql_query("SELECT * FROM abstracts_ai", connection)
df

Unnamed: 0,id,title,date_published,abstract
0,a1,test_title,2023-02-16 18:16:09+00:00,test abstract text


In [8]:
# Delete dummy row
cursor.execute(
    "DELETE FROM abstracts_ai")

<sqlite3.Cursor at 0x2183b230b90>

In [9]:
# Check all rows deleted
query = "SELECT * FROM abstracts_ai"
df = pd.read_sql_query("SELECT * FROM abstracts_ai", connection)
df

Unnamed: 0,id,title,date_published,abstract


___
## 03 - Retrieve and Store arXiv AI Article Abstracts

In [10]:
search = arxiv.Search(
            query = 'artificial intelligence',
            max_results = 2,
            sort_by = arxiv.SortCriterion.SubmittedDate,
            sort_order = arxiv.SortOrder.Descending)

In [13]:
for result in search.results():
    entry_id = result.entry_id
    uid = entry_id.split('.')[-1]
    title = result.title
    date_published = result.published
    abstract = result.summary
    
    query = 'INSERT OR REPLACE INTO abstracts_ai(id, title, date_published, abstract)' + \
            ' VALUES(?, ?, ?, ?);'
    
    fields = (uid, title, date_published, abstract)

    cursor.execute(query, fields)

In [14]:
# Fetch all rows
query = "SELECT * FROM abstracts_ai"
df = pd.read_sql_query("SELECT * FROM abstracts_ai", connection)
df

Unnamed: 0,id,title,date_published,abstract
0,05512v1,PAC-NeRF: Physics Augmented Continuum Neural R...,2023-03-09 18:59:50+00:00,Existing approaches to system identification (...
1,05510v1,Planning with Large Language Models for Code G...,2023-03-09 18:59:47+00:00,Existing large language model-based code gener...


## Alternative - Without SQLite

In [9]:
df_raw = pd.DataFrame()

In [10]:
for result in search.results():
    entry_id = result.entry_id
    uid = entry_id.split('.')[-1]
    title = result.title
    date_published = result.published
    abstract = result.summary
    
    result_dict = {'uid': uid,
                   'title': title,
                   'date_published': date_published,
                   'abstract': abstract
                  }
    
    df_raw = df_raw.append(result_dict, ignore_index=True)    

In [11]:
df_raw

Unnamed: 0,uid,title,date_published,abstract
0,05512v1,PAC-NeRF: Physics Augmented Continuum Neural R...,2023-03-09 18:59:50+00:00,Existing approaches to system identification (...
1,05510v1,Planning with Large Language Models for Code G...,2023-03-09 18:59:47+00:00,Existing large language model-based code gener...


___
## 04 - DataFrame Pre-Processing

In [15]:
print(df.dtypes)

id                object
title             object
date_published    object
abstract          object
dtype: object


In [16]:
df['date_published'] = pd.to_datetime(df['date_published'])

In [17]:
print(df.dtypes)

id                             object
title                          object
date_published    datetime64[ns, UTC]
abstract                       object
dtype: object


In [18]:
# Create empty column to store keyword extraction output
df['keywords_and_scores'] = ''

# Create empty column to store top keywords
df['keywords'] = ''

In [19]:
df

Unnamed: 0,id,title,date_published,abstract,keywords_and_scores,keywords
0,05512v1,PAC-NeRF: Physics Augmented Continuum Neural R...,2023-03-09 18:59:50+00:00,Existing approaches to system identification (...,,
1,05510v1,Planning with Large Language Models for Code G...,2023-03-09 18:59:47+00:00,Existing large language model-based code gener...,,


___
## 05 - Keyword Extraction with KeyBERT
- https://github.com/MaartenGr/KeyBERT
- https://maartengr.github.io/KeyBERT/guides/embeddings.html

In [13]:
# Using 'all-MiniLM-L6-v2' given its speed and good quality
# https://www.sbert.net/docs/pretrained_models.html#model-overview
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

In [21]:
# Define parameters
stop_words = 'english'
ngram_lower_bound = 1
ngram_upper_bound = 2
use_mmr = True
diversity = 0.2
use_maxsum=False
nr_candidates = 20
top_n = 3

In [22]:
for i, row in df.iterrows():
    abstract_text = row['abstract']
    kw_output = kw_model.extract_keywords(abstract_text, 
                                  keyphrase_ngram_range=(ngram_lower_bound, ngram_upper_bound), 
                                  stop_words=stop_words,
                                  use_mmr=use_mmr, 
                                  use_maxsum=use_maxsum,
                                  diversity=diversity,
                                  top_n=top_n)
    df.at[i, 'keywords_and_scores'] = kw_output
    
    # Obtain keyword from every keyword-score pair
    top_kw = []
    
    for pair in kw_output:
        top_kw.append(pair[0])
        
    df.at[i, 'keywords'] = top_kw

In [23]:
df

Unnamed: 0,id,title,date_published,abstract,keywords_and_scores,keywords
0,05512v1,PAC-NeRF: Physics Augmented Continuum Neural R...,2023-03-09 18:59:50+00:00,Existing approaches to system identification (...,"[(neural rendering, 0.4973), (object videos, 0...","[neural rendering, object videos, physics augm..."
1,05510v1,Planning with Large Language Models for Code G...,2023-03-09 18:59:47+00:00,Existing large language model-based code gener...,"[(code generation, 0.5558), (decoding pg, 0.43...","[code generation, decoding pg, transformer dec..."
