# Keyword Analysis with KeyBERT and Taipy

## 01 - Extraction of arXiv Abstracts with API
- https://github.com/lukasschwab/arxiv.py

In [None]:
import arxiv
import itertools
import pandas as pd
import sqlite3
from keybert import KeyBERT

In [None]:
# Describe the arXiv query: the 20 most recently submitted results
# matching 'artificial intelligence', newest first.
search = arxiv.Search(
    query='artificial intelligence',
    max_results=20,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)

In [None]:
# Preview every result: id, publication timestamp, title and abstract,
# each printed on its own line.
for paper in search.results():
    print(paper.entry_id, paper.published, paper.title, paper.summary, sep='\n')

___
## 02 - SQLite Database Setup
- https://www.digitalocean.com/community/tutorials/how-to-use-the-sqlite3-module-in-python-3

In [None]:
# connection = sqlite3.connect("../data/abstracts.db")
# cursor = connection.cursor()

In [None]:
# # Create new table in database
# cursor.execute("CREATE TABLE IF NOT EXISTS abstracts_ai (id TEXT PRIMARY KEY, \
#                                                          title TEXT, \
#                                                          date_published TEXT, \
#                                                          abstract TEXT)"
#               )

In [None]:
# # Insert dummy row
# cursor.execute("INSERT INTO abstracts_ai VALUES ('a1', \
#                                                  'test_title', \
#                                                  '2023-02-16 18:16:09+00:00', \
#                                                  'test abstract text')"
#               )

In [None]:
# # Fetch all rows
# query = "SELECT * FROM abstracts_ai"
# df = pd.read_sql_query("SELECT * FROM abstracts_ai", connection)
# df

___
## 03 - Retrieve and Store arXiv AI Article Abstracts

In [None]:
# for result in search.results():
#     entry_id = result.entry_id
#     uid = entry_id.split('.')[-1]
#     title = result.title
#     date_published = result.published
#     abstract = result.summary
    
#     query = 'INSERT OR REPLACE INTO abstracts_ai(id, title, date_published, abstract)' + \
#             ' VALUES(?, ?, ?, ?);'
    
#     fields = (uid, title, date_published, abstract)

#     cursor.execute(query, fields)

In [None]:
# # Fetch all rows
# query = "SELECT * FROM abstracts_ai"
# df = pd.read_sql_query("SELECT * FROM abstracts_ai", connection)
# df

## Alternative - Without SQLite

In [None]:
# Start from an empty DataFrame; the retrieval loop in the next cell adds
# one row per arXiv result to it.
df_raw = pd.DataFrame()

In [None]:
# Build one record per search result, then add them to df_raw in a single
# concat. DataFrame.append (used here previously) was deprecated in pandas 1.4
# and removed in pandas 2.0, and calling it once per row re-copied the whole
# frame on every iteration.
records = [
    {
        # NOTE(review): this keeps only the text after the last '.' in
        # entry_id. Presumably entry_id is a URL such as
        # 'http://arxiv.org/abs/2302.08500v1', in which case the uid is just
        # '08500v1' - confirm this is unique enough for downstream use.
        'uid': result.entry_id.split('.')[-1],
        'title': result.title,
        'date_published': result.published,
        'abstract': result.summary,
    }
    for result in search.results()
]

# Concatenate onto the existing df_raw (instead of overwriting it) so that
# re-running this cell still accumulates rows, as the append-based loop did.
df_raw = pd.concat([df_raw, pd.DataFrame(records)], ignore_index=True)

In [None]:
# Display the collected rows as this notebook cell's output
df_raw

___
## 04 - DataFrame Pre-Processing

In [None]:
# Pre-process an independent copy so df_raw itself is left unmodified,
# then show the column dtypes before any conversion.
df = df_raw.copy(deep=True)
print(df.dtypes)

In [None]:
# Convert the publication timestamps to pandas datetimes
df['date_published'] = df['date_published'].pipe(pd.to_datetime)

In [None]:
# Add two placeholder columns (empty strings), in this order:
#   keywords_and_scores - will hold the keyword extraction output
#   keywords            - will hold the top keywords
for placeholder_column in ('keywords_and_scores', 'keywords'):
    df[placeholder_column] = ''

___
## 05 - Keyword Extraction with KeyBERT
- https://github.com/MaartenGr/KeyBERT
- https://maartengr.github.io/KeyBERT/guides/embeddings.html

In [None]:
# Embedding model: 'all-MiniLM-L6-v2', chosen for its speed and good quality
# (see https://www.sbert.net/docs/pretrained_models.html#model-overview)
embedding_model_name = 'all-MiniLM-L6-v2'
kw_model = KeyBERT(model=embedding_model_name)

In [None]:
# Settings for the keyword extraction step, grouped by purpose
stop_words = 'english'

# Lower/upper bounds of the keyphrase n-gram range
ngram_lower_bound, ngram_upper_bound = 1, 2

# Diversification options (MMR on, Max Sum off)
use_mmr, diversity = True, 0.1
use_maxsum, nr_candidates = False, 20

top_n = 8

In [None]:
# Run KeyBERT on every abstract and store, on the same row, both the raw
# keyword-score pairs and the bare keywords.
for i, row in df.iterrows():
    kw_output = kw_model.extract_keywords(
        row['abstract'],
        keyphrase_ngram_range=(ngram_lower_bound, ngram_upper_bound),
        stop_words=stop_words,
        use_mmr=use_mmr,
        use_maxsum=use_maxsum,
        # Forward nr_candidates so the value configured in the parameter cell
        # actually reaches KeyBERT; it was previously defined but never passed.
        # (Per the KeyBERT docs it only matters when use_maxsum is enabled.)
        nr_candidates=nr_candidates,
        diversity=diversity,
        top_n=top_n,
    )
    df.at[i, 'keywords_and_scores'] = kw_output

    # Keep only the keyword (first element) of every keyword-score pair
    df.at[i, 'keywords'] = [pair[0] for pair in kw_output]

### Get value counts of keywords

In [None]:
# Flatten the per-abstract keyword lists into one sequence, count how often
# each keyword occurs, and show the ten most frequent ones.
all_keywords = list(itertools.chain.from_iterable(df.keywords))
keyword_tally = pd.Series(all_keywords).value_counts()
keywords_count = pd.DataFrame(keyword_tally).reset_index()
keywords_count.columns = ['keyword', 'count']
keywords_count.head(10)