# Modules

In [1]:
import pandas as pd
from sqlalchemy import create_engine
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


# Conect to PostgreSQL

In [2]:
DATABASE_URL = 'postgresql://postgres:VpJkyKPDSDUtTXwOTSlyyZvjUBPsLGdo@maglev.proxy.rlwy.net:46115/railway'

engine = create_engine(DATABASE_URL)

query = 'SELECT * FROM books'
df = pd.read_sql(query, engine)

print(df.head())

   id                                  title             authors  \
0   1       Alice's Adventures in Wonderland       Lewis Carroll   
1   2  Frankenstein or The Modern Prometheus        Mary Shelley   
2   3             The Wonderful Wizard of Oz       L. Frank Baum   
3   4                       The Time Machine         H. G. Wells   
4   5                         The Lost World  Arthur Conan Doyle   

   first_publish_year          subject  edition_count               key  
0                1865  science_fiction           3546  /works/OL138052W  
1                1818  science_fiction           2184  /works/OL450063W  
2                1899  science_fiction           2052   /works/OL18417W  
3                1895  science_fiction           1151   /works/OL52267W  
4                1900  science_fiction            747  /works/OL262460W  


# Preprocessing and Topic Modeling

In [3]:
df = df.dropna(subset=['title'])

In [4]:
topic_model = BERTopic(language="english")

In [5]:
topics, _ = topic_model.fit_transform(df['title'])

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
df['topic'] = topics

In [10]:
df['topic']

0       18
1       -1
2        2
3       12
4        3
        ..
1195    35
1196     1
1197    27
1198    11
1199    18
Name: topic, Length: 1200, dtype: int64

In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,364,-1_the_of_and_confessions,"[the, of, and, confessions, at, peter, pan, in...",[A strange manuscript found in a copper cylind...
1,0,79,0_de_le_la_en,"[de, le, la, en, du, les, jours, tour, quatrev...","[Le Tour du Monde en Quatre-Vingts Jours, Le T..."
2,1,48,1_magic_city_wet_down,"[magic, city, wet, down, rainbow, colour, wate...","[The Magic City, The Magic City, The Magic City]"
3,2,37,2_wizard_oz_wonderful_giant,"[wizard, oz, wonderful, giant, wicked, scarecr...","[The Wonderful Wizard of Oz, The Wonderful Wiz..."
4,3,37,3_lost_last_world_war,"[lost, last, world, war, battle, horizon, blaz...","[The Lost World, The Lost World, The Lost World]"
5,4,34,4_black_death_dead_room,"[black, death, dead, room, love, house, beauti...","[Death on the Nile, The Dead Room, Black house]"
6,5,33,5_autobiography_memoirs_my_charlottes,"[autobiography, memoirs, my, charlottes, web, ...","[Autobiography, Autobiography, Autobiography]"
7,6,30,6_abraham_caesar_david_lincoln,"[abraham, caesar, david, lincoln, julius, jude...","[Abraham Lincoln, Jude the Obscure, Jude the O..."
8,7,29,7_bruno_sylvie_daisy_anna,"[bruno, sylvie, daisy, anna, madame, miller, k...","[Sylvie and Bruno, Sylvie and Bruno, Sylvie an..."
9,8,27,8_northern_lights_dark_shades,"[northern, lights, dark, shades, fifty, darkne...","[Northern Lights, Northern Lights, Northern Li..."


In [8]:
topic_model.visualize_topics()

# Save

In [9]:
df.to_sql('books_with_topics', engine, if_exists='replace', index=False)

200