https://github.com/MaartenGr/BERTopic

https://www.holisticseo.digital/python-seo/topic-modeling/

In [1]:
%%capture
!pip install bertopic

In [None]:
!python -m spacy download fr_core_news_sm

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
from bertopic import BERTopic

In [10]:
import spacy
nlp = spacy.load("fr_core_news_sm")

In [3]:
df = pd.read_csv("impresso2.csv", sep=';')

In [4]:
# Remove null values
df = df[df['content'].notna()]

# Convert dates to datetime objects and sort by date
df.date = pd.to_datetime(df.date)
df = df.sort_values(by = "date")
df = df.reset_index(drop=True)

In [5]:
# Data was manually extracted with keywords so there is a lot of noise, these are phrases
# that appear in some of the bigger documents that are not relevant to our task -- might still need to add more
# Remove these articles from the dataframe

to_remove = ['CINÉMA Apollo 1 Faubourg', 'CINÉMA Eden Rue de la', 'NOUS RECHERCHONS', 'Ciné NEUCHÂTEL', \
             'Ciné LA CHAUX-DE-FONDS', 'AGENDA', 'APOLLO', 'ANIMATIONS', 'RATIONALISATION DES TRANSFERTS', \
             'REALITE', 'INTÉGRATION DU NÉGOCE DES OPTIONS', 'SOFFEX', 'www . hbo', 'Mythologies Primitive', \
             'GENEVE Marché', 'Morat-Central', 'GENEVE Reprise', 'monnaie plastique', 'La BCV', 'aint-Valentin', \
             'GENÈVE Marché', 'ÉCOLE DE NATATION', 'SOCIETES RÉSULTATS', 'SOCIETES SUISSE TRÉFILERIES', 'SOCIÉTÉS', \
             'Stock Exchange', 'CONGÉLATEUR BAHUT', 'USS Une', 'MARCHÉS BOURSIERS', 'TTVj', 'Bourse électronique', \
             'IAIIBERTé VIE']
df = df[~df.content.isin(to_remove)]
df = df[~(df.content.str.contains("bourse") & df.content.str.contains("banque"))]
df = df[df.content.str.count('026') < 3]
df = df[df.content.str.count('032') < 3]
df = df[df.content.str.count('00') < 6]

df = df.reset_index(drop=True)

In [6]:
# Remove articles that contain these regular expressions -- they are not relevant
test = df.content.apply(lambda x: re.search('\d\d\s\.\s\d\d', x))
test2 = df.content.apply(lambda x: re.search('\~\~', x))

tests = [test, test2]

for t in tests:
    none = []
    for i in range(len(t)):
        none.append(t[i] is None)
    none = pd.Series(none)

    none_index = np.where(~none)

    df = df.drop(none_index[0])
    df = df.reset_index(drop=True)

In [7]:
# Articles with multiple phone numbers are advertisements, remove them
test = df.content.apply(lambda x: len(re.findall('\d\d\d\s\d\d\s\d\d', x)))
df = df.drop(test[test > 2].index)
df = df.reset_index(drop=True)

In [8]:
# Remove really short articles
df = df[df.content.str.len() > 300]
df = df.reset_index(drop=True)

In [9]:
df.shape

(4762, 21)

In [12]:
from preprocessing import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
data = spacy_prep(df.content)

In [None]:
#df_no_keywords = df.content.apply(lambda x: remove_keywords(x))
#data_nk = spacy_prep(df_no_keywords)

In [14]:
# Transform list of words back into strings
data_ready = []
for doc in data:
  data_ready.append(' '.join(doc))

In [15]:
# Fit a topic model
#can add different sentence transformers
topic_model = BERTopic(language="french")
topics, probs = topic_model.fit_transform(data_ready)

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [16]:
# Look at topics (-1 are outliers)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2032,-1_jeu_video_nintendo_console
1,0,228,0_course_voiture_rally_circuit
2,1,106,1_banque_bourse_suisse_societe
3,2,105,2_enfant_jean_noel_petit
4,3,93,3_blanc_pierre_noir_jeu
...,...,...,...
78,77,10,77_ballon_deruns_ballon_rouge_rouge
79,78,10,78_toy_eye_jouet_eye_toy
80,79,10,79_abe_rome_agrippa_shadow
81,80,10,80_joystick_lcr_killzone_minish


In [17]:
# Look at first topic
topic_model.get_topic(0)

[('course', 0.02213726674883922),
 ('voiture', 0.015677314277290362),
 ('rally', 0.015305212237113733),
 ('circuit', 0.015264973055434599),
 ('simulation', 0.015257614918504568),
 ('gran', 0.012882467861239321),
 ('turismo', 0.012349099836610108),
 ('bolide', 0.012268554158325671),
 ('jeu', 0.012030521742773036),
 ('mode', 0.011097255937952378)]

In [18]:
# Visualize topics
topic_model.visualize_topics()

In [19]:
topic_model.visualize_barchart()

In [20]:
topic_model.visualize_heatmap()

In [21]:
topic_model.visualize_term_rank()

In [22]:
newspapers = df.newspaper

In [23]:
years = df.year

In [24]:
topics_over_time = topic_model.topics_over_time(data_ready, topics, years, nr_bins=20)

In [25]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=6)

In [26]:
# Look at topics by newspaper
topics_per_class = topic_model.topics_per_class(data_ready, topics, newspapers)

In [27]:
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
#topic_model.get_representative_docs()

### Reduce the number of topics 

In [28]:
# Reduce the number of topics
new_topics, new_probs = topic_model.reduce_topics(data_ready, topics, probs, nr_topics=30)

In [30]:
topics_over_time = topic_model.topics_over_time(data_ready, new_topics, years, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=8)

In [31]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2691,-1_jeu_video_nintendo_console
1,0,243,0_course_jeu_voiture_simulation
2,1,139,1_suisse_bourse_banque_million
3,2,136,2_support_teste_ps_wii
4,3,105,3_enfant_jean_noel_petit
5,4,98,4_neuchatel_super_postal_start
6,5,97,5_atari_ordinateur_logiciel_prix
7,6,93,6_pierre_blanc_noir_jeu
8,7,92,7_fossett_sport_match_football
9,8,76,8_xbox_microsoft_console_live


In [32]:
topic_model.visualize_topics()

In [33]:
topic_model.visualize_heatmap()

In [35]:
topics_per_class = topic_model.topics_per_class(data_ready, new_topics, newspapers)
topic_model.visualize_topics_per_class(topics_per_class)

In [47]:
# Can change labels here before plotting
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
0,-1,"noir, blanc, pierre, segalen, gauguin",13,1979.963,-1_jeu_video_nintendo_console
1,1,"banque, transfert, giro, terminal, document",2,1979.963,1_suisse_bourse_banque_million
2,3,"blanc, probleme, lear, gelure, ieisk",1,1979.963,3_enfant_jean_noel_petit
3,6,"pierre, blanc, noir, pierre_noir, cassette",5,1979.963,6_pierre_blanc_noir_jeu
4,17,"noir, blanc, scelle, necessaire, probleme",1,1979.963,17_burnout_jeu_pokemon_rainbow
...,...,...,...,...,...
328,18,"lambelet, yannick, yannick_lambelet, argentine...",1,2015.150,18_zero_project_project_zero_jeu
329,20,"datsyuk, pavel_datsyuk, pavel, grossmann, costet",2,2015.150,20_hockey_nhl_equipe_ligue
330,21,"one, the, xbox, division, clancy",3,2015.150,21_cell_sam_splinter_fisher
331,23,"angry, birds, verts, volatile, cochon",2,2015.150,23_professeur_aventure_layton_monster
