In [242]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from datetime import datetime


In [243]:

tweets = pd.read_csv("datalake/pre_proc/tweets_sobre_china.csv")

In [244]:
tweets.shape[0]

20000

In [245]:
tweets.isna().sum()

Unnamed: 0      0
Unnamed: 0.1    0
tweet_id        0
date            0
text            0
user            0
preproc         3
dtype: int64

In [246]:
tweets = tweets[ tweets.preproc.notnull() ]

In [247]:
tweets.shape[0]

19997

In [248]:
docs = tweets.preproc.to_list()
raw_docs = tweets.text.to_list()

# Verificando se ainda tem links depois do pre processamento

In [249]:
for doc in docs:
    if 'http' in doc:
        print(doc)

# Analisando as hashtags

In [250]:
from collections import Counter
hashtags = []
for doc in raw_docs:
    for word in doc.split():
        if "#" == word[0]:
            hashtags.append(word)

hashtags = Counter(hashtags)  

In [251]:
dict(sorted(hashtags.items(), reverse= True,key=lambda item: item[1]))

{'#China': 950,
 '#Taiwan': 633,
 '#china': 173,
 '#Pelosi': 145,
 '#ChinaTaiwanCrisis': 136,
 '#USA': 119,
 '#Russia': 95,
 '#taiwan': 79,
 '#NancyPelosi': 67,
 '#Japan': 65,
 "#China's": 59,
 '#Ukraine': 57,
 '#Taiwan.': 55,
 '#US': 46,
 '#TaiwanStraitsCrisis': 45,
 '#BreakingNews': 41,
 '#Chinese': 41,
 '#XiJinping': 38,
 '#thursdayvibes': 37,
 '#news': 34,
 '#CCP': 34,
 '#India': 34,
 '#onstorm': 33,
 '#AEWDynamite': 33,
 '#WWIII': 32,
 '#PokemonScarletViolet': 32,
 '#AllOrNothingArsenal': 32,
 '#ThursdayThoughts': 31,
 '#mediumwriters': 30,
 '#StarMagicOrigins': 30,
 '#SuperPets': 30,
 '#CFMTL': 30,
 '#China,': 27,
 '#RBandME:': 27,
 '#findyourthing': 27,
 '#redbubble': 27,
 '#': 27,
 '#China’s': 26,
 '#Asia': 25,
 '#China.': 25,
 '#Taiwan,': 25,
 '#NATO': 23,
 '#News': 23,
 '#Beijing': 22,
 '#1': 21,
 '#FoxNews': 21,
 '#Europe': 21,
 '#PLA': 20,
 '#ChinaTaiwan': 20,
 '#Taiwanchina': 20,
 '#Biden': 20,
 '#TaiwanStrait': 19,
 '#COVID19': 18,
 '#usa': 17,
 '#5': 17,
 '#NFTs': 16,
 '

# Analisando usuários citados

In [252]:
users = []
for doc in raw_docs:
    for word in doc.split():
        if "@" == word[0]:
            users.append(word)

users = Counter(users) 

In [253]:
dict(sorted(users.items(), reverse= True,key=lambda item: item[1]))

{'@SpokespersonCHN': 424,
 '@POTUS': 392,
 '@SpeakerPelosi': 332,
 '@iingwen': 240,
 '@Reuters': 219,
 '@FoxNews': 196,
 '@MFA_China': 191,
 '@YouTube': 156,
 '@MahuiChina': 124,
 '@globaltimesnews': 121,
 '@zlj517': 108,
 '@CGMeifangZhang': 100,
 '@Fallen_x_King': 95,
 '@chenweihua': 94,
 '@zerohedge': 72,
 '@BBCWorld': 64,
 '@ACTBrigitte': 57,
 '@EndGameWW3': 56,
 '@caitoz': 56,
 '@CNN': 54,
 '@RepSwalwell': 52,
 '@HawleyMO': 48,
 '@narendramodi': 48,
 '@WilliamHOverhol': 45,
 '@nytimes': 45,
 '@CNBCPolitics': 44,
 '@DruidSmith': 43,
 '@JoeBiden': 41,
 '@jalebidaddy': 41,
 '@BreitbartNews': 40,
 '@SecBlinken': 40,
 '@MOFA_Taiwan': 40,
 '@newsmax': 39,
 '@jljzen': 38,
 '@MrMayfieldUSA': 37,
 '@AMFChina': 36,
 '@GOPLeader': 36,
 '@RR88656428': 36,
 '@': 35,
 '@thesiriusreport': 35,
 '@MoNDefense': 34,
 '@DrSJaishankar': 34,
 '@MarshaBlackburn': 34,
 '@robreiner': 33,
 '@MgMgrrrr8': 31,
 '@LeaderMcConnell': 31,
 '@MarkHertling': 30,
 '@marcorubio': 30,
 '@TimRunsHisMouth': 28,
 '@WhiteH

# Usuários mais presentes no conjunto

In [254]:
tweets.user.value_counts()

Liam66665          68
Naro08689546       41
RR88656428         41
jalebidaddy        36
Lezette_China      30
                   ..
Luna_Lasagna        1
little2a2000        1
hey_jai520          1
cosmoschronicle     1
norrightnorlef1     1
Name: user, Length: 14700, dtype: int64

# Treinando modelo de ML para extrair tópicos, gerando N tópicos fixos

In [255]:
#TRAINING
topic_model = BERTopic( n_gram_range = (1,2), min_topic_size = 20, nr_topics = 30)
topics, probabilities = topic_model.fit_transform(docs)

In [256]:
topic_model.get_topics()

{-1: [('china', 0.022769115195997315),
  ('taiwan', 0.00847679686470876),
  ('amp', 0.007637639710150083),
  ('russia', 0.00758834661399685),
  ('country', 0.007348770047617666),
  ('chinese', 0.007095967154093716),
  ('like', 0.007090728417381178),
  ('pelosi', 0.006716374808542541),
  ('world', 0.006691805795971846),
  ('people', 0.0066485835070949646)],
 0: [('taiwan', 0.051130686519629355),
  ('china', 0.02789200472158169),
  ('taiwan china', 0.018164447081741298),
  ('china taiwan', 0.016548298870519246),
  ('people', 0.012085074998007305),
  ('country', 0.011034564287387276),
  ('taiwanese', 0.010231428280813429),
  ('republic china', 0.009979541998995225),
  ('republic', 0.009896280068158994),
  ('war', 0.009831972436946801)],
 1: [('russia', 0.07448067282468993),
  ('china russia', 0.03676907002466812),
  ('russia china', 0.03624524021322181),
  ('china', 0.024117913755670285),
  ('small business', 0.012425037885896252),
  ('go', 0.012415695870715969),
  ('world', 0.01238752218

# Visualizando tópicos

In [257]:
topic_model.visualize_topics()

In [258]:
topic_model.visualize_barchart()

# Visualizando tópicos overtime

In [259]:
timestamps = [timestamp.split("+")[0] for timestamp in tweets.date.to_list()]

In [260]:
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps, nr_bins= 20)

In [261]:
topic_model.visualize_topics_over_time(topics_over_time, topics = [i for i in range(0,20)])

# Visualizando tópicos por usuário: Top 20 usuários com mais publicações relacionados com os tópicos

In [262]:
top_users = tweets.user.value_counts().index[0:20]

In [263]:
tweets['classes'] = tweets.user.apply(lambda x: 'Other user' if x not in top_users else x)

In [264]:
classes = tweets.classes.to_list()
topics_per_class = topic_model.topics_per_class(docs, topics, classes = classes)

In [265]:
topics_per_class.loc[topics_per_class['Class'] == 'Other user', 'Frequency'] = 0

In [266]:
topic_model.visualize_topics_per_class(topics_per_class)
