In [1]:
import json
import pandas as pd
import glob
import random
from maleo.wizard import Wizard
from nltk.tokenize import word_tokenize
from nltk import ngrams, FreqDist
from tqdm import tqdm

In [2]:
def read_json(path):
    """Parse the JSON document stored at ``path`` and return the result.

    Parameters
    ----------
    path : str
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which can corrupt or reject non-ASCII text.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
def get_data(list_path):
    """Read every JSON file in ``list_path`` and gather the text of all tweets.

    Each file is expected to be a JSON object with a ``'tweets'`` list whose
    items each carry a ``'full_text'`` entry.

    Parameters
    ----------
    list_path : iterable of str
        Paths of the JSON files to load.

    Returns
    -------
    tuple
        ``(data, df_tweets)`` where ``data`` is the parsed content of the
        LAST file that was read (``None`` when ``list_path`` is empty) and
        ``df_tweets`` is a ``pd.Series`` holding the ``'full_text'`` of every
        tweet from every file, in read order.
    """
    list_tweets = []
    # Bind `data` up front so an empty `list_path` returns (None, empty Series)
    # instead of raising UnboundLocalError at the return statement.
    data = None
    for user_path in tqdm(list_path):
        data = read_json(user_path)
        list_tweets.extend(twt['full_text'] for twt in data['tweets'])

    df_tweets = pd.Series(list_tweets)
    return data, df_tweets

In [4]:
def get_all_hashtag(list_tweets):
    """Extract hashtags from the tweets and flatten them into a single list.

    The extraction itself is delegated to ``Wizard.get_hashtag``; only the
    ``'Hashtag'`` entry of its result is used.

    Parameters
    ----------
    list_tweets : collection of tweet texts passed straight to
        ``Wizard.get_hashtag``.

    Returns
    -------
    tuple
        ``(n_twt_use_hashtag, all_hashtag)``: the length of the ``'Hashtag'``
        result and a flat list with every element of every entry in it.

    NOTE(review): the first value is simply ``len`` of the ``'Hashtag'``
    result; whether that equals the number of tweets that actually contain a
    hashtag depends on what ``Wizard.get_hashtag`` returns -- confirm.
    """
    hashtag_column = Wizard().get_hashtag(list_tweets)['Hashtag']
    flattened = [tag for tags in hashtag_column for tag in tags]
    return len(hashtag_column), flattened

In [5]:
def data_preprocessing(data):
    """Run the text-cleaning pipeline over ``data`` and lowercase the result.

    The ``Wizard`` steps are applied in this fixed order: ``rm_link``,
    ``rm_non_ascii``, ``rm_char``, ``rm_punc``, ``rm_multiple_space``,
    ``rm_stopword``. Afterwards every element is lowercased with
    ``str.lower``.

    Parameters
    ----------
    data : collection of tweet texts accepted by the ``Wizard`` methods.

    Returns
    -------
    The cleaned, lowercased texts as returned by ``.apply(str.lower)``.

    NOTE(review): lowercasing happens after ``rm_stopword``; if that step is
    case-sensitive, capitalised stopwords would survive -- confirm against
    the Wizard implementation.
    """
    cleaner = Wizard()
    pipeline = (
        cleaner.rm_link,
        cleaner.rm_non_ascii,
        cleaner.rm_char,
        cleaner.rm_punc,
        cleaner.rm_multiple_space,
        cleaner.rm_stopword,
    )

    cleaned = data
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned.apply(str.lower)

In [6]:
# Directory holding one JSON file per account.
data_path = '../data/200_buzzer/'
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the file sequence (and anything derived from "the last file") is the same
# on every run and every machine.
all_buzzer = sorted(glob.glob(data_path+'*.json'))

In [7]:
# Load every account file. df_data is the Series of all tweets' 'full_text';
# raw_data is only the parsed JSON of the last file that get_data read.
raw_data, df_data = get_data(all_buzzer)

100%|██████████| 199/199 [00:42<00:00,  4.66it/s]


In [8]:
# Report how many tweets were collected ("Jumlah tweets" = "number of tweets").
print('Jumlah tweets=', len(df_data))

Jumlah tweets= 41533


In [9]:
# Extract hashtags from the raw (uncleaned) tweets: all_hashtag is the flat
# list of hashtags, n_twt_use_hashtag the length of the extraction result.
n_twt_use_hashtag, all_hashtag = get_all_hashtag(df_data)

In [13]:
# Clean and lowercase the tweets; df_data itself is left as loaded.
clean_data = data_preprocessing(df_data)

In [41]:
# Tokenize every cleaned tweet and count token frequencies over the corpus.
all_words = []

for dt in tqdm(clean_data):
    # extend() appends in place. Rebuilding the list with `+` on each tweet
    # copies everything accumulated so far, which is quadratic overall.
    all_words.extend(word_tokenize(dt))

all_counts = FreqDist(all_words)

100%|██████████| 41533/41533 [02:29<00:00, 277.57it/s]


In [43]:
# Number of distinct tokens counted in all_counts.
len(all_counts)

57446

In [42]:
# Ten most frequent tokens with their counts.
all_counts.most_common(10)

[('yg', 8758),
 ('amp', 3363),
 ('indonesia', 2819),
 ('gak', 2501),
 ('the', 2149),
 ('jokowi', 2062),
 ('covid', 2008),
 ('19', 1929),
 ('rakyat', 1860),
 ('aja', 1764)]

In [44]:
# Count how often each hashtag occurs (hashtags come from the raw tweets, so
# their original casing is kept), then show the ten most frequent.
hashtag_freq = FreqDist(all_hashtag)
hashtag_freq.most_common(10)

[('ReformasiDikorupsi', 254),
 ('ShopeeID', 171),
 ('RakyatDukungNewPresident', 156),
 ('COVID19', 152),
 ('BersatuLawanCovid19', 137),
 ('TolakOmnibusLaw', 136),
 ('SolidaritasLawanCorona', 128),
 ('TangkisKomunis', 120),
 ('PerempuanBerdaulat', 119),
 ('BonekaSial', 117)]

In [45]:
# Save the 1000 most frequent tokens as a two-column CSV without the index.
(
    pd.DataFrame(all_counts.most_common(1000), columns=['word', 'count'])
    .to_csv('../result/word_freq.csv', index=False)
)

In [46]:
# Save the 1000 most frequent hashtags as a two-column CSV without the index.
(
    pd.DataFrame(hashtag_freq.most_common(1000), columns=['hashtag', 'count'])
    .to_csv('../result/hashtag_freq.csv', index=False)
)