In [1]:
# when using google drive
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
import os
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import pickle
import regex as re
from tqdm import tqdm
from tqdm.notebook import tqdm

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Load Data

In [3]:
data_stance = pd.read_csv("/content/gdrive/Shareddrives/tk2-e-anamedsos/labeled-all-data.csv")

In [4]:
data_stance.head()

Unnamed: 0,id,tweet,stance
0,1285640199540805634,"rt @blogdokter: meminta maaf sih mudah banget,...",ABSTAIN
1,1285640514872782852,dukungan aleg golkar jadi kelinci percobaan va...,PRO
2,1285640881089974273,"rt @blogdokter: meminta maaf sih mudah banget,...",ABSTAIN
3,1285643172274737152,rt @alvinlie21: ombudsman jelaskan kelinci per...,PRO
4,1285645639515279361,rt @tawakalback: dukungan aleg golkar jadi kel...,PRO


In [5]:
data_stance_pro = data_stance[data_stance['stance']=='PRO']

# Data Cleaning

In [6]:
def clean_data(tweet):
    # lowercase
    normal_tw = tweet.lower()
    # hapus b'
    normal_tw = re.sub(r'^b\'', '', normal_tw)
    # hapus RT
    normal_tw = re.sub(r'^rt ', '', normal_tw)
    # hapus emoji
    normal_tw = re.sub(r'\\x.{2}', '', normal_tw)
    # hapus www.* atau https?://* (URL)
    normal_tw = re.sub(r'((www\.[^\s]*)|(https?://[^\s]*))', '', normal_tw)
    # remove spasi berlebih
    normal_tw = re.sub(r'\s+', ' ', normal_tw)
    # trim depan belakang
    normal_tw = normal_tw.strip()
    # regex huruf yang berulang kaya haiiii (untuk fitur unigram)
    normal_regex = re.compile(r"(.)\1{1,}")
    # buang huruf yang berulang
    normal_tw = normal_regex.sub(r"\1\1", normal_tw)
    # hapus @username
    normal_tw = re.sub(r'@[^\s]+', '', normal_tw)
    # hapus hashtag
    normal_tw = re.sub(r'#[^\s]+', '', normal_tw)
    # hapus tanda baca
    normal_tw = re.sub(r'[^\w\s]', '', normal_tw) 
    # hapus angka
    normal_tw = re.sub(r'\d+', ' ', normal_tw) 
    return normal_tw

def remove_stopwords_and_normalize(tweet):
    token = nltk.word_tokenize(tweet)
    token_new = []
    for k in token:
        if k in df_kamus_singkatan['singkatan'].values:
            k = df_kamus_singkatan.loc[df_kamus_singkatan['singkatan']
                                       == k, 'asli'].values[0]
        if k in df_kamus_alay['slang'].values:
            k = df_kamus_alay.loc[df_kamus_alay['slang']
                                  == k, 'formal'].values[0]
        if k not in stopwords[0].values:
            token_new.append(k)

    str_clean = ' '.join(token_new)
    return str_clean

def pra_pemrosesan(list_tweet):
    tweet_clean = []
    for tw in tqdm(list_tweet):
        normal_tweet = clean_data(tw)
        normal_tweet = remove_stopwords_and_normalize(normal_tweet)
        tweet_clean.append(normal_tweet)
    return tweet_clean

In [7]:
# Mengambil data
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1rFLB_1QpZhKyWFGxqPo5znTEkh7dUhPf' -O dataset.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xEIQwYre1SR71uRdQuez9MDmAUIysvAG' -O stopwordsID.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=19NOzXA8Voturopg_DTuiMns3s4M2IAUz' -O kamus_singkatan.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VjgivEr1pxyRCuyhVifPnaReFz0yd8Us' -O colloquial-indonesian-lexicon.csv

In [8]:
stopwords = pd.read_csv("stopwordsID.csv", header=None)
df_kamus_singkatan = pd.read_csv('kamus_singkatan.csv')
df_kamus_alay = pd.read_csv('colloquial-indonesian-lexicon.csv')

raw_tweet = data_stance_pro['tweet']
clean_tweet = np.array(pra_pemrosesan(raw_tweet))

  0%|          | 0/44668 [00:00<?, ?it/s]

# Answer for Bonus Question 1

In [11]:
# source: https://covid19.go.id/tentang-vaksin-covid19
vaccine_type_dict = {
    'sinovac': ['sinovac'],
    'astrazeneca': ['astrazeneca', 'astra', 'az'],
    'sinopharm': ['sinopharm'],
    'moderna': ['moderna'],
    'pfizer': ['pfizer'],
    'novavax': ['novavax'],
    'sputnik-v': ['sputnik-v', 'sputnik'],
    'janssen': ['janssen'],
    'convidencia': ['convidencia'],
    'zifivax': ['zifivax'],
}

In [12]:
vaccine_count_dict = {key: 0 for key in vaccine_type_dict.keys()}

# for each tweet
for tweet in tqdm(clean_tweet):

  # check the occurence for each vaccine type by its common name in society
  for vaccine, vaccine_list in vaccine_type_dict.items():

    # check for each common name
    for vaccine_type in vaccine_list:

      # count if exist and pass to another vaccine type
      if vaccine_type in tweet:
        vaccine_count_dict[vaccine] += 1
        break

  0%|          | 0/44668 [00:00<?, ?it/s]

In [13]:
vaccine_count_dict

{'astrazeneca': 1213,
 'convidencia': 0,
 'janssen': 10,
 'moderna': 173,
 'novavax': 25,
 'pfizer': 669,
 'sinopharm': 128,
 'sinovac': 1619,
 'sputnik-v': 91,
 'zifivax': 66}

In [14]:
vaccine_count_df = pd.DataFrame({
    'type': list(vaccine_count_dict.keys()),
    'count': list(vaccine_count_dict.values())
})
vaccine_count_df.sort_values('count', inplace=True)
vaccine_count_df

Unnamed: 0,type,count
8,convidencia,0
7,janssen,10
5,novavax,25
9,zifivax,66
6,sputnik-v,91
2,sinopharm,128
3,moderna,173
4,pfizer,669
1,astrazeneca,1213
0,sinovac,1619


In [15]:
fig = px.bar(vaccine_count_df,
             x="count",
             y="type",
             text_auto=True,
             title='Number of Vaccine Type on Pro Tweets')
fig.show()