In [None]:
# Mount Google Drive at /content/gdrive (via the google.colab helper) so the
# later cells can read from and write to paths under that directory.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import pickle
import regex as re
from tqdm import tqdm

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data Collection

In [None]:
# Load the labeled tweets from the shared drive and discard the 'Unnamed: 0'
# column that is stored in the CSV.
df = pd.read_csv(
    "/content/gdrive/Shareddrives/tk2-e-anamedsos/labeled-all-data.csv"
).drop(columns=['Unnamed: 0'])

In [None]:
# Preview five random rows. No random_state is passed, so a different set of
# rows is shown on every run.
df.sample(5)

Unnamed: 0,id,tweet,stance
116079,1432379218885681154,@starfess udahh tadi pagi vaksin ke duaa,ABSTAIN
4310,1527305578544271361,/jbrfess/ Rek info vaksin booster jember kotaa...,ABSTAIN
37861,1365050492787650560,@kompascom sedekah vaksin ke yang membutuhkan.,ABSTAIN
1316,1527083317132021761,"@potatouwwu vaksin 3 dan vaksin 2, itu sama ya...",ABSTAIN
137114,1442897539590590470,"mau tidur madep kiri susah banget yaallah, gar...",ABSTAIN


# Demography Analysis

## Data Preparation

In [None]:
def clean_data(tweet):
    r"""Normalize one raw tweet into lowercase text made of plain words.

    Processing order:
      1. lowercase the text;
      2. strip a leading ``b'`` marker and a leading ``rt `` marker;
      3. remove escaped byte sequences of the form ``\xNN`` (emoji residue);
      4. remove URLs (``www.*`` and ``http(s)://*``);
      5. squeeze any run of a repeated character down to two characters
         (``haiiii`` -> ``haii``);
      6. remove ``@mentions`` and ``#hashtags``;
      7. remove punctuation, then replace digit runs with a space;
      8. collapse whitespace runs to a single space and trim both ends.

    Whitespace is normalized last on purpose: the removals in steps 6-7 leave
    gaps behind, so normalizing earlier would return text with leading or
    doubled spaces.

    Args:
        tweet: The raw tweet text. Non-string values (for example ``None`` or
            the float NaN that pandas uses for missing cells) are treated as
            an empty tweet.

    Returns:
        The cleaned tweet as a string; ``''`` when nothing is left.
    """
    # Missing values cannot be lowercased; treat them as an empty tweet.
    if not isinstance(tweet, str):
        return ''

    # lowercase
    normal_tw = tweet.lower()
    # remove the b' prefix left over from a stringified bytes object
    normal_tw = re.sub(r'^b\'', '', normal_tw)
    # remove the retweet marker
    normal_tw = re.sub(r'^rt ', '', normal_tw)
    # remove escaped bytes such as \xf0 (emoji residue)
    normal_tw = re.sub(r'\\x.{2}', '', normal_tw)
    # remove www.* or https?://* (URLs)
    normal_tw = re.sub(r'((www\.[^\s]*)|(https?://[^\s]*))', '', normal_tw)
    # squeeze repeated characters, e.g. "haiiii" -> "haii" (for unigram features)
    normal_tw = re.sub(r'(.)\1{1,}', r'\1\1', normal_tw)
    # remove @username
    normal_tw = re.sub(r'@[^\s]+', '', normal_tw)
    # remove hashtags
    normal_tw = re.sub(r'#[^\s]+', '', normal_tw)
    # remove punctuation
    normal_tw = re.sub(r'[^\w\s]', '', normal_tw)
    # replace numbers with a space so neighbouring words are not glued together
    normal_tw = re.sub(r'\d+', ' ', normal_tw)
    # collapse the gaps left by the removals above and trim both ends
    normal_tw = re.sub(r'\s+', ' ', normal_tw).strip()
    return normal_tw

def _first_match_lookup(keys, values):
    """Map each string key to the value from the first row where it appears."""
    lookup = {}
    for key, value in zip(keys, values):
        # Non-string keys (e.g. NaN from an empty CSV cell) can never equal a
        # token, so they are left out of the table.
        if isinstance(key, str):
            lookup.setdefault(key, value)
    return lookup


def _build_normalization_lookups():
    """Convert the global lexicon DataFrames into constant-time lookup tables."""
    abbreviations = _first_match_lookup(df_kamus_singkatan['singkatan'].values,
                                        df_kamus_singkatan['asli'].values)
    slang = _first_match_lookup(df_kamus_alay['slang'].values,
                                df_kamus_alay['formal'].values)
    stopword_set = {word for word in stopwords[0].values if isinstance(word, str)}
    return abbreviations, slang, stopword_set


def remove_stopwords_and_normalize(tweet, lookups=None):
    """Expand abbreviations and slang in a cleaned tweet, then drop stopwords.

    Each token is first replaced by its entry in the abbreviation dictionary
    (``df_kamus_singkatan``: 'singkatan' -> 'asli'), the result is then replaced
    by its entry in the slang lexicon (``df_kamus_alay``: 'slang' -> 'formal'),
    and the final token is kept only if it is not in ``stopwords``. When a key
    occurs in several rows, the first row wins.

    Args:
        tweet: Cleaned tweet text, tokenized with ``nltk.word_tokenize``.
        lookups: Optional ``(abbreviations, slang, stopword_set)`` tuple as
            returned by ``_build_normalization_lookups()``. When omitted the
            tables are rebuilt from the global DataFrames on every call, which
            is correct but slow; pass them in when processing many tweets.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if lookups is None:
        lookups = _build_normalization_lookups()
    abbreviations, slang, stopword_set = lookups

    kept_tokens = []
    for token in nltk.word_tokenize(tweet):
        token = abbreviations.get(token, token)
        token = slang.get(token, token)
        if token not in stopword_set:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)


def pra_pemrosesan(list_tweet):
    """Run the full preprocessing pipeline over an iterable of raw tweets.

    Every tweet goes through ``clean_data`` and then
    ``remove_stopwords_and_normalize``. The lexicon lookup tables are built
    once up front instead of scanning the lexicon DataFrames for every token.

    Args:
        list_tweet: Iterable of raw tweet strings.

    Returns:
        A list with one preprocessed string per input tweet, in input order.
    """
    lookups = _build_normalization_lookups()
    return [
        remove_stopwords_and_normalize(clean_data(tw), lookups)
        for tw in tqdm(list_tweet)
    ]

In [None]:
# Download the resource files from Google Drive into the working directory:
# dataset.csv, stopwordsID.csv, kamus_singkatan.csv (abbreviation dictionary)
# and colloquial-indonesian-lexicon.csv (slang lexicon).
# NOTE(review): --no-check-certificate turns off TLS certificate verification;
# confirm it is really needed. dataset.csv is not read by any visible cell.
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1rFLB_1QpZhKyWFGxqPo5znTEkh7dUhPf' -O dataset.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xEIQwYre1SR71uRdQuez9MDmAUIysvAG' -O stopwordsID.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=19NOzXA8Voturopg_DTuiMns3s4M2IAUz' -O kamus_singkatan.csv
!wget -q --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VjgivEr1pxyRCuyhVifPnaReFz0yd8Us' -O colloquial-indonesian-lexicon.csv

In [None]:
# Lexicon resources. remove_stopwords_and_normalize() reads these three names
# as globals, so they must be defined before pra_pemrosesan() is called.
stopwords = pd.read_csv("stopwordsID.csv", header=None)
df_kamus_singkatan = pd.read_csv('kamus_singkatan.csv')
df_kamus_alay = pd.read_csv('colloquial-indonesian-lexicon.csv')

# Preprocess every tweet in the 'tweet' column and keep the results as a
# NumPy array.
raw_tweet = df['tweet']
clean_tweet = np.array(pra_pemrosesan(raw_tweet))

100%|██████████| 161864/161864 [19:18<00:00, 139.73it/s]


In [None]:
# Store the preprocessed text next to the raw tweets.
df['clean_tweet'] = clean_tweet

In [None]:
# Save the preprocessed frame to Drive (without the index column).
df.to_csv('/content/gdrive/MyDrive/Kelompok E Anamedsos/CODE/Pertanyaan 1/df_oac.csv', index=False)

## Job Area Prediction

Load the pickled vectorizer used to turn tweets into feature vectors.

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load files that come
# from a trusted source.
with open('/content/gdrive/MyDrive/Kelompok E Anamedsos/CODE/Pertanyaan 1/vectorizer_tweet_oac.sav', 'rb') as vectorizer_file:
    vectorizer_tweet = pickle.load(vectorizer_file)

Load the pickled model used for occupational (job) area prediction.

In [None]:
# NOTE: pickle.load can execute arbitrary code, so only load files that come
# from a trusted source.
with open('/content/gdrive/MyDrive/Kelompok E Anamedsos/CODE/Pertanyaan 1/oac_best_model.sav', 'rb') as model_file:
    oac_model = pickle.load(model_file)

In [None]:
# Predict the job area in fixed-size batches so that only one dense feature
# matrix is held in memory at a time.
BATCH_SIZE = 5000
n_tweets = len(df['clean_tweet'])
predictions = []

for btm_idx in range(0, n_tweets, BATCH_SIZE):
    # Exclusive upper bound of this batch, clamped to the number of tweets.
    top_idx = min(btm_idx + BATCH_SIZE, n_tweets)
    print(btm_idx, top_idx - 1)

    # Positional slicing: picks the right rows even if df does not carry the
    # default 0..n-1 index (label-based .loc slicing would not).
    list_tweet = df['clean_tweet'].iloc[btm_idx:top_idx].tolist()

    # Vectorize the batch. toarray() returns a plain ndarray, whereas
    # todense() returns np.matrix, which recent scikit-learn releases reject
    # as estimator input (assuming oac_model is a scikit-learn estimator).
    X = vectorizer_tweet.transform(list_tweet).toarray()

    # Predict the job area for this batch and keep the per-batch array.
    predictions.append(oac_model.predict(X))

    # Release the dense matrix before the next batch is built.
    del X

0 4999
5000 9999
10000 14999
15000 19999
20000 24999
25000 29999
30000 34999
35000 39999
40000 44999
45000 49999
50000 54999
55000 59999
60000 64999
65000 69999
70000 74999
75000 79999
80000 84999
85000 89999
90000 94999
95000 99999
100000 104999
105000 109999
110000 114999
115000 119999
120000 124999
125000 129999
130000 134999
135000 139999
140000 144999
145000 149999
150000 154999
155000 159999
160000 161863


In [None]:
# Join the per-batch prediction arrays into one flat array with a single call.
# Growing the array batch by batch copies all earlier results every time, and
# seeding it with an empty float array forces a float-to-string dtype promotion.
# np.concatenate raises on an empty list, hence the explicit fallback.
list_prediction = np.concatenate(predictions) if len(predictions) > 0 else np.array([])

In [None]:
# Attach the predicted job area to each tweet; list_prediction must hold
# exactly one entry per row of df.
df['job_area'] = list_prediction

In [None]:
# Save the frame with the predicted job areas to Drive (without the index column).
df.to_csv('/content/gdrive/MyDrive/Kelompok E Anamedsos/CODE/Pertanyaan 1/df_oac_pred.csv', index=False)

# Answer for Question 1

In [None]:
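# NOTE: leftover scratch code, kept for reference only. It is never executed
# and refers to names (prop_list, validation, res_list) that are not defined
# in the visible cells.
# for prop in prop_list: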
#   res = validation[prop].value_counts(normalize=True).rename_axis('label').reset_index(name='percentage')
#   res['property'] = prop
#   res_list.append(res)

# comp_summary = pd.concat(res_list, ignore_index=True, sort=False)

# # transform value of 1 and 0
# dict_map = {1: 'Complete', 0: 'Incomplete'}
# comp_summary['label'] = comp_summary['label'].map(dict_map)

In [None]:
# Keep only the tweets whose predicted job area is 'kesehatan' (health).
df_health = df[df['job_area']=='kesehatan']

In [None]:
# Number of tweets per stance as a two-column frame ('label', 'count'),
# ordered from the most to the least frequent stance.
res_df = df_health['stance'].value_counts().rename_axis('label').reset_index(name='count')

In [None]:
# Relative frequency of each stance. normalize=True yields fractions between
# 0 and 1, so the 'percentage' column is not scaled to 0-100.
# NOTE(review): `test` is not used by any visible cell below.
test = df_health['stance'].value_counts(normalize=True).rename_axis('label').reset_index(name='percentage')

In [None]:
import plotly.graph_objects as go

In [None]:
# Fixed color per stance label so each stance always gets the same color.
colors = {
    "ABSTAIN": "#f9a73e",
    "PRO": "#006f3c",
    "KONTRA": "#bf212f",
}

# marker.colors is applied to the slices by position, not by label. Build the
# color list in the same order as res_df['label'] (which follows the counts),
# otherwise a stance can be drawn in another stance's color. Labels without a
# configured color fall back to gray.
slice_colors = [colors.get(label, "#7f7f7f") for label in res_df['label']]

# Donut chart of the stance distribution among tweets predicted as healthcare.
fig = go.Figure(data=[go.Pie(labels=res_df['label'], values=res_df['count'], hole=.3)],
                layout_title_text='Percentage of Stance from Healthcare Job Area')

fig.update_traces(hoverinfo='label+percent', textinfo='label+value+percent', textfont_size=12,
                  marker=dict(colors=slice_colors))
fig.show()