In [None]:
import os
import re
import json
import glob
import pickle
import string
import itertools
import collections
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

In [None]:
tqdm.pandas()

In [None]:
pd.set_option('display.max_rows', 10)

In [None]:
# input path
SUPPORT_PATH = "../data/supports"

In [None]:
def read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so this should only be pointed at files produced by a trusted pipeline.
    """
    with open(path, mode='rb') as pickle_file:
        payload = pickle.load(pickle_file)
    return payload

In [None]:
# Load the two pre-parsed tweet tables stored under SUPPORT_PATH
# (presumably DataFrames, since they are concatenated with pd.concat afterwards).
# NOTE(review): the variable is called d_6725 but the file is parsed_7003.pkl —
# confirm which count is current and align the names.
d_199 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_199.pkl"))
d_6725 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_7003.pkl"))

In [None]:
# Stack both tweet tables, keep the first row seen for each tweet id,
# and renumber the index from 0.
d_tweets = (
    pd.concat([d_199, d_6725], axis=0, sort=False)
    .drop_duplicates(subset="id_tweet")
    .reset_index(drop=True)
)

In [None]:
d_label = pd.read_csv("../data/account_labeled/label_updated.csv")

In [None]:
d_label.head()

In [None]:
d_label.shape

In [None]:
d_tweets.head()

In [None]:
d_tweets.shape

In [None]:
# Keep only the accounts that actually carry a label.
d_label = d_label[d_label.label.notna()]

In [None]:
d_label.head()

In [None]:
d_label.label.value_counts()

In [None]:
# 2797 accounts are labeled, the rest are excluded before parsing
d_tweets[d_tweets.screen_name.isin(d_label.screen_name)].screen_name.unique().shape

In [None]:
# One row per account: collect every tweet text of a screen_name into a list,
# then attach the label columns by matching on screen_name (left join, so
# accounts without a label row keep NaN there).
d_dataset = (
    d_tweets.groupby('screen_name')['full_text']
    .apply(list)
    .reset_index()
    .join(d_label.set_index('screen_name'), on='screen_name')
)

In [None]:
def get_profile_key(screen_name, key, profile_dir="../data/profile"):
    """Return one field from an account's saved profile JSON.

    Parameters
    ----------
    screen_name : str
        Account name; the profile is read from
        ``<profile_dir>/<screen_name>.json``.
    key : str
        Top-level key to look up in the profile object.
    profile_dir : str, optional
        Directory holding the profile files. Defaults to the location this
        notebook has always read from.

    Returns
    -------
    object or None
        The value stored under ``key``. ``None`` when the file does not
        exist, the JSON document is not an object, or the key is absent.
    """
    profile_path = os.path.join(profile_dir, f"{screen_name}.json")
    if not os.path.exists(profile_path):
        return None

    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding (profile descriptions routinely contain non-ASCII text).
    with open(profile_path, encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, dict):
        return None
    return data.get(key)

In [None]:
# get verified
d_dataset["is_verified"] = d_dataset.screen_name.progress_apply(get_profile_key, args=('verified', ))

In [None]:
d_dataset.is_verified.value_counts()

In [None]:
# get description profile
d_dataset["profile_description"] = d_dataset.screen_name.progress_apply(get_profile_key, args=('description', ))

In [None]:
def check_akun_resmi(description):
    """Tell whether a profile description declares an official account.

    The check is case-insensitive and looks for "akun" followed on the same
    line by "resmi", either directly ("akun resmi") or with other words in
    between ("akun twitter resmi").

    Parameters
    ----------
    description : str or None
        Profile description text. Anything that is not a non-empty string
        (``None``, ``""``, NaN) is treated as "no description".

    Returns
    -------
    bool
    """
    if not isinstance(description, str) or not description:
        return False

    # The in-between words are optional: the previous pattern "akun .* resmi"
    # needed two spaces and therefore never matched the plain "akun resmi".
    return re.search(r"akun (?:.* )?resmi", description.lower()) is not None

In [None]:
d_dataset["is_akun_resmi"] = d_dataset.profile_description.apply(check_akun_resmi)

In [None]:
# Number of tweets collected for each account.
d_dataset['num_tweets'] = d_dataset.full_text.apply(len)

In [None]:
d_dataset.label.value_counts()

In [None]:
# Keep accounts with at least 20 tweets. .copy() makes d_dataset2 an
# independent frame, so the columns assigned to it later are not written to a
# slice of d_dataset (SettingWithCopyWarning, possibly lost writes).
d_dataset2 = d_dataset[d_dataset.num_tweets >= 20].copy()

In [None]:
d_dataset.shape

In [None]:
d_dataset2.shape

In [None]:
d_dataset2.label.value_counts()

In [None]:
# using n samples
n_tweets = 30
d_dataset2['text_used'] = d_dataset2.full_text.apply(lambda x: " ".join(x[:n_tweets]))

In [None]:
d_dataset2.shape

In [None]:
# Drop accounts without a label. .copy() detaches the result from the frame
# it was sliced from, so the columns added to d_dataset2 afterwards are plain
# assignments rather than writes to a slice.
d_dataset2 = d_dataset2[d_dataset2.label.notna()].copy()

In [None]:
d_dataset2.reset_index(drop=True, inplace=True)

In [None]:
d_dataset2.shape

In [None]:
def text_cleansing(title):
    """Normalize raw tweet text into a space-separated string of word tokens.

    Steps, in order:

    1. drop hyperlinks (``http://`` and ``https://``) and ``@mentions``;
    2. split camelCase runs, e.g. the hashtag ``#BeritaTerkini`` becomes
       ``Berita Terkini``;
    3. lowercase;
    4. replace punctuation and every other non-word character with a space
       (``<`` and ``>`` are kept, as before);
    5. collapse whitespace;
    6. tokenize with ``word_tokenize`` and keep tokens longer than two
       characters that are not purely numeric.

    Parameters
    ----------
    title : str
        Raw text (here: several tweets joined by spaces).

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    punctuation = '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~'
    table = str.maketrans(punctuation, ' ' * len(punctuation))  # punctuation -> space

    # Links and mentions must go BEFORE the camelCase split: otherwise
    # "https://t.co/aBcdeF" is broken into "https://t.co/a Bcde F" and
    # "@UserName" into "@User Name", and the tails survive as junk tokens.
    # "https?" also covers plain http links, which the old "http(s)" missed.
    title = re.sub(r'https?://\S+', ' ', title, flags=re.IGNORECASE)
    title = re.sub(r'@\w+', ' ', title)

    # parse hashtag: insert a space at every lower->upper case boundary
    title = re.sub(r"([a-z])([A-Z])", r"\1 \2", title)
    title = title.lower()

    # remove punctuation, then anything that is not a word character or < >
    title = title.translate(table)
    title = re.sub(r"[^\w<>]", ' ', title)

    # collapse runs of whitespace and trim the ends
    title = re.sub(r'\s+', ' ', title).strip()

    # keep tokens longer than 2 characters that are not numbers
    tokens = [
        token
        for token in word_tokenize(title)
        if len(token) > 2 and not token.isnumeric()
    ]

    return " ".join(tokens)

In [None]:
d_dataset2["preprocessed_text"] = d_dataset2.text_used.apply(text_cleansing)

In [None]:
d_dataset2

In [None]:
# Store labels as integers (the NaN labels were filtered out earlier).
d_dataset2['label'] = d_dataset2['label'].astype(int)

In [None]:
d_dataset2.label.value_counts()

In [None]:
d_dataset2.label.value_counts().plot(kind='pie', autopct='%.2f')

#### Tokenizing and Building the Vocabulary

In [None]:
d_dataset2.head()

In [None]:
d_dataset2.shape

In [None]:
d_dataset2['preprocessed_text_token'] = d_dataset2.preprocessed_text.apply(word_tokenize)

In [None]:
d_dataset2.head()

In [None]:
vocab = itertools.chain.from_iterable(d_dataset2.preprocessed_text_token.tolist())

In [None]:
vocab = sorted(set(list(vocab)))

In [None]:
# Reserve id 0 for padding and id 1 for unknown tokens. Filtering the special
# tokens out first keeps this cell re-runnable: without it every re-run would
# prepend another 'PAD'/'UNK' pair and shift all token ids.
vocab = ['PAD', 'UNK'] + [token for token in vocab if token not in ('PAD', 'UNK')]

### LSTM Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Fixed seed so the split, and every metric computed from it, is reproducible
# on Restart & Run All; stratify keeps the label ratio equal in both parts.
d_train, d_test = train_test_split(d_dataset2, random_state=42, stratify=d_dataset2.label)

In [None]:
d_train.reset_index(drop=True, inplace=True)
d_test.reset_index(drop=True, inplace=True)

In [None]:
class TwitterDataset(Dataset):
    """Torch dataset serving ``(token-id tensor, label)`` pairs.

    Both frames are held at once; ``set_split`` selects which one
    ``__len__`` and ``__getitem__`` read from (``"train"`` initially).
    Each frame must have a 0..n-1 integer index, because rows are fetched
    with ``.loc[index]``, and the columns ``preprocessed_text_token``
    (list of str) and ``label``.

    Parameters
    ----------
    d_train, d_test : pandas.DataFrame
        Training and test rows.
    vocabulary : sequence of str, optional
        Token list whose positions are the token ids. Defaults to the
        notebook-level ``vocab``. It must contain ``'UNK'``; that id is
        used for any token missing from the vocabulary.

    Raises
    ------
    ValueError
        If the vocabulary has no ``'UNK'`` entry.
    """

    def __init__(self, d_train, d_test, vocabulary=None):
        if vocabulary is None:
            vocabulary = vocab  # notebook-level vocabulary list

        # Build the token -> id lookup once. The previous vocab.index(token)
        # scanned the whole vocabulary list for every single token.
        self.token_to_index = {}
        for position, token in enumerate(vocabulary):
            # setdefault keeps the first position, exactly like list.index()
            self.token_to_index.setdefault(token, position)

        if 'UNK' not in self.token_to_index:
            raise ValueError("vocabulary must contain the 'UNK' token")
        self.unk_index = self.token_to_index['UNK']

        self.dataset = {
            'train': (d_train, len(d_train)),
            'test': (d_test, len(d_test))
        }

        self.set_split(split="train")

    def set_split(self, split="train"):
        """Select which frame (``"train"`` or ``"test"``) is served."""
        self.data, self.length = self.dataset[split]

    def __getitem__(self, index):
        """Return ``(LongTensor of token ids, label)`` for row ``index``."""
        tokens = self.data.loc[index, 'preprocessed_text_token']
        label = self.data.loc[index, 'label']

        # Out-of-vocabulary tokens map to UNK instead of raising.
        token_ids = [self.token_to_index.get(token, self.unk_index) for token in tokens]
        token_ids = torch.tensor(token_ids, dtype=torch.long)

        return (token_ids, label)

    def __len__(self):
        """Number of rows in the currently selected split."""
        return self.length

In [None]:
class Architecture(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid classifier for token-id sequences.

    Parameters
    ----------
    num_vocab : int
        Number of rows in the embedding table (vocabulary size).
    emb_size : int
        Embedding dimension.
    hidden_size : int
        LSTM hidden size per direction.
    num_layer : int
        Number of stacked LSTM layers.
    dropout : float
        Dropout between LSTM layers (passed straight to ``nn.LSTM``).
    is_bidirectional : bool
        Whether the LSTM runs in both directions.
    padding_idx : int, optional
        Token id used for right-padding. Defaults to 0, which is the fill
        value ``pad_sequence`` uses by default.

    ``forward`` takes a ``(batch, seq_len)`` LongTensor right-padded with
    ``padding_idx`` and returns a ``(batch, 1)`` tensor of probabilities.
    """

    def __init__(self, num_vocab, emb_size, hidden_size, num_layer, dropout, is_bidirectional, padding_idx=0):
        super(Architecture, self).__init__()
        self.padding_idx = padding_idx
        self.is_bidirectional = is_bidirectional

        # Kept as `self.model` (embedding at [0], LSTM at [1]) so parameter
        # names in the state dict stay the same.
        self.model = nn.Sequential(
            # size the table from the argument, not from the global `vocab`
            nn.Embedding(num_vocab, emb_size, padding_idx=padding_idx),
            nn.LSTM(emb_size, hidden_size, num_layers=num_layer, batch_first=True, dropout=dropout, bidirectional=is_bidirectional)
        )

        # a bidirectional LSTM yields one final state per direction
        num_directions = 2 if is_bidirectional else 1
        self.fc = nn.Linear(hidden_size * num_directions, 1)

    def forward(self, input_):
        embedding, lstm = self.model[0], self.model[1]

        # True length of every sequence = number of non-padding ids.
        # clamp(min=1): packing rejects zero-length sequences.
        lengths = (input_ != self.padding_idx).sum(dim=1).clamp(min=1).cpu()

        # Pack so the LSTM stops at each sequence's real end. Reading
        # out[:, -1, :] on a padded batch returned the state after running
        # over padding for every sequence shorter than the longest one.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedding(input_), lengths, batch_first=True, enforce_sorted=False
        )
        _, (h, _) = lstm(packed)

        # h is (num_layers * num_directions, batch, hidden); the last one or
        # two entries are the final states of the top layer.
        if self.is_bidirectional:
            last_state = torch.cat((h[-2], h[-1]), dim=1)
        else:
            last_state = h[-1]

        out = self.fc(last_state)
        out = torch.sigmoid(out)

        return out

In [None]:
dataset = TwitterDataset(d_train, d_test)

In [None]:
x, y = dataset.__getitem__(0)

In [None]:
num_vocab = len(vocab)
emb_size = 256
hidden_size = 128
num_layer = 1
dropout = 0
is_bidirectional=False

In [None]:
model = Architecture(num_vocab, emb_size, hidden_size, num_layer, dropout, is_bidirectional)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

In [None]:
def padding(data):
    """Collate a list of ``(token tensor, label)`` samples into one batch.

    Token tensors are right-padded (with ``pad_sequence``'s default fill
    value) to the longest sequence in the batch and stacked batch-first;
    labels are gathered into a float tensor.

    Returns
    -------
    tuple
        ``(padded_tokens, labels)``
    """
    sequences, labels = zip(*data)

    batch_tokens = pad_sequence(sequences, batch_first=True)
    batch_labels = torch.Tensor(labels)

    return (batch_tokens, batch_labels)

In [None]:
batch_size = 16

In [None]:
def calculate_accuracy(y_pred, y_true):
    """Percentage of predictions that match the labels at a 0.5 threshold.

    Parameters
    ----------
    y_pred : torch.Tensor
        Predicted probabilities, one per sample; ``(batch,)`` and
        ``(batch, 1)`` are both accepted.
    y_true : torch.Tensor
        Ground-truth labels, one per sample.

    Returns
    -------
    float
        Accuracy in the range 0-100.

    Raises
    ------
    ValueError
        If the two tensors do not hold the same number of elements.
    """
    # Flatten both sides first: comparing (batch, 1) with (batch,) would
    # broadcast to a (batch, batch) matrix and inflate the correct count.
    predicted = (y_pred.reshape(-1) > 0.5).long()
    expected = y_true.reshape(-1)
    if predicted.numel() != expected.numel():
        raise ValueError(
            f"y_pred has {predicted.numel()} elements but y_true has {expected.numel()}"
        )

    n_correct = torch.eq(predicted, expected).sum().item()

    accuracy = (n_correct / expected.numel()) * 100

    return accuracy

In [None]:
# Number of passes over the training split.
num_epochs = 100

for epoch in range(1, num_epochs + 1):

    # running means over the batches of this epoch
    running_loss = 0
    running_loss_v = 0
    running_acc = 0
    running_acc_v = 0

    # ---- training pass -------------------------------------------------
    dataset.set_split(split="train")
    # shuffle so every epoch sees the training batches in a new order
    data_gen = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=padding)
    model.train()
    for batch_index, (x, y) in enumerate(data_gen, 1):

        optimizer.zero_grad()

        # (batch, 1) -> (batch,). squeeze(1) rather than squeeze(): a final
        # batch of size 1 must stay 1-D to match y.
        out = model(x).squeeze(1)

        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        # incremental mean: m_k = m_{k-1} + (x_k - m_{k-1}) / k
        running_loss += (loss.item() - running_loss) / batch_index

        accuracy = calculate_accuracy(out, y)
        running_acc += (accuracy - running_acc) / batch_index

        # NOTE: the debugging `break` that used to sit here stopped the epoch
        # after the first batch, so the model only ever saw 16 samples.

    # ---- validation pass -----------------------------------------------
    dataset.set_split(split="test")
    data_gen = DataLoader(dataset, batch_size=batch_size, collate_fn=padding)
    model.eval()
    for batch_index, (x, y) in enumerate(data_gen, 1):

        with torch.no_grad():
            out = model(x).squeeze(1)
            loss = criterion(out, y)

        running_loss_v += (loss.item() - running_loss_v) / batch_index

        accuracy = calculate_accuracy(out, y)
        running_acc_v += (accuracy - running_acc_v) / batch_index

    print(f"epoch: {epoch}")
    print(f"\t train loss: {running_loss:.2f} | accuracy: {running_acc:.2f}")
    print(f"\t val loss: {running_loss_v:.2f} | accuracy: { running_acc_v:.2f}")