In [1]:
import os
import re
import json
import glob
import pickle
import string
import itertools
import collections
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline

In [2]:
tqdm.pandas()

In [3]:
pd.set_option('display.max_rows', 10)

In [4]:
# input path
SUPPORT_PATH = "../data/supports"

In [5]:
def read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    handle = open(path, 'rb')
    try:
        loaded = pickle.load(handle)
    finally:
        # Always release the file handle, even when unpickling fails.
        handle.close()
    return loaded

In [None]:
# Load the two parsed tweet dumps and stack them into one frame.
# NOTE(review): `d_6725` is read from "parsed_7003.pkl" — the variable name and
# the file name disagree; confirm which count is current.
d_199 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_199.pkl"))
d_6725 = read_pickle(os.path.join(SUPPORT_PATH, "parsed_7003.pkl"))

# Keep the first occurrence of every tweet id, then renumber the rows from 0.
d_tweets = (
    pd.concat([d_199, d_6725], axis=0, sort=False)
    .drop_duplicates(subset="id_tweet")
    .reset_index(drop=True)
)

In [None]:
d_label = pd.read_csv("../data/account_labeled/label_updated.csv")

In [None]:
d_label.head()

In [None]:
d_label.shape

In [None]:
d_tweets.head()

In [None]:
d_tweets.shape

In [None]:
# Discard accounts that have no label assigned.
d_label = d_label[d_label.label.notna()]

In [None]:
d_label.head()

In [None]:
d_label.label.value_counts()

In [None]:
# 2797 accounts are labeled, the rest are excluded before parsing
d_tweets[d_tweets.screen_name.isin(d_label.screen_name)].screen_name.unique().shape

In [None]:
# One row per account: collect every tweet text of a screen_name into a list,
# then attach that account's columns from d_label (accounts without a match
# keep NaN in the joined columns).
d_dataset = (
    d_tweets.groupby('screen_name')['full_text']
    .apply(list)
    .reset_index()
    .join(d_label.set_index('screen_name'), on='screen_name')
)

In [None]:
def get_profile_key(screen_name, key, profile_dir="../data/profile"):
    """Look up one field in an account's saved profile JSON.

    Parameters
    ----------
    screen_name : str
        Account handle; the profile is read from
        ``<profile_dir>/<screen_name>.json``.
    key : str
        Top-level key to fetch from the profile object.
    profile_dir : str, optional
        Directory holding the per-account JSON files. Defaults to the
        ``../data/profile`` location this notebook reads from.

    Returns
    -------
    object or None
        The value stored under ``key``. ``None`` when the file does not exist,
        when the JSON root is not an object, or when the key is absent.

    Raises
    ------
    json.JSONDecodeError
        If the file exists but does not contain valid JSON.
    """
    profile_path = os.path.join(profile_dir, f"{screen_name}.json")
    if not os.path.exists(profile_path):
        return None

    # JSON text is UTF-8; do not rely on the platform's default encoding,
    # which can mis-decode non-ASCII profile descriptions.
    with open(profile_path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        return data.get(key)
    return None

In [None]:
# Pull the `verified` field from each account's saved profile JSON.
# The value is None when get_profile_key finds no profile file or no such key.
d_dataset["is_verified"] = d_dataset.screen_name.progress_apply(get_profile_key, args=('verified', ))

In [None]:
d_dataset.is_verified.value_counts()

In [None]:
# Pull the `description` field (the profile bio) from each account's saved profile JSON.
# The value is None when get_profile_key finds no profile file or no such key.
d_dataset["profile_description"] = d_dataset.screen_name.progress_apply(get_profile_key, args=('description', ))

In [None]:
def check_akun_resmi(description):
    """Tell whether a profile description declares an official account.

    A description matches when, case-insensitively, it contains "akun resmi"
    or "akun <anything> resmi" (e.g. "akun twitter resmi").

    Parameters
    ----------
    description : str or None
        Profile description text. Any non-string value (None, NaN, ...) is
        treated as "no description".

    Returns
    -------
    bool
        True when the description matches, False otherwise.
    """
    # NaN is truthy, so a plain truthiness test would let it through and then
    # fail on `.lower()`; require an actual non-empty string instead.
    if not isinstance(description, str) or not description:
        return False
    # The optional group keeps every match of the old "akun .* resmi" pattern
    # and additionally accepts the direct phrase "akun resmi".
    return re.search(r"akun (?:.* )?resmi", description.lower()) is not None

In [None]:
d_dataset["is_akun_resmi"] = d_dataset.profile_description.apply(check_akun_resmi)

In [None]:
# Number of collected tweets per account (length of each full_text list).
d_dataset['num_tweets'] = d_dataset.full_text.map(len)

In [None]:
d_dataset2 = d_dataset[d_dataset.num_tweets >= 20]

In [None]:
d_dataset.shape

In [None]:
d_dataset2.shape

In [None]:
# Build one document per account from at most the first 30 entries of its tweet list.
# `assign` returns a new frame, so the column is not written onto a filtered
# slice of `d_dataset` (which raises pandas' SettingWithCopyWarning).
d_dataset2 = d_dataset2.assign(
    text_used=d_dataset2.full_text.apply(lambda x: " ".join(x[:30]))
)

In [None]:
d_dataset2.shape

In [None]:
# Keep only accounts whose profile says verified == False and whose description
# did not match the official-account ("akun ... resmi") check.
# NOTE(review): `is_verified` is None when no profile value was found, and
# `None == False` evaluates to False, so those accounts are dropped here as
# well — confirm that is intended.
d_dataset2 = d_dataset2[(d_dataset2.is_verified == False) & (d_dataset2.is_akun_resmi == False)]

In [None]:
d_dataset2.shape

In [None]:
d_train = d_dataset2[d_dataset2.label.notna()]

In [None]:
# Renumber rows from 0. Rebinding (instead of inplace=True) gives d_train its own
# frame rather than mutating a filtered slice of d_dataset2, so the column
# assignments made on d_train later do not trigger SettingWithCopyWarning.
d_train = d_train.reset_index(drop=True)

In [None]:
d_train.shape

In [None]:
# Punctuation that is replaced by a space. '<' and '>' are deliberately absent,
# so placeholder tokens such as <LINK> would survive cleansing.
_PUNCTUATION = '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~'
_PUNCT_TO_SPACE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))


def text_cleansing(title):
    """Normalize raw tweet text into lowercase, space-separated words.

    Steps, in order: drop hyperlinks and @mentions, split camelCase words
    (so "#SaveIndonesia" becomes "save indonesia"), lowercase, replace
    punctuation with spaces, drop any remaining character that is not a word
    character or '<' / '>', and collapse whitespace.

    Parameters
    ----------
    title : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text with no leading or trailing whitespace.
    """
    # Remove links and mentions BEFORE the camelCase split: otherwise a
    # mixed-case URL or handle ("t.co/aBcD", "@UserName") is broken at the
    # inserted spaces and its tail leaks into the text as bogus tokens.
    # `https?` also covers plain http:// links, which the old pattern missed.
    title = re.sub(r'https?://\S+', ' ', title, flags=re.IGNORECASE)
    title = re.sub(r'@\w+', ' ', title)
    # split camelCase words (typically hashtags)
    title = re.sub(r"([a-z])([A-Z])", r"\1 \2", title)
    # lowercase
    title = title.lower()
    # replace punctuation with spaces
    title = title.translate(_PUNCT_TO_SPACE)
    # keep only word characters and the '<' / '>' placeholder delimiters
    title = re.sub(r"[^\w<>]", ' ', title)
    # collapse runs of whitespace into a single space
    title = re.sub(r'\s+', ' ', title)
    # trim leading / trailing whitespace
    return title.strip()

In [None]:
d_train["preprocessed_text"] = d_train.text_used.apply(text_cleansing)

In [None]:
d_train.loc[:, 'preprocessed_text_token'] = d_train.preprocessed_text.apply(word_tokenize)

In [None]:
d_train

In [None]:
all_words = itertools.chain.from_iterable(d_train.preprocessed_text_token)

In [None]:
all_words = list(all_words)

In [None]:
vocab_freq = collections.Counter(all_words)

In [None]:
d_vocab_freq = pd.DataFrame(vocab_freq.items(), columns=["word", "freq"]).sort_values("freq", ascending=False)

In [None]:
# Words outside the kept frequency band: fewer than 2 or more than 2500 occurrences.
# The bounds are strict so this list is the exact complement of `vocab_used`,
# which keeps 2 <= freq <= 2500 (`Series.between` includes both endpoints).
# With `<=` / `>=`, words occurring exactly 2 or 2500 times landed in both lists.
stop_words = d_vocab_freq[(d_vocab_freq.freq < 2) | (d_vocab_freq.freq > 2500)].word.to_list()

In [None]:
# Keep words whose corpus frequency lies in [2, 2500]; `between` includes both endpoints.
vocab_used = d_vocab_freq[d_vocab_freq.freq.between(2, 2500)].word.to_list()

In [None]:
# Drop tokens of two characters or fewer and tokens that are purely numeric.
vocab_used2 = [word for word in vocab_used if len(word) > 2 and not word.isnumeric()]

In [None]:
len(vocab_used2)

In [None]:
vocab_used2 = sorted(vocab_used2)

In [None]:
d_train

In [None]:
d_train

In [None]:
# Encode the target: buzzer -> 1 (positive class), non-buzzer -> 0.
# NOTE(review): `map` yields NaN for any label value missing from this dict —
# confirm that `label` only ever holds these two strings.
d_train.loc[:, "label_encoded"] = d_train.label.map({'buzzer': 1, 'non-buzzer': 0})

In [None]:
d_train.label.value_counts()

In [None]:
d_train.label_encoded.value_counts().plot(kind='pie', autopct='%.2f')

#### Creating Count Matrix

In [None]:
d_train

In [None]:
d_train.shape

In [None]:
X_count = np.zeros((len(d_train), len(vocab_used2)))

In [None]:
vocab_used[0]

In [None]:
# Map every vocabulary word to its column once. The previous `list.index` call
# rescanned the whole vocabulary for every single token.
vocab_index = {}
for col, word in enumerate(vocab_used2):
    vocab_index.setdefault(word, col)  # first occurrence wins, like list.index

# Wrap the Series itself (not the enumerate object) so tqdm knows the total.
for idx, token_list in enumerate(tqdm(d_train.preprocessed_text_token)):
    for token in token_list:
        col = vocab_index.get(token)
        # Tokens outside the vocabulary are skipped on purpose; an explicit
        # lookup replaces the bare `except`, which hid every other error too.
        if col is not None:
            X_count[idx, col] += 1

### Experimenting with simple algorithms

In [None]:
# tfidf = TfidfVectorizer()
tfidf = TfidfTransformer()

In [None]:
# text_tfidf = tfidf.fit_transform(d_train.preprocessed_text)
text_tfidf = tfidf.fit_transform(X_count)

In [None]:
text_tfidf.shape

In [None]:
# Split the raw counts first and fit the IDF weights on the training rows only,
# so document frequencies from the held-out accounts do not leak into training.
# The random_state and the number of rows are unchanged, so the row partition
# is the same as when the pre-weighted matrix was split.
# NOTE(review): the vocabulary itself was still selected on the full corpus.
X_count_train, X_count_test, y_train, y_test = train_test_split(
    X_count, d_train.label_encoded, test_size=0.2, random_state=123
)
X_train = tfidf.fit_transform(X_count_train)
X_test = tfidf.transform(X_count_test)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
def scoring(y_test, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The four scores are printed on one line, rounded to two decimals.
    """
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    acc, pre, rec, f1 = (metric(y_test, y_pred) for metric in metrics)

    print(f"accuracy: {acc:.2f} | precision: {pre:.2f} | recall: {rec:.2f} | f score: {f1:.2f}")

In [None]:
# Baseline 1: Bernoulli naive Bayes on the TF-IDF features.
# NOTE(review): per the scikit-learn docs BernoulliNB binarizes its input by
# default (binarize=0.0), so the TF-IDF weights presumably collapse to word
# presence/absence here — confirm this is the intended baseline.
model = BernoulliNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Baseline 2: support vector classifier with a linear kernel.
# NOTE(review): scikit-learn documents `gamma` as the coefficient for the
# rbf/poly/sigmoid kernels, so it presumably has no effect with kernel='linear'.
model = SVC(gamma='scale', kernel='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Baseline 3: random forest with 100 trees.
# Seeded (same seed as the train/test split) so the reported scores are
# reproducible across Restart & Run All.
model = RandomForestClassifier(n_estimators=100, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Baseline 4: gradient boosting with 200 boosting stages.
# Seeded (same seed as the train/test split) so the reported scores are
# reproducible across Restart & Run All.
model = GradientBoostingClassifier(n_estimators=200, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)

In [None]:
# Baseline 5: AdaBoost with 400 estimators.
# Seeded (same seed as the train/test split) so the reported scores are
# reproducible across Restart & Run All.
model = AdaBoostClassifier(n_estimators=400, random_state=123)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
scoring(y_test, y_pred)