# Exploratory Data Analysis

### Background
Tujuan: mencari boundaries yang jelas antara akun buzzer dan non-buzzer.
Kalau pakai pendekatan rule-based, kemungkinan ciri-ciri akun buzzer adalah:

- banyak tweet dari suatu account yang pake hashtag, e.g 80% dari tweetnya pake hashtag
- banyak retweet yang mengandung hashtag atau yang bermuatan politik entah berupa text, gambar atau video
- tidak menggunakan hashtag dan tidak me-retweet, namun sangat sering membuat tweet yang me-mention tokoh-tokoh politik, e.g. 70% dari tweetnya

In [8]:
import re
import os
import pytz
import pickle
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
from collections import Counter

In [2]:
# Register tqdm's pandas integration; this is what makes `progress_apply`
# available on Series/DataFrames in the cells below.
tqdm.pandas()

In [3]:
def read_pickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.
    """
    with open(path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [4]:
# Load the exported account labels into a DataFrame.
d_accounts = pd.read_csv("../data/account_labeled/project_12_labels_Thu_Oct_15_2020.csv")

In [5]:
# Load the two pickled batches (presumably DataFrames of parsed tweets) and
# stack them row-wise into a single frame with a fresh 0..n-1 index.
d_199 = read_pickle("../data/supports/parsed_199.pkl")
d_6725 = read_pickle("../data/supports/parsed_7003.pkl")
d_tweets = pd.concat([d_199, d_6725], axis=0, sort=False, ignore_index=True)

In [9]:
# Keep only the first row seen for each tweet id.
d_tweets = d_tweets.drop_duplicates(subset="id_tweet")

In [None]:
# Report the number of rows in d_tweets, formatted with thousands separators.
print(f'Total tweets: {d_tweets.shape[0]:,}')

In [None]:
# Positional rename of the columns; this assumes the CSV has exactly three
# columns in this order — TODO confirm against the label export.
d_accounts.columns = ["id", "text", "label"]

In [None]:
# (rows, columns) of the label table before any filtering.
d_accounts.shape

In [None]:
# Pie chart of how the labels are distributed, written to the figures folder.
label_counts = d_accounts.label.value_counts()
label_counts.plot.pie(autopct='%1.1f')
plt.savefig("../data/figures/label_percentage.png")

In [None]:
# Drop accounts labelled 'inactive'. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not operate
# on a slice of the original (which triggers pandas' SettingWithCopyWarning).
d_accounts = d_accounts[d_accounts.label != 'inactive'].copy()

In [None]:
# (rows, columns) again, to see how many accounts remain after the filter.
d_accounts.shape

In [None]:
# Label distribution as a pie chart for the current d_accounts (not saved to disk).
d_accounts.label.value_counts().plot.pie(autopct='%1.1f')

In [None]:
def _screenname_from_text(text):
    """Return the part after the last '/' of the first whitespace-separated token."""
    first_token = text.split()[0]
    return first_token.split('/')[-1]


# presumably `text` starts with the account's profile URL — verify against the data
d_accounts['screenname'] = d_accounts.text.apply(_screenname_from_text)

In [None]:
# Display the label table, now including the derived `screenname` column.
d_accounts

In [None]:
# Preview the first rows of the tweet table.
d_tweets.head()

In [None]:
# Jakarta time zone, used below to convert the tweet timestamps to local time.
utc7 = pytz.timezone("Asia/Jakarta")

In [None]:
# Parse the `created_at` strings (e.g. "Thu Oct 15 07:00:00 +0000 2020") in a
# single vectorized call instead of running strptime once per row in Python.
# utc=True yields one tz-aware UTC datetime column.
d_tweets["created_at_format"] = pd.to_datetime(
    d_tweets.created_at, format="%a %b %d %H:%M:%S %z %Y", utc=True)

In [None]:
# Convert the tz-aware timestamps to Jakarta local time using the vectorized
# .dt accessor rather than calling tz_convert on every element.
d_tweets["created_at_format"] = d_tweets.created_at_format.dt.tz_convert(utc7)

### calculate average tweets per day given an account

In [None]:
# Mean number of tweets per day (counting only days on which the account
# tweeted) for every labelled account.
# The daily counts are computed once for all accounts and then looked up by
# screenname, instead of re-filtering the whole tweet table for each account.
tweet_day = d_tweets.created_at_format.dt.date
daily_counts = d_tweets.groupby(["screen_name", tweet_day])["id_tweet"].count()
mean_per_account = daily_counts.groupby(level=0).mean()
# Accounts with no tweets in d_tweets end up as NaN.
d_accounts["mean"] = d_accounts.screenname.map(mean_per_account)

In [None]:
# Draw and save in the same cell: with the inline notebook backend a figure is
# closed when its cell finishes, so calling plt.savefig in a later cell writes
# an empty image.
plt.figure(figsize=(5, 8))
sns.boxplot(x='label', y='mean', data=d_accounts)
plt.savefig("../data/figures/average-tweets-per-account.png")

### calculate median tweets per day given an account

In [None]:
# Median number of tweets per day (counting only days on which the account
# tweeted) for every labelled account.
# The daily counts are computed once for all accounts and then looked up by
# screenname, instead of re-filtering the whole tweet table for each account.
tweet_day = d_tweets.created_at_format.dt.date
daily_counts = d_tweets.groupby(["screen_name", tweet_day])["id_tweet"].count()
median_per_account = daily_counts.groupby(level=0).median()
# Accounts with no tweets in d_tweets end up as NaN.
d_accounts["median"] = d_accounts.screenname.map(median_per_account)

In [None]:
# Box plot of the per-account median, by label. Saved under its own file name;
# the previous name was the one used for the mean plot, so it overwrote it.
plt.figure(figsize=(5, 8))
sns.boxplot(x='label', y='median', data=d_accounts)
plt.savefig("../data/figures/median-tweets-per-account.png")

### calculate average hashtags per account

In [None]:
# Mean number of hashtags per tweet for every labelled account.
# Hashtag counts are aggregated once per screen_name and then looked up,
# instead of scanning the whole tweet table again for each account.
hashtags_per_tweet = d_tweets.hashtags.apply(len)
mean_hashtags = hashtags_per_tweet.groupby(d_tweets.screen_name).mean()
# Accounts with no tweets in d_tweets end up as NaN.
d_accounts["average_hashtag"] = d_accounts.screenname.map(mean_hashtags)

In [None]:
# Box plot of hashtags-per-tweet by label, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(4, 8))
sns.boxplot(data=d_accounts, x='label', y='average_hashtag', ax=ax)
fig.savefig("../data/figures/average_hashtag_per_user.png")

### average retweet ratio per account

In [None]:
# Share of each account's tweets whose text starts with "RT".
# The flag is computed once for all tweets and averaged per screen_name,
# instead of scanning the whole tweet table again for each account.
# astype(float) turns True/False into 1.0/0.0 and keeps missing text as NaN,
# which the mean then skips.
is_retweet = d_tweets.full_text.str.contains('^RT', regex=True).astype(float)
rt_share = is_retweet.groupby(d_tweets.screen_name).mean()
# Accounts with no tweets in d_tweets end up as NaN.
d_accounts["average_rt"] = d_accounts.screenname.map(rt_share)

In [None]:
# Box plot of the retweet share by label.
# The y column must match the name assigned above ("average_rt"); the previous
# spelling "avergae_rt" referred to a column that does not exist.
plt.figure(figsize=(3, 5))
sns.boxplot(x="label", y="average_rt", data=d_accounts)
plt.savefig("../data/figures/average_rt_per_account.png")

### berapa banyak tweet yang pake hashtag

In [None]:
# Preview the account table with the per-account columns added so far.
d_accounts.head()

In [None]:
# Share of each account's tweets that carry at least one hashtag.
# The flag is computed once for all tweets and averaged per screen_name,
# instead of scanning the whole tweet table again for each account.
has_hashtag = d_tweets.hashtags.apply(len) > 0
hashtag_share = has_hashtag.groupby(d_tweets.screen_name).mean()
# Accounts with no tweets in d_tweets end up as NaN.
d_accounts["average_tweet_contain_hashtag"] = d_accounts.screenname.map(hashtag_share)

In [None]:
# Box plot of the share of hashtagged tweets by label, on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(5, 8))
sns.boxplot(data=d_accounts, x='label', y='average_tweet_contain_hashtag', ax=ax)
fig.savefig("../data/figures/average_tweet_contain_hashtag")

In [None]:
# One list of hashtag texts per tweet (each hashtag entry is indexed by "text").
hashtag_list = [[tag["text"] for tag in tags] for tags in d_tweets.hashtags]

In [None]:
# Flatten the per-tweet lists, count each hashtag, and tabulate the counts.
hashtag_list_flat = list(chain.from_iterable(hashtag_list))
hashtag_dict = dict(Counter(hashtag_list_flat))
d_hashtag = pd.DataFrame(
    {
        "hashtag": list(hashtag_dict),
        "frequency": list(hashtag_dict.values()),
    }
)

In [None]:
# Persist the hashtag frequency table (without the row index).
d_hashtag.to_csv("../data/supports/hashtag.csv", index=False)

In [None]:
# Show the hashtags from most to least frequent (display only; d_hashtag itself is not reordered).
d_hashtag.sort_values(by="frequency", ascending=False)

In [None]:
# NOTE(review): the variable is called `screenname_buzzer`, but the filter keeps
# the accounts labelled "non-buzzer" — confirm which group is intended here.
screenname_buzzer = d_accounts.loc[d_accounts.label=="non-buzzer", "screenname"].tolist()

In [None]:
# Same per-tweet hashtag lists, restricted to tweets from the selected accounts.
selected_rows = d_tweets.screen_name.isin(screenname_buzzer)
hashtag_list = [
    [tag["text"] for tag in tags]
    for tags in d_tweets.loc[selected_rows, "hashtags"]
]

In [None]:
# Rebuild the hashtag frequency table from the current (subset) hashtag_list.
hashtag_list_flat = list(chain.from_iterable(hashtag_list))
hashtag_dict = dict(Counter(hashtag_list_flat))
d_hashtag = pd.DataFrame(
    {
        "hashtag": list(hashtag_dict),
        "frequency": list(hashtag_dict.values()),
    }
)

In [None]:
# Show the subset's hashtags from most to least frequent (display only).
d_hashtag.sort_values(by="frequency", ascending=False)