In [None]:
import os
import re
import glob
import json
import pickle

from tqdm import tqdm
import datetime as dt
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Output locations for the filtered results; both writes sit behind `if False:` guards at the end of the notebook.
PROFILE_FILTERED_PATH = "../data/supports/profile_filtered.pkl"  # pickled DataFrame
SCREEN_NAME_FILTERED = "screen_name_filtered.json"  # relative path, resolved against the current working directory

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of everything built from `paths` reproducible across runs.
paths = sorted(glob.glob("../data/profile/*"))

In [None]:
# Accumulates the parsed JSON payload of every file that loads successfully.
data_list = []

In [None]:
# Accumulates the paths of files that could not be opened or parsed.
data_error = []

In [None]:
# Load every profile dump into memory.  The accumulators are (re)initialised
# here so that re-running this cell never appends duplicate entries.
data_list = []
data_error = []

for path in tqdm(paths):
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: the path cannot be opened or read (directory, permissions, ...).
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        data_error.append(path)
    else:
        data_list.append(data)

In [None]:
# Number of files that were parsed successfully.
len(data_list)

## filtering

In [None]:
# Keep only payloads that are JSON objects (dicts); anything else that was
# loaded (lists, strings, numbers, null) is not treated as a profile.
profile_list = [entry for entry in data_list if isinstance(entry, dict)]

In [None]:
# Number of loaded payloads that are dicts and are therefore kept as profiles.
len(profile_list)

In [None]:
# Tabulate the profiles: one row per profile dict.
d_profile = pd.DataFrame(profile_list)

In [None]:
# (rows, columns) of the freshly built profile frame.
d_profile.shape

In [None]:
# Profile attributes to keep, listed in the column order of the resulting frame.
cols = [
    # Fields used directly by the analysis below come first.
    'screen_name', 'name', 'description',
    'friends_count', 'followers_count', 'statuses_count', 'favourites_count',
    'protected', 'lang', 'created_at',
    # Remaining profile attributes.
    'blocked_by', 'blocking', 'contributors_enabled',
    'default_profile', 'default_profile_image',
    'entities', 'follow_request_sent', 'following',
    'geo_enabled', 'has_extended_profile',
    'id', 'id_str', 'is_translation_enabled', 'is_translator',
    'listed_count', 'live_following', 'location', 'muting', 'notifications',
    'profile_background_color', 'profile_background_image_url',
    'profile_background_image_url_https', 'profile_background_tile',
    'profile_banner_url', 'profile_image_url', 'profile_image_url_https',
    'profile_link_color', 'profile_location',
    'profile_sidebar_border_color', 'profile_sidebar_fill_color',
    'profile_text_color', 'profile_use_background_image',
    'status', 'time_zone', 'translator_type',
    'url', 'utc_offset', 'verified', 'withheld_in_countries',
]

In [None]:
# Keep only the columns of interest.  The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not trigger
# SettingWithCopyWarning on a frame derived from another one.
d_profile = d_profile[cols].copy()

In [None]:
# Map three-letter English month abbreviations to zero-padded month numbers
# ("Jan" -> "01", ..., "Dec" -> "12").
month_dict = {
    abbr: format(number, "02d")
    for number, abbr in enumerate(
        ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        start=1,
    )
}

In [None]:
def parse_created(date):
    """Convert a ``created_at`` timestamp string into a date-only ``datetime``.

    Parameters
    ----------
    date : str
        Timestamp text, expected to look like
        ``"Sat Jun 13 08:15:30 +0000 2020"`` (weekday, month abbreviation,
        two-digit day, clock time, UTC offset, four-digit year).

    Returns
    -------
    datetime.datetime
        Naive datetime at 00:00:00 on the parsed calendar day; the clock time
        and UTC offset of the input are discarded.

    Raises
    ------
    TypeError
        If ``date`` is not a string (for example NaN for a missing value).
    AttributeError
        If no day, month or year token can be found in the text.
    KeyError
        If the month abbreviation is not a key of ``month_dict``.
    """
    # Raw strings are required: "\+" in a plain literal is an invalid escape
    # sequence (DeprecationWarning, and a SyntaxWarning on Python 3.12+).
    date = re.sub(r" \+[0-9]{0,4}", "", date)               # strip the UTC offset
    date = re.sub(r"^[a-zA-Z]{3} ", "", date)               # strip the leading weekday
    date = re.sub(r"[0-9]{2}:[0-9]{2}:[0-9]{2}", "", date)  # strip the clock time

    # What remains is "<Mon> <dd>  <yyyy>"; the first two-digit run is the day.
    tgl = re.search(r"[0-9]{2}", date).group()                 # day
    bln = month_dict[re.search(r"[A-Za-z]{3}", date).group()]  # month number
    thn = re.search(r"[0-9]{4}", date).group()                 # year

    return datetime.strptime(tgl + "-" + bln + "-" + thn, "%d-%m-%Y")

In [None]:
# Reference date (13 June 2020, midnight) against which account age is measured.
date_gathered = datetime(year=2020, month=6, day=13)

In [None]:
# Calendar date on which each account was created.
d_profile["created_at_format"] = d_profile["created_at"].apply(parse_created)

# Account age in whole days, measured up to the gathering date.
d_profile["days_duration"] = d_profile["created_at_format"].apply(
    lambda created: (date_gathered - created).days
)

In [None]:
# Distribution of account age in days.
# NOTE(review): distplot is deprecated in recent seaborn releases — switch to
# histplot/displot once the installed seaborn version is confirmed.
fig = sns.distplot(d_profile["days_duration"].values)

In [None]:
# fig.figure.savefig("days_duration.png")

In [None]:
# Total activity per account: statuses plus favourites.
d_profile["activity_count"] = d_profile["statuses_count"].add(d_profile["favourites_count"])

In [None]:
# Average activity per day of account age.
# NOTE(review): a days_duration of 0 would give inf here — confirm that no
# account was created on the gathering date.
d_profile["activity_perday"] = d_profile["activity_count"].div(d_profile["days_duration"])

In [None]:
# Summary statistics of the derived activity features, before any rows are dropped.
d_profile[["activity_count", "activity_perday", "days_duration"]].describe()

## get hashtag tweets

## cleansing

Based on absolute activity: drop accounts with fewer than 10 activities (statuses + favourites) in total.

In [None]:
# Remove accounts with fewer than 10 recorded activities.
barely_active = d_profile["activity_count"] < 10
d_profile.drop(index=d_profile.loc[barely_active].index, inplace=True)

Based on activity and account age: drop accounts older than 5 years that have fewer than 2,000 activities in total.

In [None]:
# Remove long-lived but quiet accounts: older than five years with fewer than
# 2000 activities.  drop() only needs the row labels, so their order is irrelevant.
old_and_quiet = (d_profile["activity_count"] < 2000) & (d_profile["days_duration"] > 365 * 5)
d_profile.drop(index=d_profile.loc[old_and_quiet].index, inplace=True)

In [None]:
# (rows, columns) remaining after the two activity-based drops above.
d_profile.shape

In [None]:
# Summary statistics of the activity features after the activity-based drops.
d_profile[["activity_count", "activity_perday", "days_duration"]].describe()

In [None]:
# (rows, columns) just before the friends/followers-based drop in the next cell.
d_profile.shape

In [None]:
# Remove accounts with more than 100 friends and more than 100 followers that
# are older than five years yet average fewer than 10 activities per day.
well_connected = (d_profile["friends_count"] > 100) & (d_profile["followers_count"] > 100)
long_lived = d_profile["days_duration"] > 365 * 5
slow_paced = d_profile["activity_perday"] < 10
d_profile.drop(
    index=d_profile.loc[well_connected & long_lived & slow_paced].index,
    inplace=True,
)

Save to pickle (disabled: change `if False` to `if True` to write the file)

In [None]:
if False:  # flip to True to persist the filtered frame
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling fails part-way through.
    with open(PROFILE_FILTERED_PATH, 'wb') as f:
        pickle.dump(d_profile, f)

Save screen_name to JSON (disabled: change `if False` to `if True` to write the file)

In [None]:
if False:  # flip to True to export the screen names of the remaining accounts
    payload = {"screen_name": d_profile["screen_name"].tolist()}
    with open(SCREEN_NAME_FILTERED, 'w') as f:
        json.dump(payload, f)