In [1]:
import json
import pandas as pd
import glob
from maleo.wizard import Wizard
from tqdm import tqdm

In [2]:
def read_json(path):
    with open(path, 'r') as file:
        return json.load(file)

In [3]:
data_path = '../data/data_7200/*.json'
profile_data_path = '../data/profile_id.json'

## Get media and url link

In [4]:
def get_media_and_url(data):
    media_type = None
    url_link = None
    
    if 'quoted_status' not in data:
        try:
            media_type = data['extended_entities']['media'][0]['type']
        except:
            pass
        if media_type != 'photo' and data['entities']['urls'] != []:
            url_link = data['entities']['urls'][0]['expanded_url']
    return media_type, url_link

In [5]:
def get_youtube_title(url_link):
    import urllib.request
    import json
    import urllib
    
    VideoID = url_link.split('/')[-1] 

    params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % VideoID}
    url = "https://www.youtube.com/oembed"
    query_string = urllib.parse.urlencode(params)
    url = url + "?" + query_string

    with urllib.request.urlopen(url) as response:
        response_text = response.read()
        data = json.loads(response_text.decode())
        return data['title']

In [6]:
def extract_url_title(data):
    media_type, url_link = get_media_and_url(data)
    
    if url_link == None:
        title = None
    elif url_link.split('/')[-2] == 'youtu.be':
        title = get_youtube_title(url_link)
    else:
        title = url_link.split('/')[-1].split('?')[0].split('.')[0]
    
    return media_type, title

In [7]:
def summary_media_content(user_data):
    media_content = [extract_url_title(twt) for twt in user_data]
    
    media_type, title = zip(*media_content)
    n_photo = media_type.count('photo')
    n_video = media_type.count('video')
    title = [item for item in title if item is not None]
    
    return n_photo, n_video, title

## Feature Extraction and Engineering

In [8]:
def separate_tweets(user_data):
    """Data pada key "tweets" terdiri atas independent & dependent tweet.
    
    Independent tweet = tweet yang dibuat sendiri (inspirasi sendiri)
    Dependent tweet = tweet yang mengutip/quote tweet org lain (quoted tweet)"""
    
    list_tweets, list_quoted_tweets = [], []
    
    for twt in user_data['tweets']:
        list_tweets.append(twt['full_text'])
        try:
            list_quoted_tweets.append(twt['quoted_status']['full_text'])
        except:
            pass
    return list_tweets, list_quoted_tweets

In [9]:
def get_all_hashtag(list_tweets):   
    all_hashtag = []
    
    wiz = Wizard()
    twt_hashtag = wiz.get_hashtag(pd.Series(list_tweets))['Hashtag']
    n_twt_use_hashtag = len(twt_hashtag)
    
    for i in twt_hashtag:
        all_hashtag += i
    return n_twt_use_hashtag, all_hashtag

In [10]:
def hashtag_related_feat(list_tweets):
    n_twt_use_hashtag, all_hashtag = get_all_hashtag(list_tweets)
    
    if n_twt_use_hashtag != 0:
        ratio = (n_twt_use_hashtag/len(list_tweets))
    else:
        ratio = 0
    return all_hashtag, n_twt_use_hashtag, ratio

In [11]:
def get_desc(filename, user_data, username_desc):
    username = filename.split('/')[-1][:-5]
    if not username.startswith('@'):
        desc = username_desc.get(username)[1]
    else:
        desc = user_data['description']
    return username, desc

In [12]:
raw_features = []

for filename in tqdm(glob.glob(data_path)[:2]):
    user_data = read_json(filename)
    # Separate tweets
    list_tweets, list_quoted_tweets = separate_tweets(user_data)
    # Extract hashtag related features
    all_hashtag, n_twt_use_hashtag, ratio = hashtag_related_feat(list_tweets)
    # Get username description
    profile_id = read_json(profile_data_path)
    username_desc = {user['screen_name']:(user['name'], user['description']) for user in profile_id}
    username, desc = get_desc(filename, user_data, username_desc)
    # Get summary of media content
    n_photo, n_video, title = summary_media_content(user_data['tweets'])

    
    # Output
    feat = {'username': username, 
            'name': username_desc.get(username)[0],
            'desc': desc,
            'tweets': list_tweets,
            'n_tweet': len(list_tweets),
            'quoted_tweets': list_quoted_tweets,
            'hashtag': all_hashtag,
            'n_tweet_use_hashtag': n_twt_use_hashtag,
            'ratio_tweets_use_hashtag': ratio,
            'n_photo': n_photo,
            'n_video': n_video,
            'content_title': title}
    raw_features.append(feat)

100%|██████████| 2/2 [00:01<00:00,  1.95it/s]


In [13]:
raw_features[0]

{'username': 'GantengPendekar',
 'name': 'Pendekar Ganteng',
 'desc': 'Anak Sulung | penyanyi amatir | pansos',
 'tweets': ['PDIP adalah TARGET\n\n#NewNormalDinodaiKadrun https://t.co/8oQpKJY0R4',
  'Aksi didepan gedung DPR mau menyelamatkan pancasila? Agama minoritas aja ditindas dan dilarang oleh mereka. Wkwkwk https://t.co/cXY2yYvVLA',
  'Pemerintah Pastikan Insentif Kartu Prakerja Cair Pekan Ini https://t.co/W3yYbiTmHY\n\n#TolakProvokasiDitengahPandemi',
  'Soekarno Meninggal Dunia pada Ulang Tahun Jokowi yang ke-9\n\nUtas\n\nOleh: Pical Gadi\n#JagaDanPerkuatPancasila https://t.co/cgrNs9UGJK',
  'Indonesian President Joko to resume working trips next week as country eases restrictions\nhttps://t.co/1WrnrIyyfZ\n\n#JumatBerkah',
  'RUU HIP Haluan Ideologi Pancasila dijegal oleh Kelompok Radikal\n\n#IndonesiaDaruratHumor https://t.co/xBYF6M0kNT',
  'ESENSI PANCASILA ITU GOTONG ROYONG\n\nSebuah thread melawan penyesatan narasi pancasila\n#TokopediaxBTS https://t.co/RPvnB353ii',
  'Di B