In [1]:
import glob
import json
import os
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import requests

from maleo.wizard import Wizard
from tqdm import tqdm

In [7]:
class BuzzerFeatures():
    """Build one feature record per user from per-user tweet JSON files.

    Each file matched by ``data_path`` is expected to hold a dict with an
    ``error_code`` entry and a ``tweets`` list. Records are accumulated in
    ``self.feat`` as plain dicts whose keys are listed in ``FEATURE_KEYS``.
    """

    # Keys shared by every record in ``self.feat`` (real records and the
    # all-None placeholders written for files that carry an error code).
    FEATURE_KEYS = ('username', 'name', 'desc', 'tweets', 'n_tweet',
                    'quoted_tweets', 'hashtag', 'n_tweet_use_hashtag',
                    'ratio_tweets_use_hashtag', 'n_photo', 'n_video', 'content_url')

    def __init__(self, data_path, profile_data_path):
        """Store the input locations.

        Args:
            data_path: glob pattern matching the per-user JSON files.
            profile_data_path: path of the JSON file listing user profiles
                (entries with ``screen_name``, ``name`` and ``description``).
        """
        self.data_path = data_path
        self.profile_data_path = profile_data_path
        self.feat = []
        # Status of the most recently checked file. Defined here so the
        # attribute exists even when the glob matches no file.
        self.error_code = False


    def read_json(self, path):
        """Return the parsed content of the JSON file at ``path``."""
        # Explicit UTF-8: the platform default encoding is not reliable for
        # tweet text.
        with open(path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def write_json(self, path, data):
        """Serialise ``data`` as JSON into the file at ``path``."""
        with open(path, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile)


    def separate_tweets(self, user_data):
        """Split the entries under ``user_data['tweets']`` into two text lists.

        The first list holds the ``full_text`` of every tweet. The second
        holds the ``full_text`` of the quoted status for each tweet that
        quotes another tweet.

        Returns:
            (list_tweets, list_quoted_tweets)
        """
        list_tweets, list_quoted_tweets = [], []

        for twt in user_data['tweets']:
            list_tweets.append(twt['full_text'])
            quoted = twt.get('quoted_status')
            # Only tweets that actually carry a quoted status contribute here.
            if isinstance(quoted, dict) and 'full_text' in quoted:
                list_quoted_tweets.append(quoted['full_text'])
        return list_tweets, list_quoted_tweets


    def get_all_hashtag(self, list_tweets):
        """Collect the hashtags found in ``list_tweets`` via ``Wizard.get_hashtag``.

        Returns:
            (n_twt_use_hashtag, all_hashtag): the length of the ``'Hashtag'``
            column returned by the wizard, and the concatenation of its entries.
            NOTE(review): presumably that column has one entry per tweet that
            contains a hashtag - confirm against the maleo documentation.
        """
        all_hashtag = []

        wiz = Wizard()
        twt_hashtag = wiz.get_hashtag(pd.Series(list_tweets))['Hashtag']
        n_twt_use_hashtag = len(twt_hashtag)

        for tags in twt_hashtag:
            all_hashtag.extend(tags)
        return n_twt_use_hashtag, all_hashtag


    def hashtag_related_feat(self, list_tweets):
        """Return ``(all_hashtag, n_twt_use_hashtag, ratio)`` for a user's tweets.

        ``ratio`` is ``n_twt_use_hashtag / len(list_tweets)``, or 0 when there
        is nothing to divide.
        """
        n_twt_use_hashtag, all_hashtag = self.get_all_hashtag(list_tweets)

        if n_twt_use_hashtag != 0 and list_tweets:
            ratio = n_twt_use_hashtag / len(list_tweets)
        else:
            ratio = 0
        return all_hashtag, n_twt_use_hashtag, ratio


    def get_desc(self, filename, user_data, username_desc):
        """Derive the username from ``filename`` and look up its description.

        The username is the file name without directory and extension. Names
        that do not start with ``'@'`` are looked up in ``username_desc``
        (``screen_name -> (name, description)``); names starting with ``'@'``
        take the description from ``user_data['description']``.

        Returns:
            (username, desc). ``desc`` falls back to
            ``user_data.get('description')`` (possibly None) when the username
            is missing from ``username_desc`` instead of raising.
        """
        username = os.path.splitext(os.path.basename(filename))[0]
        if not username.startswith('@'):
            profile = username_desc.get(username)
            if profile is not None:
                desc = profile[1]
            else:
                desc = user_data.get('description')
        else:
            desc = user_data['description']
        return username, desc


    def get_media_and_url(self, data):
        """Return ``(media_type, url_link)`` for a single tweet dict.

        Tweets that quote another tweet yield ``(None, None)``. Otherwise
        ``media_type`` is the type of the first attached media item (None if
        there is none) and ``url_link`` is the first expanded URL, which is
        skipped when the media is a photo.
        """
        media_type = None
        url_link = None

        if 'quoted_status' not in data:
            try:
                media_type = data['extended_entities']['media'][0]['type']
            except (KeyError, IndexError, TypeError):
                # No media attached to this tweet.
                pass
            if media_type != 'photo':
                urls = (data.get('entities') or {}).get('urls') or []
                if urls:
                    url_link = urls[0]['expanded_url']
        return media_type, url_link


    def get_youtube_title(self, url_link):
        """Return the title of the YouTube video behind ``url_link``.

        The video id is the last path segment of the link (query string and
        fragment are ignored); the title is fetched from YouTube's oEmbed
        endpoint.

        Raises:
            OSError (including urllib's URLError/HTTPError and timeouts) on
            network failure; ValueError/KeyError on an unexpected response.
        """
        path = urllib.parse.urlparse(url_link).path
        video_id = path.rstrip('/').split('/')[-1]

        params = {"format": "json", "url": "https://www.youtube.com/watch?v=%s" % video_id}
        url = "https://www.youtube.com/oembed" + "?" + urllib.parse.urlencode(params)

        # Bounded wait so one unresponsive request cannot hang the extraction.
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read().decode())
        return data['title']


    def _is_youtube_short_link(self, url_link):
        """Tell whether ``url_link`` points at the ``youtu.be`` host."""
        parsed = urllib.parse.urlparse(url_link)
        # Scheme-less links ("youtu.be/<id>") end up entirely in ``path``.
        host = parsed.netloc or parsed.path.split('/')[0]
        return host.lower() == 'youtu.be'


    def extract_url_title(self, data):
        """Return ``(media_type, content_url)`` for a single tweet dict.

        ``content_url`` is None when the tweet has no usable link, the video
        title for ``youtu.be`` links, and the link itself otherwise. If the
        title lookup fails, the raw link is kept.
        """
        media_type, url_link = self.get_media_and_url(data)

        if url_link is None:
            content_url = None
        elif self._is_youtube_short_link(url_link):
            try:
                content_url = self.get_youtube_title(url_link)
            except (OSError, ValueError, KeyError):
                # Removed/private video or network issue: keep the link
                # rather than aborting the run.
                content_url = url_link
        else:
            content_url = url_link
        return media_type, content_url


    def summary_media_content(self, user_data):
        """Summarise media usage over a list of tweet dicts.

        Args:
            user_data: iterable of tweet dicts (the caller passes
                ``user_data['tweets']``).

        Returns:
            (n_photo, n_video, content_url): counts of tweets whose first
            media item is a photo / a video, and the list of non-None values
            produced by ``extract_url_title``. All three are None when there
            are no tweets.
        """
        media_content = [self.extract_url_title(twt) for twt in user_data]

        if media_content != []:
            media_type, content_url = zip(*media_content)
            n_photo = media_type.count('photo')
            n_video = media_type.count('video')
            content_url = [item for item in content_url if item is not None]
        else:
            n_photo, n_video, content_url = None, None, None
        return n_photo, n_video, content_url


    def _load_username_desc(self):
        """Map ``screen_name -> (name, description)`` from the profile file."""
        profile_id = self.read_json(self.profile_data_path)
        return {user['screen_name']: (user['name'], user['description'])
                for user in profile_id}


    def feature_extraction(self):
        """Rebuild ``self.feat`` from every file matched by ``self.data_path``.

        Files whose ``error_code`` is not ``'none'`` contribute an all-None
        placeholder record (see ``error_code_checker``); every other file
        contributes one record with the keys in ``FEATURE_KEYS``.
        """
        self.feat = []
        # Loaded on first use and shared by all users: the profile file does
        # not depend on the user being processed.
        username_desc = None

        # Sorted so the record order does not depend on directory listing order.
        for filename in tqdm(sorted(glob.glob(self.data_path))):
            user_data = self.read_json(filename)

            # Checker
            self.error_code = self.error_code_checker(user_data)
            if self.error_code:
                continue

            if username_desc is None:
                username_desc = self._load_username_desc()

            # Separate tweets
            list_tweets, list_quoted_tweets = self.separate_tweets(user_data)
            # Extract hashtag related features
            all_hashtag, n_twt_use_hashtag, ratio = self.hashtag_related_feat(list_tweets)
            # Get username description
            username, desc = self.get_desc(filename, user_data, username_desc)
            # Get summary of media content
            n_photo, n_video, content_url = self.summary_media_content(user_data['tweets'])

            # Display name: profile file first, then the first tweet's author.
            profile = username_desc.get(username)
            if profile is not None:
                name = profile[0]
            else:
                try:
                    name = user_data['tweets'][0]['user']['name']
                except (IndexError, KeyError, TypeError):
                    name = None

            # Output
            out = {'username': username,
                   'name': name,
                   'desc': desc,
                   'tweets': list_tweets,
                   'n_tweet': len(list_tweets),
                   'quoted_tweets': list_quoted_tweets,
                   'hashtag': all_hashtag,
                   'n_tweet_use_hashtag': n_twt_use_hashtag,
                   'ratio_tweets_use_hashtag': ratio,
                   'n_photo': n_photo,
                   'n_video': n_video,
                   'content_url': content_url}
            self.feat.append(out)


    def data_preprocessing(self, data):
        """Clean text with the maleo ``Wizard`` and return it as a list of strings.

        ``data`` is wrapped in a ``pd.Series`` and passed, in order, through
        ``rm_link``, ``emoji_to_word``, ``rm_char``, ``rm_punc``, a strip,
        ``rm_multiple_space`` and lower-casing.
        """
        wiz = Wizard()
        data = pd.Series(data)
        out = wiz.rm_link(data)
        out = wiz.emoji_to_word(out)
        out = wiz.rm_char(out)
        out = wiz.rm_punc(out)
        out = out.astype(str).str.strip()
        out = wiz.rm_multiple_space(out)
        out = out.apply(str.lower)
        return out.tolist()


    def _is_error_record(self, user_feat):
        """Tell whether ``user_feat`` is an all-None error placeholder."""
        return user_feat.get('tweets') is None


    def features(self, processed=False):
        """Populate ``self.feat``, optionally cleaning its text fields.

        Args:
            processed: when True, ``desc``, ``tweets`` and ``quoted_tweets``
                of every real record are replaced by their cleaned versions;
                placeholder records for failed users are left untouched.
        """
        self.feature_extraction()
        if not processed:
            print('Get raw features')
            return

        for user_feat in tqdm(self.feat):
            # Decide per record: ``self.error_code`` only reflects the last
            # file read, not the record at hand.
            if self._is_error_record(user_feat):
                continue
            if user_feat['desc'] is not None:
                user_feat['desc'] = self.data_preprocessing(user_feat['desc'])[0]
            if user_feat['tweets']:
                user_feat['tweets'] = self.data_preprocessing(user_feat['tweets'])
            if user_feat['quoted_tweets']:
                user_feat['quoted_tweets'] = self.data_preprocessing(user_feat['quoted_tweets'])
        print('Get clean features')


    def error_code_checker(self, user_data):
        """Append a placeholder record when ``user_data`` carries an error code.

        Returns:
            True if ``user_data['error_code']`` is not ``'none'`` (an all-None
            record with the keys in ``FEATURE_KEYS`` is appended to
            ``self.feat``), False otherwise.
        """
        if user_data['error_code'] != 'none':
            # Same keys as a real record so downstream consumers see one schema.
            self.feat.append(dict.fromkeys(self.FEATURE_KEYS))
            return True
        return False

## Feature Extraction and Engineering

In [8]:
# Glob pattern matching the per-user tweet JSON files to process.
data_path = '../data/data_7200/*.json'
# JSON file listing user profiles (screen name, display name, description).
profile_data_path = '../data/profile_id.json'

In [9]:
# Build the extractor; no file is read until features() is called.
buzzer = BuzzerFeatures(data_path, profile_data_path)

In [10]:
# Extract features for every user file, then clean desc/tweets/quoted tweets
# (processed=True); results are stored in buzzer.feat.
buzzer.features(processed=True)

100%|██████████| 10/10 [00:05<00:00,  1.86it/s]
100%|██████████| 10/10 [00:12<00:00,  1.28s/it]

Get clean features





In [13]:
# Persist the list of feature records as JSON.
buzzer.write_json('../data/dataset/buzzer_features.json', buzzer.feat)