## Data gathering using multiple API tokens

- Get the accounts each user follows ("following")
- Get user profiles
- Get tweets by hashtag
- Get user timelines

In [1]:
import os
import glob
import json
import time
import pickle

import tweepy
import pandas as pd
import concurrent.futures
from tqdm import tqdm, notebook

from itertools import compress 
from datetime import datetime

In [2]:
os.cpu_count()

8

In [3]:
# Root folders (relative to the notebook) for collected data and API credentials.
data_dir_path, key_dir_path = '../data', '../keys'

In [4]:
# Every file in the key directory, with Windows separators normalised to "/".
key_paths = [
    found.replace('\\', '/')
    for found in glob.glob(os.path.join(key_dir_path, '*'))
]

In [5]:
daftar_buzzer = os.path.join(data_dir_path, 'supports', 'list_of_buzzer.csv')

In [6]:
class Friends():
    """Collect Twitter data through tweepy using a pool of API tokens.

    One ``tweepy.API`` client is built per credential file. ``api_statuses``
    holds one flag per client: ``True`` means the token is free, ``False``
    means a caller has handed it to a worker. Callers flip the flag to
    ``False`` before submitting a worker; every worker method flips it back
    to ``True`` when it finishes, whether it succeeded or not.
    """

    def __init__(self, keys_paths, username_path):
        """Load credentials and the username table, then build the clients.

        Args:
            keys_paths: iterable of paths to pickled credential dicts.
            username_path: path of the CSV file read into ``self.usernames``.
        """
        self.keys = self.read_key(keys_paths)
        self.usernames = self.read_csv(username_path)
        self.apis = self.auth_twitter()
        self.api_statuses = [True] * len(self.apis)
        self.setup()

    def setup(self):
        """Create the profile and following output directories if missing."""
        for path in ("../data/profile", "../data/following"):
            os.makedirs(path, exist_ok=True)

    def read_key(self, key_paths):
        """Return the unpickled content of every credential file."""
        keys = []
        for path in key_paths:
            # NOTE(security): unpickling runs arbitrary code from the file;
            # only point this at key files you created yourself.
            with open(path, 'rb') as file:
                keys.append(pickle.load(file))
        return keys

    def read_csv(self, path):
        """Read ``path`` with pandas and return the DataFrame."""
        return pd.read_csv(path)

    def read_json(self, path):
        """Return the parsed content of the UTF-8 JSON file at ``path``."""
        with open(path, 'r', encoding="utf-8") as file:
            data_dict = json.load(file)

        return data_dict

    def auth_twitter(self):
        """Build one authenticated ``tweepy.API`` client per loaded key."""
        api_list = []
        for key in self.keys:
            auth = tweepy.OAuthHandler(key["api_key"], key["api_secret_key"])
            auth.set_access_token(key["access_token"], key["access_token_secret"])
            api_list.append(tweepy.API(auth))

        return api_list

    def get_free_token(self):
        """Return ``(api, index)`` of the first free token.

        Returns ``(None, None)`` when every token is currently in use.
        """
        index = next(
            (i for i, is_free in enumerate(self.api_statuses) if is_free),
            None,
        )
        if index is None:
            return None, None
        return self.apis[index], index

    def output(self, data, path_dir, filename):
        """Persist ``data`` as ``<filename>.json``, or ``.pkl`` as a fallback.

        The JSON text is produced before any file is opened, so a value that
        cannot be JSON-encoded no longer leaves an empty ``.json`` file next
        to the pickle. The target directory is created on demand because
        ``setup`` only prepares the profile and following directories.
        """
        if path_dir:
            os.makedirs(path_dir, exist_ok=True)

        try:
            payload = json.dumps(data)
        except (TypeError, ValueError):
            # Not JSON-serialisable: keep the raw object as a pickle instead.
            with open(os.path.join(path_dir, filename + '.pkl'), 'wb') as f:
                pickle.dump(data, f)
            return

        with open(os.path.join(path_dir, filename + '.json'), 'w', encoding='utf-8') as f:
            f.write(payload)

    def get_profile_user(self, username, api, index_token):
        """Fetch one profile and store it under ``../data/profile``.

        On a rate limit the call sleeps 15 minutes and retries. On any other
        tweepy error the error text is stored under the requested username
        (without ``@``). The token is always marked free on exit.
        """
        try:
            while True:
                try:
                    profile = api.get_user(username)._json
                except tweepy.RateLimitError:
                    print("\tRateLimit", datetime.today().strftime("\t%H:%M:%S %d-%m-%Y"))
                    time.sleep(15*60)
                    continue
                except tweepy.TweepError as e:
                    # ``response`` is None when the request never got an HTTP
                    # reply; compare with ``is None`` because a requests
                    # Response is falsy for 4xx/5xx statuses.
                    response = getattr(e, "response", None)
                    err_msg = response.text if response is not None else str(e)
                    self.output(err_msg, '../data/profile', username.replace("@", ''))
                    return

                self.output(profile, '../data/profile', profile["screen_name"])
                return
        finally:
            self.api_statuses[index_token] = True

    def limit_handled(self, cursor):
        """Yield items from a tweepy cursor iterator, waiting out rate limits.

        Rate-limit errors (including reasons mentioning 429) sleep 15 minutes
        and retry. "Failed to send request" errors are retried immediately.
        Any other tweepy error ends the iteration early.
        """
        while True:
            try:
                yield cursor.next()
            except tweepy.RateLimitError:
                print('\tRateLimit', datetime.today().strftime("\t%H:%M:%S %d-%m-%Y"))
                time.sleep(15 * 60)
            except tweepy.TweepError as e:
                reason = e.reason
                if "Failed to send request" in reason:
                    # Treated as transient: try the same fetch again.
                    continue
                if '429' in reason:
                    print('\tRateLimit', datetime.today().strftime("\t%H:%M:%S %d-%m-%Y"))
                    time.sleep(15 * 60)
                    continue
                # Make the early stop visible instead of silently truncating.
                print('\tTweepError, stopping iteration:', reason)
                return
            except StopIteration:
                return

    def get_following(self, username, api, index_token):
        """Store every account ``username`` follows.

        Each followed account's profile goes to ``../data/profile`` and the
        list of their screen names to ``../data/following/<username>.json``.
        The token is always marked free on exit.
        """
        try:
            user_follower_dict = {username: []}

            for friend in self.limit_handled(tweepy.Cursor(api.friends, id=username).items()):
                user_follower_dict[username].append(friend.screen_name)
                self.output(friend._json, '../data/profile', friend.screen_name)

            self.output(user_follower_dict, '../data/following', username)
        finally:
            self.api_statuses[index_token] = True

    def get_search_tweet(self, hashtag, api, index_token):
        """Search up to 400 tweets for ``hashtag`` and store each one.

        The token is always marked free on exit.
        """
        try:
            for tweet in self.limit_handled(tweepy.Cursor(api.search, q=hashtag).items(400)):
                # NOTE(review): files are keyed by author screen name, so a
                # later tweet by the same author overwrites the earlier one.
                self.output(tweet._json, '../data/search_tweets', tweet.user.screen_name)
        finally:
            self.api_statuses[index_token] = True

    def get_user_timeline(self, username, api, index_token):
        """Store up to 300 timeline tweets of ``username`` in one file.

        The token is always marked free on exit.
        """
        try:
            all_tweets = [
                tweet._json
                for tweet in self.limit_handled(
                    tweepy.Cursor(api.user_timeline, username).items(300)
                )
            ]

            self.output(all_tweets, '../data/user_timeline_46K', username)
        finally:
            self.api_statuses[index_token] = True

In [7]:
friends = Friends(key_paths, daftar_buzzer)

In [10]:
def calculate_time(start, end):
    """Split the time between ``start`` and ``end`` (seconds) into parts.

    Returns:
        A ``(minutes, seconds)`` tuple of ints, each truncated by ``int``.
    """
    elapsed = end - start
    return int(elapsed / 60), int(elapsed % 60)

In [11]:
def get_profile_thread(usernames):
    """Download the profile of every user in ``usernames`` on a thread pool.

    For each username the function waits for a free token from the global
    ``friends`` object, marks it busy and submits
    ``friends.get_profile_user``. The global ``progress_bar_profile`` is
    advanced once per submitted task (on submission, not on completion).
    Elapsed minutes and seconds are printed when the pool has drained.
    """
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        print("usernames", usernames)
        for username in usernames:
            # Poll with a short pause so the wait does not spin at full CPU
            # while the workers hold every token.
            api, index_token = friends.get_free_token()
            while index_token is None:
                time.sleep(0.1)
                api, index_token = friends.get_free_token()

            friends.api_statuses[index_token] = False
            try:
                executor.submit(friends.get_profile_user, username, api, index_token)
            except RuntimeError:
                # The task never started, so give the token back before failing.
                friends.api_statuses[index_token] = True
                raise
            # Kept outside the try block: an error here must not be mistaken
            # for a failed submit and cause the same user to be queued again.
            progress_bar_profile.update(1)

    end = time.perf_counter()
    m, s = calculate_time(start,end)
    print("Time:", m, s)

In [12]:
def get_following_thread(path_profiles):
    """Download the following list for every stored profile on a thread pool.

    Each path in ``path_profiles`` is read with ``friends.read_json``:

    * content without a ``screen_name`` entry is copied to
      ``../data/following`` under the profile's file name;
    * profiles with more than 900 friends, or with none, get a status
      message written instead of being crawled;
    * all others are submitted to ``friends.get_following`` once a token is
      free, and the global ``progress_bar`` is advanced on submission.

    Elapsed minutes and seconds are printed when the pool has drained.
    """
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for path in path_profiles:
            user_profile = friends.read_json(path)

            if "screen_name" not in user_profile:
                friends.output(user_profile, "../data/following", path.split("/")[-1].replace(".json", ""))
                continue

            username = user_profile["screen_name"]
            friends_count = user_profile["friends_count"]
            if friends_count > 900 or friends_count == 0:
                # NOTE(review): the same status text is written for a count
                # of 0, although that is not "above" any threshold.
                msg = {"status": "friends_count is above threshold", "friends_count": friends_count}
                friends.output(msg, "../data/following", username)
                continue

            # Poll with a short pause so the wait does not spin at full CPU.
            api, index_token = friends.get_free_token()
            while index_token is None:
                time.sleep(0.1)
                api, index_token = friends.get_free_token()

            friends.api_statuses[index_token] = False
            try:
                executor.submit(friends.get_following, username, api, index_token)
            except RuntimeError:
                # The task never started, so give the token back before failing.
                friends.api_statuses[index_token] = True
                raise
            # Kept outside the try block: an error here must not be mistaken
            # for a failed submit and cause the same user to be queued again.
            progress_bar.update(1)

    end = time.perf_counter()
    m, s = calculate_time(start, end)
    print("Time:", m, s)

In [13]:
def get_hashtag_tweets(hashtages):
    """Search tweets for every hashtag in ``hashtages`` on a thread pool.

    For each hashtag the function waits for a free token from the global
    ``friends`` object, prints a progress line, marks the token busy and
    submits ``friends.get_search_tweet``. Elapsed minutes and seconds are
    printed when the pool has drained.
    """
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        for index, hashtag in enumerate(hashtages):
            # Poll with a short pause so the wait does not spin at full CPU.
            api, index_token = friends.get_free_token()
            while index_token is None:
                time.sleep(0.1)
                api, index_token = friends.get_free_token()

            print(index, "/", len(hashtages), "getting", hashtag, "data...")
            friends.api_statuses[index_token] = False
            try:
                executor.submit(friends.get_search_tweet, hashtag, api, index_token)
            except RuntimeError:
                # The task never started, so give the token back before
                # failing; retrying a broken executor would loop forever.
                friends.api_statuses[index_token] = True
                raise

    end = time.perf_counter()
    m, s = calculate_time(start,end)
    print("Time:", m, s)

In [21]:
def get_user_first_tweet(usernames):
    """Download the timeline of every user in ``usernames`` on a thread pool.

    Despite the name, each task calls ``friends.get_user_timeline``. A local
    notebook progress bar is advanced once per submitted task (on
    submission, not on completion). Elapsed minutes and seconds are printed
    when the pool has drained.
    """
    progress_bar = notebook.tqdm(total=len(usernames))

    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        print(usernames)
        for username in usernames:
            print(username)
            # Poll with a short pause so the wait does not spin at full CPU
            # while the workers hold every token.
            api, index_token = friends.get_free_token()
            while index_token is None:
                time.sleep(0.1)
                api, index_token = friends.get_free_token()

            friends.api_statuses[index_token] = False
            try:
                executor.submit(friends.get_user_timeline, username, api, index_token)
            except RuntimeError:
                # The task never started, so give the token back before failing.
                friends.api_statuses[index_token] = True
                raise
            progress_bar.update(1)

    end = time.perf_counter()
    m, s = calculate_time(start,end)
    print("Time:", m, s)

In [15]:
def get_usernames_have_not_following(profile):
    """Return the profile paths that have no matching following file yet.

    Args:
        profile: iterable of "/"-separated profile paths, or ``None`` to use
            every file under ``../data/profile``.

    Returns:
        The paths whose file name (minus ``.json``) does not appear among
        the file names under ``../data/following``, in input order.
    """
    def _listing(pattern):
        # glob results with Windows separators normalised to "/".
        return [found.replace("\\", "/") for found in glob.glob(pattern)]

    def _name_of(path):
        return path.split('/')[-1].replace('.json', "")

    if profile is None:
        profile = _listing("../data/profile/*")

    already_done = [_name_of(path) for path in _listing("../data/following/*")]

    # Later duplicates of a name replace the stored path but keep its slot.
    path_by_name = {_name_of(path): path for path in profile}
    for name in already_done:
        if name in path_by_name:
            del path_by_name[name]

    return [path for path in path_by_name.values()]

In [16]:
def have_not_following(path_usernames):
    """Return the sorted, existing profile paths still lacking a following file.

    The candidates come from ``get_usernames_have_not_following``; paths that
    do not exist on disk are dropped.
    """
    pending = get_usernames_have_not_following(path_usernames)
    return [path for path in sorted(pending) if os.path.exists(path)]

## Getting Started

In [None]:
# Usernames from the CSV loaded by `friends`, and the profile file expected
# for each one (the leading "@" is not part of the file name).
usernames = friends.usernames.username.to_list()
path_usernames = [
    f"../data/profile/{handle.replace('@', '')}.json"
    for handle in usernames
]

### get profile

In [None]:
# Screen names whose profile JSON has not been downloaded yet.
# `screen_names` must already be loaded; that happens in the
# "get following user" section further down this notebook.
path_usernames = [
    name
    for name in screen_names
    if not os.path.exists('../data/profile/' + name + '.json')
]

In [None]:
progress_bar_profile = notebook.tqdm(total = len(path_usernames))

In [None]:
get_profile_thread(path_usernames)

### get following user

In [None]:
# Read as UTF-8 explicitly, matching Friends.read_json; without it open()
# falls back to the platform's locale encoding.
with open('../data/supports/screen_name_id.json', 'r', encoding='utf-8') as f:
    screen_names = json.load(f)

In [None]:
len(screen_names)

In [None]:
# Profile paths of the screen names whose profile JSON is already on disk.
path_usernames = [
    candidate
    for candidate in ('../data/profile/' + name + '.json' for name in screen_names)
    if os.path.exists(candidate)
]

In [None]:
path_usernames[:1]

In [None]:
path_usernames = have_not_following(path_usernames)

In [None]:
path_usernames = ['addiems', 'agussar', 'AksiKamisan', 'AksiLangsung', 'anandasukarlan', 'andre_rosiade', 'aniesbaswedan', 'Anyaselalubenar', 'AS86222548', 'badgaIdidi', 'BEAUTIFULYOONGO', 'braddamamad', 'CahyadiAnugrah', 'catwomanizer', 'epta25', 'FarraDemetria', 'ferizandra', 'GagakLu93445554', 'gitaputrid', 'Greschinov', 'hamdJr666', 'HAR4K', 'honcuk', 'ianhugen', 'Insideme11', 'ismailfahmi', 'jatamnas', 'justforfun9922', 'LaillyFadillah', 'Lini_ZQ', 'maderodog', 'MantapMana', 'Mantul_234', 'margianta', 'mas_piyuuu', 'mas__piyuuu', 'MawarSolitaire', 'menuju_harapan', 'mochamadarip', 'mohmahfudmd', 'muannas_alaidid', 'MuliaRamadhan10', 'Nadine_Oliv', 'NajwaShihab', 'narasitv', 'negativisme', 'Nelangsa_', 'OmahMunir', 'OposisiCerdas', 'org_hidup', 'P4tihGajahMada', 'Pattimura1817', 'PresidenWKWK', 'RamliRizal', 'ReiNaldo85', 'Roma13054710', 'Rustodead', 'rzrhmn', 'safirawwww', 'SanKohtaro', 'SerenityAthen', 'ShadowJoe2', 'sociotalker', 'Soliper_SP', 'SorotMata212', 'taecemver', 'TanYoana', 'TeddyGusnaidi', 'TretanMuslim', 'Uki23', 'VeronicaKoman', 'Yatie84991237']

In [None]:
path_usernames = ["../data/profile/"+username+".json" for username in path_usernames]

In [None]:
len(path_usernames)

In [None]:
progress_bar = notebook.tqdm(total = len(path_usernames))

In [None]:
get_following_thread(path_usernames)

### get hashtag tweets

In [None]:
# Trends from the first sheet whose `category` is 1 or 2, as a plain list.
hashtag_list = (
    pd.read_excel("../data/trends/hashtag_label.xlsx", sheet_name=0)
    .loc[lambda frame: frame["category"].isin([1,2])]
    .trend
    .to_list()
)

In [None]:
len(hashtag_list[50+27:])

In [None]:
get_hashtag_tweets(hashtag_list[50+27:])

### get first tweet of user

In [None]:
# Read as UTF-8 explicitly, matching Friends.read_json; without it open()
# falls back to the platform's locale encoding.
with open('screen_name_filtered.json', 'r', encoding='utf-8') as f:
    screennames = json.load(f)

In [None]:
screennames = screennames["screen_name"]

In [None]:
# Screen names that already have a timeline file.
# NOTE(review): Friends.get_user_timeline writes to '../data/user_timeline_46K',
# not to the directory listed here - confirm which one is intended.
screennames_exists = [
    entry.replace('.json', '')
    for entry in os.listdir('../data/user_timeline')
]

In [None]:
screennames = list(set(screennames) - set(screennames_exists))

In [None]:
len(screennames)

In [None]:
progress_bar = notebook.tqdm_notebook(total=len(screennames))

In [None]:
get_user_first_tweet(screennames)

### get 300 tweets of user

In [22]:
# Read as UTF-8 explicitly, matching Friends.read_json; without it open()
# falls back to the platform's locale encoding.
with open('../data/supports/46K_users.json', 'r', encoding='utf-8') as f:
    usernames = json.load(f)

In [23]:
friends

<__main__.Friends at 0x239b70b63c8>

In [24]:
# Smoke run on the first user only. The slice bound was missing
# (`usernames[:)` is a syntax error); the recorded output below shows a
# single username and a progress total of 1, which matches `[:1]`.
get_user_first_tweet(usernames[:1])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

['kangkueh06']
kangkueh06
Time: 0 0


## debug

Read the saved output files back and print them for inspection.

In [None]:
files = glob.glob('../data/profile/*')

In [None]:
# Friends.output writes either `.json` or `.pkl`, so pick the loader by
# extension instead of unpickling everything (which raises on JSON files).
for file in files:
    if file.endswith('.pkl'):
        # NOTE(security): only unpickle files this notebook wrote itself.
        with open(file, 'rb') as handle:
            data = pickle.load(handle)
    else:
        with open(file, 'r', encoding='utf-8') as handle:
            data = json.load(handle)
    print(data)
    print("\n")

## review

In [None]:
following_paths = glob.glob('../data/following/*')

In [None]:
# Collect following files that contain an empty value (e.g. a user whose
# following list came back empty).
file_err = []
for path in following_paths:
    data = friends.read_json(path)
    try:
        for val in data.values():
            if len(val) == 0:
                file_err.append(path)
    except (AttributeError, TypeError):
        # Not a dict, or a value without a length (such as the integer
        # friends_count in a status message): skip the rest of this file.
        continue

## Profiles with 300–900 friends that have no following file yet

In [None]:
following_paths = glob.glob('../data/following/*')

In [None]:
# Normalise separators first so the file name is extracted correctly on
# both Windows ("\\") and POSIX ("/") paths returned by glob.
following_username = [
    os.path.basename(path.replace("\\", "/")).replace(".json", "")
    for path in following_paths
]

In [None]:
profile_paths = glob.glob('../data/profile/*.json')

In [None]:
profile_paths = [path.replace("\\", "/") for path in profile_paths]

In [None]:
# Profiles with 300-900 friends that have no following file yet.
under_thresh = []
# Build the lookup once; testing against the list rescans it per profile.
already_crawled = set(following_username)
for path in tqdm(profile_paths):
    try:
        data = friends.read_json(path)
        if "screen_name" not in data or data["screen_name"] in already_crawled:
            continue
        friends_count = data["friends_count"]
        if 299 < friends_count < 901:
            under_thresh.append({"path": path, "friends_count": friends_count})
    except (OSError, ValueError, KeyError, TypeError):
        # Unreadable or malformed profile file: report it and move on.
        print(path)

In [None]:
under_thresh = sorted(under_thresh, key = lambda x: x["friends_count"])

In [None]:
len(under_thresh)

In [None]:
path_usernames = [path["path"].replace('\\', '/') for path in under_thresh]

In [None]:
len(path_usernames)