## Configuration

### Imports

In [1]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
from datetime import datetime
from unidecode import unidecode
import os
import ntpath
import numpy as np
import statistics 
import re
import math
import random
import datetime
import numbers
from collections.abc import MutableMapping
import pandas as pd
import time
from random import randint

# Botometer API
import botometer

# MongoDB functionality
from pymongo import MongoClient, InsertOne
from bson import ObjectId

# Concurrency
import concurrent.futures
from tqdm.notebook import tqdm

# Config management
from configparser import ConfigParser


# Directories where CSV data is stored
# NOTE(review): absolute, machine-specific path — adjust before running on another host.
ROOT_DIR = "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis"

# Change path to root, so relative paths used later resolve against ROOT_DIR
os.chdir(ROOT_DIR)

# MongoDB parameters: local server on port 27017, database "influence"
mongoclient = MongoClient('localhost', 27017)
db = mongoclient.influence

### Available botometer keys (rapidapi key, twitter consumer key, twitter consumer secret)

In [2]:
%%time

# Load the Botometer credentials. Every section of the config file is treated
# as one credential set: rapidapi_key, consumer_key and consumer_secret.
config_object = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means nothing was loaded.
if not config_object.read("config.init"):
    raise FileNotFoundError("Could not read Botometer credentials from 'config.init'")

# One Botometer client per credential set, indexed 0..n-1 so that work can be
# assigned to clients by integer position.
keys = {}
for k, key in enumerate(config_object.sections()):
    # Read from `config_object` itself: the previous `config_object2` is not
    # defined anywhere in this notebook and fails on a fresh kernel.
    keys[k] = botometer.Botometer(wait_on_ratelimit=True,
                                  rapidapi_key=config_object.get(key, 'rapidapi_key'),
                                  consumer_key=config_object.get(key, 'consumer_key'),
                                  consumer_secret=config_object.get(key, 'consumer_secret'))

CPU times: user 330 ms, sys: 50.1 ms, total: 380 ms
Wall time: 8.82 s


### Support Functions

In [3]:
def flatten(d, parent_key='', sep='_'):
    """Collapses a nested mapping into a single-level dictionary
    
    Keys of nested mappings are joined to their parent key with `sep`,
    e.g. {'a': {'b': 1}} becomes {'a_b': 1}. A nested mapping that is
    empty contributes no entries to the result.
    
    Keyword arguments:
    d -- mapping to flatten (e.g. a MongoDB document)
    parent_key -- prefix prepended to every key; empty at the top level
    sep -- separator placed between the prefix and each key
    """
    flat = {}
    for name, value in d.items():
        path = name if not parent_key else parent_key + sep + name
        if isinstance(value, MutableMapping):
            # Recurse, carrying the accumulated key path as the new prefix
            flat.update(flatten(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

def make_objid(text):
    """Builds an ObjectId from a value, left-padding it with zeros
    
    The value is converted to a string and right-justified to 24
    characters with "0" before being passed to ObjectId. Returns None
    when the string is empty or whitespace-only, or when the ObjectId
    cannot be built (the value and the error are printed in that case).
    
    Keyword arguments:
    text -- value to be converted into an ObjectId
    """
    raw = str(text)
    if raw.strip() == "":
        return None
    padded = raw.rjust(24, "0")
    try:
        return ObjectId(padded)
    except Exception as ex:
        print(raw, ex)
        return None

def get_botscore_by_userid(iteration, key, user_id, user_collection, min_wait=90, max_wait=100):
    """Requests the Botometer score of one user and stores it in MongoDB
    
    On success the Botometer result is saved under 'botscore' and any
    previous 'reason' field is removed. On failure a sentinel is saved
    instead: -2 when the error message contains 'Too Many Requests'
    (get_users selects these users again), -3 with the error text as
    'reason' for any other error.
    
    Keyword arguments:
    iteration -- sequence number of the request, only used in the error message
    key -- index into the global `keys` dict selecting the Botometer client
    user_id -- user to check; also the (zero-padded) _id of the user document
    user_collection -- MongoDB collection where the result is upserted
    min_wait -- lower bound, in seconds, of the random pause before the request
    max_wait -- upper bound, in seconds, of the random pause before the request
    
    Returns True on success, otherwise a one-line error string ending in a newline.
    """
    object_id = make_objid(user_id)
    try:
        # Random pause before every request (presumably to stay under the API
        # rate limit — TODO confirm the intended budget)
        time.sleep(randint(min_wait, max_wait))
        result = keys[key].check_account(user_id)
        user_collection.update_one({'_id': object_id},
                                   {'$set': {'botscore': result}, '$unset': {'reason': ''}},
                                   upsert=True)
        return True

    except Exception as e:
        message = str(e)
        if 'Too Many Requests' in message:
            failure = {'botscore': -2, 'reason': 'too many requests'}
        else:
            failure = {'botscore': -3, 'reason': message}
        user_collection.update_one({'_id': object_id}, {'$set': failure}, upsert=True)

        # str(user_id) keeps the handler from raising TypeError on non-string ids,
        # which would hide the original error
        return (str(iteration) + " - Exception. User: " + str(user_id) + "; Key: " + str(key)
                + "; Message: " + message + "\n")
    
def get_tweets(collection):
    """
    Loads the user_id of every tweet into a DataFrame
    
    All documents are fetched (empty filter) keeping only the user_id
    field, their count is printed, and each document is flattened
    before being turned into a row.
    
    collection - Tweets MongoDB collection
    """
    projection = {'_id' : False, 'user_id' : True}
    documents = [doc for doc in collection.find({}, projection)]
    
    print("Number of tweets in DB:", len(documents))
    return pd.DataFrame(list(map(flatten, documents)))

def get_retweets(collection):
    """
    Loads the user_id of every retweet into a DataFrame
    
    All documents are fetched (empty filter) keeping only the user_id
    field, their count is printed, and each document is flattened
    before being turned into a row.
    
    collection - Retweets MongoDB collection
    """
    projection = {'_id': False, 'user_id' : True}
    documents = [doc for doc in collection.find({}, projection)]
    
    print("Number of retweets in DB:", len(documents))
    return pd.DataFrame(list(map(flatten, documents)))

def get_users(collection):
    """
    Loads the _id of every user that still needs a botscore
    
    A user is selected when the document has no 'botscore' field or
    when its 'botscore' equals -2. The number of selected users is
    printed and each document is flattened before becoming a row.
    
    collection - Users MongoDB collection
    """
    pending = { '$or' : [ { 'botscore': { '$exists' : False } }, {'botscore': -2} ]}
    documents = [doc for doc in collection.find(pending, {'_id' : True})]
    
    print("Number of users without botscore in DB:", len(documents))
    return pd.DataFrame(list(map(flatten, documents)))

## Get Botscores

Get tweets and retweets

In [4]:
%%time
# Load the user_id of every document in db.retweets (one row per document)
# and preview the first five rows.
df_retweets = get_retweets(db.retweets)
display(df_retweets.head(5))

Number of retweets in DB: 39344305


Unnamed: 0,user_id
0,4460826197
1,1075150211725619200
2,3251522811
3,904385876
4,952575694021758976


CPU times: user 4min 33s, sys: 18.6 s, total: 4min 52s
Wall time: 5min 16s


In [5]:
%%time
# Load the user_id of every document in db.tweets (one row per document)
# and preview the first five rows.
df_tweets = get_tweets(db.tweets)
display(df_tweets.head(5))

Number of tweets in DB: 1875457


Unnamed: 0,user_id
0,276977398
1,2906096735
2,1620996282
3,1163542858604916739
4,279465279


CPU times: user 9.02 s, sys: 87.2 ms, total: 9.1 s
Wall time: 10.4 s


In [6]:
%%time
# Per-user activity: number of tweets (freq) and number of retweets (freq2).
freq = df_tweets.user_id.value_counts()
freq2 = df_retweets.user_id.value_counts()
# Sum both counts per user; fill_value=0 keeps users present in only one Series.
fsum = freq.add(freq2, fill_value=0)
print("Total users:",len(fsum))

Total users: 2802467
CPU times: user 2min 52s, sys: 1.87 s, total: 2min 54s
Wall time: 2min 37s


Get users without a botscore (or whose previous request was rate-limited, i.e. `botscore == -2`)

In [7]:
%%time
# Users whose document has no 'botscore' yet, or whose botscore is -2.
df_users = get_users(db.users)

Number of users without botscore in DB: 2601875
CPU times: user 16.2 s, sys: 228 ms, total: 16.5 s
Wall time: 16.4 s


In [8]:
# Preview the first two pending users.
df_users.head(2)

Unnamed: 0,_id
0,2266588688
1,471028961


Order users by importance (number of tweets plus retweets, most active first)

In [9]:
%%time
# Keep the activity counts only for users that still need a botscore
# (those returned by get_users).
filtered_fsum = fsum[fsum.index.isin(df_users._id)]
print(len(filtered_fsum))

2601875
CPU times: user 9.85 s, sys: 3.69 ms, total: 9.85 s
Wall time: 9.85 s


In [10]:
# Most active users first, so they are scored before the less active ones.
ordered_users = filtered_fsum.sort_values(ascending=False)
ordered_users.head(5)

000001183437478914641920    117.0
000000986284368309161984    117.0
000000000000000186037690    117.0
000000000000000072070959    117.0
000000752995384125947904    117.0
Name: user_id, dtype: float64

In [None]:
filename = "error.log"
daily_limit = 500

# ROOT_DIR has no trailing separator, so plain concatenation would produce
# "...network-analysiserror.log" next to the project directory; join the parts.
log_path = os.path.join(ROOT_DIR, filename)

# One worker thread per Botometer client.
with concurrent.futures.ThreadPoolExecutor(max_workers=len(keys)) as executor:
    futures = []
    # Most active users first, capped at `daily_limit` requests per client.
    batch = ordered_users.head(len(keys) * daily_limit).index
    for it, user_id in enumerate(batch):
        key = it % len(keys)  # round-robin over the available clients
        futures.append(executor.submit(get_botscore_by_userid, it, key, str(user_id), db.users))

    for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures), leave=True):
        try:
            outcome = f.result()
        except Exception as ex:
            # result() re-raises whatever escaped the worker (e.g. a failed
            # MongoDB write); log it instead of aborting the remaining futures.
            outcome = "Unhandled exception in worker: " + str(ex) + "\n"
        if outcome is not True:
            # Anything other than True is an error line to append to the log.
            with open(log_path, 'a') as myfile:
                myfile.write(outcome)