## Configuration

### Imports

In [1]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
from datetime import datetime
from unidecode import unidecode
import os
import ntpath
import numpy as np
import statistics 
import re
import math
import random
import datetime
import numbers
from collections.abc import MutableMapping
import pandas as pd
import time
from random import randint

# Botometer API
import botometer

# MongoDB functionality
from pymongo import MongoClient, InsertOne, UpdateOne
from bson import ObjectId

# Concurrency
import concurrent.futures
from tqdm.notebook import tqdm

# Config management
from configparser import ConfigParser


# Project root directory. Relative paths used by later cells (e.g. "config.init")
# resolve against it because it becomes the working directory below.
# On another machine, set the BOTBUSTERS_ROOT_DIR environment variable instead of
# editing this cell; the original checkout location is kept as the default.
ROOT_DIR = os.environ.get(
    "BOTBUSTERS_ROOT_DIR",
    "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis/",
)
# A later cell builds a path with `ROOT_DIR + filename`, so guarantee a trailing separator
ROOT_DIR = os.path.join(ROOT_DIR, "")

# Change path to root
os.chdir(ROOT_DIR)

# MongoDB parameters
mongoclient = MongoClient('localhost', 27017)
db = mongoclient.influence

### Available botometer keys (rapidapi key, twitter consumer key, twitter consumer secret)

In [2]:
%%time

# Load the Botometer credentials. Each section of the config file provides one
# credential set: rapidapi_key, consumer_key and consumer_secret.
config_path = "config.init"
config_object = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so check that list to fail early with a clear message.
if not config_object.read(config_path):
    raise FileNotFoundError("Could not read Botometer config file: " + os.path.abspath(config_path))

# keys maps a 0-based index to a Botometer client, one per config section
keys = {}
for k, key in enumerate(config_object.sections()):
    keys[k] = botometer.Botometer(wait_on_ratelimit=True,
                                  rapidapi_key=config_object.get(key, 'rapidapi_key'),
                                  consumer_key=config_object.get(key, 'consumer_key'),
                                  consumer_secret=config_object.get(key, 'consumer_secret'))

CPU times: user 621 ms, sys: 53 ms, total: 674 ms
Wall time: 14.4 s


### Support Functions

In [12]:
def flatten(d, parent_key='', sep='_'):
    """Collapses a nested dictionary into a single-level dictionary

    Keys of nested dictionaries are joined to their parent key with `sep`,
    e.g. {'a': {'b': 1}} becomes {'a_b': 1}.

    Keyword arguments:
    d -- (possibly nested) dictionary, e.g. a MongoDB document
    parent_key -- prefix prepended to every key of d (empty string means no prefix)
    sep -- separator placed between a parent key and a child key
    """
    flat = {}
    for name, value in d.items():
        full_name = name if not parent_key else parent_key + sep + name
        if not isinstance(value, MutableMapping):
            flat[full_name] = value
            continue
        # Nested mapping: recurse, using the joined key as the new prefix
        flat.update(flatten(value, full_name, sep=sep))
    return flat

def make_objid(text):
    """Builds an ObjectId from a value, left-padding it with zeros to 24 characters

    Keyword arguments:
    text -- value to be converted into an ObjectId (str() is applied first)

    Returns the ObjectId, or None when the text is blank or ObjectId rejects
    it (the offending text and the error are printed in that case).
    """
    raw = str(text)
    if raw.strip() == "":
        return None
    padded = raw.rjust(24, "0")
    try:
        object_id = ObjectId(padded)
    except Exception as err:
        print(raw, err)
        object_id = None
    return object_id

def get_botscore_by_userid(iteration, key, user_id, user_collection, delay_range=(85, 115)):
    """Requests the Botometer result of one user and stores it in MongoDB

    On success the Botometer result is saved in the user's 'botscore' field and
    any 'reason' field is removed. On failure a sentinel is stored instead:
    botscore -2 when the error message contains 'Too Many Requests', otherwise
    botscore -3 together with the error message as 'reason'.

    Keyword arguments:
    iteration -- sequence number of the request, only used in the error message
    key -- index into the global `keys` dict selecting the Botometer client
    user_id -- user identifier passed to Botometer and converted with make_objid
    user_collection -- Users MongoDB collection
    delay_range -- (min, max) seconds; a random pause within this inclusive
                   range is taken before the request

    Returns True on success, otherwise a one-line error description (str).
    """
    try:
        # Random pause before each request -- presumably to stay under the API rate limits
        time.sleep(randint(*delay_range))
        result = keys[key].check_account(user_id)
        user_collection.update_one({'_id': make_objid(user_id)},
                                   {'$set': {'botscore': result}, '$unset': {'reason': ''}},
                                   upsert=True)
        return True

    except Exception as e:
        message = str(e)
        if 'Too Many Requests' in message:
            failure = {'botscore': -2, 'reason': 'too many requests'}
        else:
            failure = {'botscore': -3, 'reason': message}

        user_collection.update_one({'_id': make_objid(user_id)},
                                   {'$set': failure},
                                   upsert=True)

        # str(user_id) keeps the message buildable even if the caller passes a non-string id
        return (str(iteration) + " - Exception. User: " + str(user_id)
                + "; Key: " + str(key) + "; Message: " + message + "\n")
    
def get_tweets(collection):
    """
    Loads the user_id of every document in the tweets collection

    collection - Tweets MongoDB collection

    Returns a DataFrame built from the flattened documents
    """
    projection = {'_id' : False, 'user_id' : True}
    documents = list(collection.find({}, projection))
    print("Number of tweets in DB:", len(documents))
    return pd.DataFrame([flatten(doc) for doc in documents])

def get_retweets(collection):
    """
    Loads the user_id of every document in the retweets collection

    collection - Retweets MongoDB collection

    Returns a DataFrame built from the flattened documents
    """
    projection = {'_id': False, 'user_id' : True}
    documents = list(collection.find({}, projection))
    print("Number of retweets in DB:", len(documents))
    return pd.DataFrame([flatten(doc) for doc in documents])

def get_users_for_botscore(collection):
    """
    Gets the users that still need a botscore request

    A user is selected when it has no 'botscore' field, when its botscore
    is -2, or when its 'reason' field is 'unknown'.

    collection - Users MongoDB collection

    Returns a DataFrame with _id, position_num_interactions and num_interactions
    """
    pending_filter = {'$or': [
        {'botscore': {'$exists': False}},
        {'botscore': -2},
        {'reason': 'unknown'},
    ]}
    wanted_fields = {'_id': True, 'position_num_interactions': True, 'num_interactions': True}
    documents = list(collection.find(pending_filter, wanted_fields))
    print("Number of users without botscore in DB:", len(documents))
    flattened = [flatten(doc) for doc in documents]
    return pd.DataFrame(flattened)

def get_users(collection):
    """
    Gets the _id of every user

    collection - Users MongoDB collection

    Returns a DataFrame built from the flattened documents
    """
    users = list(collection.find({},
                                 {'_id' : True}))

    # The query is unfiltered, so this is the total user count
    # (the message used to claim "users without botscore")
    print("Number of users in DB:", len(users))
    users = [flatten(u) for u in users]
    df_users = pd.DataFrame(users)
    return df_users

## Get number of interactions by user

Get tweets and retweets

In [6]:
%%time
# Load the user_id of every retweet (get_retweets reads the whole collection into a list)
df_retweets = get_retweets(db.retweets)
display(df_retweets.head(5))

Number of retweets in DB: 39344305


Unnamed: 0,user_id
0,4460826197
1,1075150211725619200
2,3251522811
3,904385876
4,952575694021758976


CPU times: user 4min 17s, sys: 15.1 s, total: 4min 32s
Wall time: 4min 55s


In [7]:
%%time
# Load the user_id of every tweet (get_tweets reads the whole collection into a list)
df_tweets = get_tweets(db.tweets)
display(df_tweets.head(5))

Number of tweets in DB: 1875457


Unnamed: 0,user_id
0,276977398
1,2906096735
2,1620996282
3,1163542858604916739
4,279465279


CPU times: user 8.57 s, sys: 96.1 ms, total: 8.66 s
Wall time: 9.91 s


In [8]:
%%time
# Number of rows per user_id in the tweets frame
freq = df_tweets.user_id.value_counts()
# Number of rows per user_id in the retweets frame
freq2 = df_retweets.user_id.value_counts()
# Total interactions per user; fill_value=0 covers users that appear in only one of the two series
fsum = freq.add(freq2, fill_value=0)
print("Total users:",len(fsum))

Total users: 2802467
CPU times: user 3min 4s, sys: 1.66 s, total: 3min 6s
Wall time: 2min 50s


In [9]:
# Most active users first; the position in this order is stored later as position_num_interactions
fsum = fsum.sort_values(ascending=False)
fsum.head(10)

000000000000002479879152    15225.0
000000000000003064247597    11152.0
000000000000003387713985     9829.0
000001061332198266204164     9486.0
000000000000001723241016     8521.0
000000830507792998027266     8233.0
000000000000000216711237     8107.0
000000000000000128534745     7917.0
000000825432095577296896     7845.0
000000000000002420883178     7763.0
Name: user_id, dtype: float64

In [None]:
# Store, for every user, the number of interactions and its position in fsum
# (position 0 is the most active user, assuming fsum was sorted in descending
# order by the previous cell) in the users collection, in batches.
BULK_BATCH_SIZE = 100000  # number of updates sent per bulk_write call


def _flush_user_updates(batch):
    """Sends a batch of UpdateOne operations to db.users; returns True on success."""
    try:
        results = db.users.bulk_write(batch)
    except Exception as e:
        print("Exception. Message:", e)
        return False
    print("M:", str(results.matched_count).rjust(8, " "),
          " I:", str(results.inserted_count).rjust(8, " "),
          " U:", str(results.upserted_count).rjust(8, " "))
    return True


operations = []
# enumerate() ties the position to the row of fsum, so it cannot drift when a
# batch write fails (the manual counter was not incremented in that case).
for pos, (user_id, interactions) in enumerate(fsum.items()):
    object_id = make_objid(user_id)
    if object_id is None:
        # Unusable id: an upsert on {'_id': None} would create a bogus user document
        continue

    operations.append(UpdateOne({'_id': object_id},
                                {'$set': { 'num_interactions': int(interactions), 'position_num_interactions': pos }},
                                upsert=True))

    # A failed batch is kept and retried on the next iteration
    if len(operations) >= BULK_BATCH_SIZE and _flush_user_updates(operations):
        operations = []

# Send whatever is left after the last full batch
if operations:
    _flush_user_updates(operations)

## Get Botscores

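Get users that still need a botscore: users without a `botscore` field, users whose previous request failed with "too many requests" (`botscore` = -2), and users whose `reason` is "unknown"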

In [13]:
%%time
# Users still needing a Botometer request (selection criteria live in get_users_for_botscore)
df_users = get_users_for_botscore(db.users)

Number of users without botscore in DB: 2407006
CPU times: user 31.8 s, sys: 3.18 s, total: 35 s
Wall time: 34 s


Order users by importance (number of interactions, descending)

In [14]:
# Users with the most interactions first, so that head(n) below picks the most active ones
df_users = df_users.sort_values(by='num_interactions', axis='index', ascending=False)

In [19]:
# Pending users with more than 5 interactions (count() reports non-null values per column)
df_users[df_users.num_interactions>5].count()

_id                          510490
num_interactions             510490
position_num_interactions    510490
dtype: int64

In [20]:
# Pending users with fewer than 4 interactions (count() reports non-null values per column)
df_users[df_users.num_interactions<4].count()

_id                          1692156
num_interactions             1692156
position_num_interactions    1692156
dtype: int64

In [21]:
# Pending users with fewer than 2 interactions (count() reports non-null values per column)
df_users[df_users.num_interactions<2].count()

_id                          1121041
num_interactions             1121041
position_num_interactions    1121041
dtype: int64

In [22]:
# Pending users with more than 1 interaction (count() reports non-null values per column)
df_users[df_users.num_interactions>1].count()

_id                          1285965
num_interactions             1285965
position_num_interactions    1285965
dtype: int64

Request botscores in order of importance, respecting the daily quota of each key

In [11]:
filename = "error.log"
daily_limit = 500  # maximum number of requests issued per Botometer key in this run

# os.path.join gives the same path as ROOT_DIR+filename when ROOT_DIR ends with a
# separator, and still works when it does not
error_log_path = os.path.join(ROOT_DIR, filename)
n_keys = len(keys)

# One worker thread per Botometer key; users are assigned to keys round-robin
with concurrent.futures.ThreadPoolExecutor(max_workers=n_keys) as executor:
    # Map each future to its user so an unexpected failure can still be attributed
    future_to_user = {
        executor.submit(get_botscore_by_userid, it, it % n_keys, str(user_id), db.users): user_id
        for it, user_id in enumerate(df_users.head(n_keys * daily_limit)['_id'])
    }

    for f in tqdm(concurrent.futures.as_completed(future_to_user), total=len(future_to_user), leave=True):
        try:
            outcome = f.result()
        except Exception as e:
            # get_botscore_by_userid can still raise (e.g. if its own error handling
            # fails); log it instead of aborting the whole run
            outcome = "Unhandled worker exception. User: " + str(future_to_user[f]) + "; Message: " + str(e) + "\n"

        if outcome is not True:
            # Re-opened per message so every error reaches disk even if the run is interrupted
            with open(error_log_path, 'a') as myfile:
                myfile.write(outcome)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

KeyboardInterrupt: 