## Configuration

### Imports

In [1]:
# Utilities
from IPython.display import display
from fastprogress import master_bar, progress_bar
from datetime import datetime
from unidecode import unidecode
import os
import ntpath
import numpy as np
import statistics 
import re
import math
import random
import datetime
import numbers
from collections.abc import MutableMapping
import pandas as pd
import time
from random import randint
import json

# Botometer API
import botometer

# MongoDB functionality
from pymongo import MongoClient, InsertOne, UpdateOne
from bson import ObjectId

# Concurrency
import concurrent.futures
from tqdm.notebook import tqdm

# Config management
from configparser import ConfigParser


# Project root directory. Relative paths (config file, botscores logs) and the
# ROOT_DIR-prefixed log files used further down are resolved against it.
# The default is the original author's checkout; set BOTBUSTERS_ROOT_DIR to run
# the notebook on another machine without editing this cell.
ROOT_DIR = os.environ.get(
    "BOTBUSTERS_ROOT_DIR",
    "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis/",
)
# Later cells build paths with ROOT_DIR + "name", so keep the trailing separator.
ROOT_DIR = os.path.join(ROOT_DIR, "")

# Change path to root
os.chdir(ROOT_DIR)

# MongoDB parameters
mongoclient = MongoClient('localhost', 27017)
db = mongoclient.influence

### Support Functions

In [2]:
def flatten(d, parent_key='', sep='_'):
    """Collapses a nested mapping into a single-level dictionary

    Keys of nested mappings are joined to their parent key with `sep`, so
    {'a': {'b': 1}} becomes {'a_b': 1}. A nested mapping that is empty
    contributes no key at all.

    Keyword arguments:
    d -- mapping to flatten (for instance a MongoDB document)
    parent_key -- prefix prepended to every key of `d`; empty means no prefix
    sep -- separator placed between a parent key and a child key
    """
    flat = {}
    for key, value in d.items():
        full_key = parent_key + sep + key if parent_key else key
        if isinstance(value, MutableMapping):
            # Recurse and merge the already-prefixed entries of the sub-mapping
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

def make_objid(text):
    """Builds an ObjectId from a value, left-padding it with zeros

    Keyword arguments:
    text -- value to convert; it is passed through str() and padded on the
            left with "0" up to 24 characters before being given to ObjectId

    Returns the ObjectId, or None when the text is blank or when ObjectId
    rejects the padded string (the text and the error are printed then).
    """
    candidate = str(text)
    if candidate.strip() == "":
        return None

    padded = candidate.rjust(24, "0")
    try:
        return ObjectId(padded)
    except Exception as ex:
        print(candidate, ex)
        return None


def get_tweets(collection):
    """
    Gets the author of every tweet

    collection - Tweets MongoDB collection

    Returns a DataFrame with one row per document, holding only the
    projected user_id field
    """
    projection = {'_id' : False, 'user_id' : True}
    documents = list(collection.find({}, projection))

    print("Number of tweets in DB:", len(documents))
    return pd.DataFrame([flatten(document) for document in documents])

def get_retweets(collection):
    """
    Gets the author of every retweet

    collection - Retweets MongoDB collection

    Returns a DataFrame with one row per document, holding only the
    projected user_id field
    """
    projection = {'_id': False, 'user_id' : True}
    documents = list(collection.find({}, projection))

    print("Number of retweets in DB:", len(documents))
    return pd.DataFrame([flatten(document) for document in documents])


def get_users(collection):
    """
    Gets the _id of every user

    collection - Users MongoDB collection

    Returns a DataFrame with one row per document and a single _id column.
    The query has no filter, so users with and without a botscore are both
    returned; the printed message says so (it used to claim the count was
    restricted to users without botscore).
    """
    users = list(collection.find({},
                                 {'_id' : True}))

    print("Number of users in DB:", len(users))
    users = [flatten(u) for u in users]
    df_users = pd.DataFrame(users)
    return df_users

## Get number of interactions by user

Get tweets and retweets

In [None]:
%%time
# Load the retweets into a DataFrame and preview the first rows.
df_retweets = get_retweets(db.retweets)
display(df_retweets.head(5))

In [7]:
%%time
# Load the tweets into a DataFrame and preview the first rows.
df_tweets = get_tweets(db.tweets)
display(df_tweets.head(5))

Number of tweets in DB: 1875457


Unnamed: 0,user_id
0,276977398
1,2906096735
2,1620996282
3,1163542858604916739
4,279465279


CPU times: user 8.57 s, sys: 96.1 ms, total: 8.66 s
Wall time: 9.91 s


In [8]:
%%time
# Count how many tweets and how many retweets each user authored, then add
# both counts per user. fill_value=0 keeps users that appear in only one of
# the two series instead of turning their total into NaN.
freq = df_tweets.user_id.value_counts()
freq2 = df_retweets.user_id.value_counts()
fsum = freq.add(freq2, fill_value=0)
print("Total users:",len(fsum))

Total users: 2802467
CPU times: user 3min 4s, sys: 1.66 s, total: 3min 6s
Wall time: 2min 50s


In [9]:
# Order users from most to fewest interactions; the position in this ordering
# is what the next cell stores as position_num_interactions.
fsum = fsum.sort_values(ascending=False)
fsum.head(10)

000000000000002479879152    15225.0
000000000000003064247597    11152.0
000000000000003387713985     9829.0
000001061332198266204164     9486.0
000000000000001723241016     8521.0
000000830507792998027266     8233.0
000000000000000216711237     8107.0
000000000000000128534745     7917.0
000000825432095577296896     7845.0
000000000000002420883178     7763.0
Name: user_id, dtype: float64

In [None]:
# Store, for every user, its total number of interactions and its rank in the
# descending ordering of `fsum` (0 = user with the most interactions).
# Updates are queued and sent to MongoDB in batches.
operations = []
batch_size = 100000  # queued updates sent per bulk_write call
last_pos = len(fsum) - 1

# enumerate() ties the stored rank to the position in `fsum`, so a failed
# batch or a skipped user cannot shift the ranks of the users that follow.
for pos, (user_id, interactions) in enumerate(fsum.items()):

    object_id = make_objid(user_id)
    if object_id is not None:
        operations.append(UpdateOne({'_id': object_id},
                                    {'$set': { 'num_interactions': int(interactions), 'position_num_interactions': pos }},
                                    upsert=True))
    # else: make_objid already printed the problem. Upserting on {'_id': None}
    # would create one junk document shared by every invalid id, so skip it.

    # Flush when the batch is full, and once more for the final partial batch.
    if operations and (len(operations) >= batch_size or pos == last_pos):
        try:
            results = db.users.bulk_write(operations)
            print("M:", str(results.matched_count).rjust(8, " "),
                  " I:", str(results.inserted_count).rjust(8, " "),
                  " U:", str(results.upserted_count).rjust(8, " "))
        except Exception as e:
            # Best effort: report the failed batch and carry on.
            print("Exception. Message:", e)
        finally:
            # Always start a fresh batch. Keeping a failed one would make every
            # following iteration retry the same, ever-growing list.
            operations = []

## Sampling of 20%

In [8]:
def get_retweets(collection):
    """
    Gets the id, retweeted tweet and author of every retweet

    collection - Retweets MongoDB collection

    Returns a DataFrame where user_id is exposed as retweet_author and
    _id as retweet_mongo_id; tweet_id keeps its name
    """
    projection = {'_id': True, 'tweet_id' : True, 'user_id' : True}
    documents = list(collection.find({}, projection))

    print("Number of retweets in DB:", len(documents))
    frame = pd.DataFrame([flatten(document) for document in documents])
    return frame.rename(columns={'user_id':'retweet_author','_id':'retweet_mongo_id'})

def get_tweets(collection):
    """
    Gets the id, tweet id and author of every tweet

    collection - Tweets MongoDB collection

    Returns a DataFrame where user_id is exposed as tweet_author and
    _id as tweet_mongo_id; tweet_id keeps its name
    """
    projection = {'_id' : True, 'user_id' : True, 'tweet_id' : True}
    documents = list(collection.find({}, projection))

    print("Number of tweets in DB:", len(documents))
    frame = pd.DataFrame([flatten(document) for document in documents])
    return frame.rename(columns={'user_id':'tweet_author', '_id':'tweet_mongo_id'})

def get_users(collection):
    """
    Gets the users that already have a universal CAP botscore

    collection - Users MongoDB collection

    Returns a DataFrame with the _id of each matching user and the
    flattened botscore.cap.universal value
    """
    has_score = { 'botscore.cap.universal' : { '$exists' : True }}
    projection = {'_id' : True, 'botscore.cap.universal' : True}
    documents = list(collection.find(has_score, projection))

    print("Number of users with botscore in DB:", len(documents))
    return pd.DataFrame([flatten(document) for document in documents])

In [9]:
%%time
# Reload the retweets with the columns needed for sampling (ids and authors).
df_retweets = get_retweets(db.retweets)
display(df_retweets.head(5))

Number of retweets in DB: 39344305


Unnamed: 0,retweet_mongo_id,tweet_id,retweet_author
0,1193668589628383234,1193667913368121351,4460826197
1,1192582587753140230,1192581634291355649,1075150211725619200
2,1185317085502738433,1185312751301906433,3251522811
3,1190660503070236672,1190659978471911424,904385876
4,1190660746096644099,1190659978471911424,952575694021758976


CPU times: user 8min 48s, sys: 34.3 s, total: 9min 22s
Wall time: 9min 46s


In [10]:
%%time
# Reload the tweets with the columns needed for sampling (ids and authors).
df_tweets = get_tweets(db.tweets)
display(df_tweets.head(5))

Number of tweets in DB: 1875457


Unnamed: 0,tweet_mongo_id,tweet_author,tweet_id
0,a86e778c-46a3-11ea-9505-02420a0000af,276977398,1191462058602192907
1,44a195f2-468c-11ea-9505-02420a0000af,2906096735,1191481739606183937
2,44a1996c-468c-11ea-9505-02420a0000af,1620996282,1191488966639443968
3,75dba318-46a7-11ea-9505-02420a0000af,1163542858604916739,1191464452987772928
4,77c4e81a-46a7-11ea-9505-02420a0000af,279465279,1191464087064109058


CPU times: user 19.8 s, sys: 398 ms, total: 20.2 s
Wall time: 21.6 s


In [11]:
%%time
# Load the users returned by get_users and preview them.
# NOTE(review): this relies on the get_users defined in this section (users
# with a botscore), not the earlier definition of the same name.
users = get_users(db.users)
display(users.head(5))

In [16]:
# Distinct ids of the users loaded above, kept as a set for fast membership tests.
set_calculated_users = set(users._id)

#### Tag sampled data

In [82]:
# Draw 20% of the retweets at random; the fixed random_state makes the sample
# reproducible across runs.
df_sample = df_retweets.sample(frac=0.2, random_state=30100)
print("Number of interactions in the sample:", len(df_sample))

Number of interactions in the sample: 7868861


In [83]:
# Attach to each sampled retweet the tweet it refers to (left join on tweet_id),
# then drop tweet_id and renumber the rows from 0.
# NOTE(review): retweets whose tweet_id is absent from df_tweets get NaN in
# tweet_mongo_id / tweet_author — confirm that later cells expect this.
df_sample = df_sample.set_index('tweet_id').join(df_tweets.set_index('tweet_id')).reset_index(drop=True)
df_sample.head(5)

Unnamed: 0,retweet_mongo_id,retweet_author,tweet_mongo_id,tweet_author
0,1188606157725847552,722509814165827585,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
1,1191543404716580864,595890179,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
2,1174059596966748160,468655723,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
3,1174076516826865664,8576412,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
4,1227330608566820864,232342510,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712


In [84]:
# Every user taking part in the sample, either as retweet author or as author
# of a retweeted tweet.
set_users = set(df_sample.retweet_author.unique()) 
set_users = set_users.union(set(df_sample.tweet_author.unique()))
print("Number of users involved in sample:", len(set_users))

Number of users involved in sample: 1297975


In [85]:
# Preview the joined sample before tagging it in MongoDB.
df_sample.head()

Unnamed: 0,retweet_mongo_id,retweet_author,tweet_mongo_id,tweet_author
0,1188606157725847552,722509814165827585,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
1,1191543404716580864,595890179,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
2,1174059596966748160,468655723,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
3,1174076516826865664,8576412,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712
4,1227330608566820864,232342510,c75e7c0a-4671-11ea-a6d9-02420a000681,68740712


In [87]:
%%time 

collections = {
    'retweet_mongo_id' : db.retweets,
    'retweet_author' : db.users,
    'tweet_mongo_id' : db.tweets,
    'tweet_author': db.users
}

for column in ['retweet_mongo_id','retweet_author','tweet_mongo_id','tweet_author']:
    
    num_entries = len(df_sample[column])
    operations = []
    collection = collections[column]
    print(column)
    
    for row in range(0,num_entries):
        _id = df_sample[column][row]
        try:
            operations.append(UpdateOne({'_id': _id},
                                        {'$set': { 'sampled': True}},
                                        upsert=True))
            
            if len(operations) >= 1000000:
                results = collection.bulk_write(operations)
                print("M:", str(results.matched_count).rjust(8, " "),
                      " I:", str(results.inserted_count).rjust(8, " "),
                      " U:", str(results.upserted_count).rjust(8, " "))
                operations = []
                
        except Exception as e:
            print("Exception. Message:", e)

    if len(operations) > 0: 
        try: 
            results = collection.bulk_write(operations)
            print("M:", str(results.matched_count).rjust(8, " "),
                  " I:", str(results.inserted_count).rjust(8, " "),
                  " U:", str(results.upserted_count).rjust(8, " "))
        except Exception as e:
            print("Exception. Message:", e)


retweet_mongo_id
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:   868861  I:        0  U:        0
retweet_author
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:   868861  I:        0  U:        0
tweet_mongo_id
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:  1000000  I:        0  U:        0
M:   868861  I:        0  U:        0
tweet_author
M:  1000000  I:        0  U:

## Get Botscores

In [59]:
def get_users_for_botscore(collection):
    """
    Gets the sampled users that still have no botscore

    collection - Users MongoDB collection

    Returns a DataFrame with the _id of each matching user and, when the
    document has it, its num_interactions
    """
    pending = { '$and' : [ { 'sampled': True } , { 'botscore' : { '$exists' : False } }]}
    projection = {'_id' : True, 'num_interactions' : True}
    documents = list(collection.find(pending, projection))

    print("Number of sampled users without botscore in DB:", len(documents))
    return pd.DataFrame([flatten(document) for document in documents])

Get users without botscore

In [9]:
%%time
config_object = ConfigParser()
config_object.read("config.init")
botometer_instance = botometer.BotometerLite(wait_on_ratelimit=True,
                          rapidapi_key=config_object.get('key','rapidapi_key'),
                          **{'consumer_key':config_object.get('key','consumer_key'),
                         'consumer_secret':config_object.get('key','consumer_secret')})

CPU times: user 19.2 ms, sys: 0 ns, total: 19.2 ms
Wall time: 402 ms


Order by importance

Request botscores in order of importance (users with the most interactions first) while respecting the daily quota

In [72]:
# Maximum number of API requests sent in this run; every request carries up to
# `batch_size` accounts. CHECK this value against the remaining daily quota
# before running the cell.
daily_limit = 200
batch_size = 100

today = pd.to_datetime('today').strftime('%Y-%m-%d')
df_users = get_users_for_botscore(db.users)
# Most active users first. Skip the sort when nothing is pending or no document
# carries num_interactions, since sort_values would raise KeyError.
if 'num_interactions' in df_users.columns:
    df_users = df_users.sort_values(by='num_interactions', axis='index', ascending=False)

for request in progress_bar(range(0, daily_limit)):

    start = request*batch_size
    batch = df_users[start:start+batch_size]
    if batch.empty:
        # Fewer pending users than the quota allows: do not send empty requests.
        print("No more users pending a botscore.")
        break

    print("Request "+str(request)+" ...")

    # _id values print as zero-padded digit strings, so int() yields the user id
    user_list = batch['_id'].astype(str).astype(int).tolist()

    try:
        results = botometer_instance.check_accounts_from_user_ids(user_list)

        print("Results received: ", len(results))

        # Keep the raw API answers, one JSON document per line
        with open(ROOT_DIR+f"botscores/results-{today}.log", 'a') as output_file:
            for res in results:
                json.dump(res, output_file)
                output_file.write("\n")

        # Requested accounts that the API did not return are flagged as not
        # found, so they stop being selected as pending.
        returned_ids = {result_user.get("user_id") for result_user in results}
        for requested_user in user_list:
            if requested_user not in returned_ids:
                db.users.update_one({'_id': make_objid(str(requested_user))},
                                    {'$set': { 'botscore' : -3, 'reason':'not found'}},
                                    upsert=True)

        for result in results:
            user_id = make_objid(str(result.get('user_id')))
            if user_id is None:
                # make_objid already printed the problem; an upsert on
                # {'_id': None} would only create a junk document.
                continue

            botscore = result.get('botscore', False)

            if botscore is False:
                # No score in the answer: store the sentinel and the API error
                error = result.get('error')
                db.users.update_one({'_id': user_id},
                                    {'$set': { 'botscore' : -3, 'reason': str(error)}},
                                    upsert=True)
            else:
                db.users.update_one({'_id': user_id},
                                    {'$set': { 'botscore' : botscore},
                                     '$unset': {'reason':''}},
                                    upsert=True)

    except Exception as e:
        # Record which request failed, then stop the run by re-raising
        error = str(request) + " request -> Exception; Message: " + str(e) + "\n"
        with open(ROOT_DIR+"error.log", 'a') as myfile:
            myfile.write(error)
        raise

    # Pause between consecutive requests
    time.sleep(2)

# Reload to print how many sampled users are still pending
df_users = get_users_for_botscore(db.users)

Number of sampled users without botscore in DB: 1039661


Request 0 ...
Results received:  100
Request 1 ...
Results received:  100
Request 2 ...
Results received:  99
Request 3 ...
Results received:  100
Request 4 ...
Results received:  100
Request 5 ...
Results received:  98
Request 6 ...
Results received:  99
Request 7 ...
Results received:  100
Request 8 ...
Results received:  100
Request 9 ...
Results received:  100
Request 10 ...
Results received:  100
Request 11 ...
Results received:  99
Request 12 ...
Results received:  99
Request 13 ...
Results received:  100
Request 14 ...
Results received:  100
Request 15 ...
Results received:  99
Request 16 ...
Results received:  99
Request 17 ...
Results received:  100
Request 18 ...
Results received:  100
Request 19 ...
Results received:  100
Request 20 ...
Results received:  100
Request 21 ...
Results received:  100
Request 22 ...
Results received:  99
Request 23 ...
Results received:  100
Request 24 ...
Results received:  100
Request 25 ...
Results received:  100
Request 26 ...
Results receive

## Checks

In [60]:
# Re-count the sampled users that still have no botscore.
df_users = get_users_for_botscore(db.users)

Number of sampled users without botscore in DB: 1159461
1159461


In [None]:
%%time 

today = str(pd.to_datetime(pd.to_datetime('today').strftime(format='%Y-%m-%d')))[0:10]

with open(f"botscores/results-{today}.log") as f:
    for line in f:
        result = json.loads(line) 
        botscore, user_id = result.get('botscore'), result.get('user_id')
        print(line, "jj")

In [None]:
# Parse a JSON-lines file into a list with one decoded object per line.
data = []
with open('file') as source:
    data.extend(json.loads(line) for line in source)