## Configuration

### Imports

In [210]:
# Utilities
from IPython.display import display
import os
import numpy as np
import time
import pandas as pd

# MongoDB functionality
from pymongo import MongoClient, InsertOne, UpdateOne
from bson import ObjectId
from collections.abc import MutableMapping

# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`;
# the conventional target of that alias is `matplotlib.pyplot` -- confirm intent.
import matplotlib as plt

# Network analysis
import networkx as nx

# Project root; the working directory is switched to it below.
# Defaults to the original checkout location. Set BOTBUSTERS_ROOT_DIR to run
# the notebook from a different location without editing this cell.
ROOT_DIR = os.environ.get(
    "BOTBUSTERS_ROOT_DIR",
    "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis/",
)

# Change path to root
os.chdir(ROOT_DIR)

# MongoDB parameters (override with MONGO_HOST / MONGO_PORT; the defaults are
# the local instance on the standard port).
mongoclient = MongoClient(
    os.environ.get("MONGO_HOST", "localhost"),
    int(os.environ.get("MONGO_PORT", "27017")),
)
db = mongoclient.influence

### Support Functions

In [435]:
def flatten(d, parent_key='', sep='_'):
    """Collapses a nested mapping into a single-level dictionary

    Nested keys are concatenated with `sep`, e.g. {'a': {'b': 1}} becomes
    {'a_b': 1}. Nested mappings that are empty contribute no entry.

    Keyword arguments:
    d -- (possibly nested) mapping, e.g. a MongoDB result document
    parent_key -- prefix prepended to every key of `d` (empty for no prefix)
    sep -- separator placed between the prefix and each key
    """
    flat = {}
    for key, value in d.items():
        # Only prepend the prefix when there is one, so top-level keys are
        # kept exactly as they are.
        full_key = key if not parent_key else parent_key + sep + key
        if isinstance(value, MutableMapping):
            # Recurse, carrying the accumulated key as the new prefix.
            flat.update(flatten(value, full_key, sep=sep))
        else:
            flat[full_key] = value
    return flat

def make_objid(text):
    """Builds an ObjectId from a value left-padded with zeros to 24 characters

    Returns None when the value is blank or when ObjectId rejects the padded
    string (the value and the error are printed in that case).

    Keyword arguments:
    text -- value to be converted into an ObjectId (str() is applied first)
    """
    candidate = str(text)
    if candidate.strip() == "":
        return None
    padded = candidate.rjust(24, "0")
    try:
        object_id = ObjectId(padded)
    except Exception as ex:
        # Best effort: report the offending value and carry on.
        print(candidate, ex)
        return None
    return object_id

# Maximum number of documents fetched per query by get_tweets / get_retweets.
LIMIT = 50_000
    
def get_tweets(collection):
    """
    Gets tweets

    Fetches at most LIMIT documents, keeping only their 'tweet_id' and
    'user_id' fields, and returns them as a DataFrame in which 'user_id' has
    been converted to str and renamed to 'tweet_author'. An empty result
    yields an empty DataFrame with the columns 'tweet_id' and 'tweet_author'.

    collection - Tweets MongoDB collection
    """
    cursor = (collection
              .find({}, {'_id': False, 'tweet_id': True, 'user_id': True})
              .limit(LIMIT))
    tweets = [flatten(t) for t in cursor]
    print("Number of tweets:", len(tweets))

    if not tweets:
        # A frame built from no records has no 'user_id' column, so the
        # conversion below would fail; return a well-formed empty frame.
        return pd.DataFrame(columns=['tweet_id', 'tweet_author'])

    df_tweets = pd.DataFrame(tweets)
    # A single element-wise str() conversion of the author ids.
    df_tweets['user_id'] = df_tweets['user_id'].map(str)
    return df_tweets.rename(columns={'user_id': 'tweet_author'})
    
    
def get_retweets(collection):
    """
    Gets retweets

    Fetches at most LIMIT documents, keeping only their 'tweet_id' and
    'user_id' fields, and returns them as a DataFrame in which 'user_id' has
    been converted to str and renamed to 'retweet_author'. An empty result
    yields an empty DataFrame with the columns 'tweet_id' and 'retweet_author'.

    collection - Retweets MongoDB collection
    """
    cursor = (collection
              .find({}, {'_id': False, 'tweet_id': True, 'user_id': True})
              .limit(LIMIT))
    retweets = [flatten(r) for r in cursor]
    print("Number of retweets:", len(retweets))

    if not retweets:
        # A frame built from no records has no 'user_id' column, so the
        # conversion below would fail; return a well-formed empty frame.
        return pd.DataFrame(columns=['tweet_id', 'retweet_author'])

    df_retweets = pd.DataFrame(retweets)
    # A single element-wise str() conversion of the author ids.
    df_retweets['user_id'] = df_retweets['user_id'].map(str)
    return df_retweets.rename(columns={'user_id': 'retweet_author'})

def get_users(collection):
    """
    Gets users

    Fetches every document that has a 'botscore.cap.universal' field (no
    limit is applied), keeping only '_id' and that field, and returns them as
    a DataFrame with the columns '_id' (converted to str) and
    'botscore_cap_universal'. An empty result yields an empty DataFrame with
    those two columns.

    collection - Users MongoDB collection
    """
    cursor = collection.find({'botscore.cap.universal': {'$exists': True}},
                             {'_id': True, 'botscore.cap.universal': True})
    users = [flatten(u) for u in cursor]
    print("Number of users with botscore in DB:", len(users))

    if not users:
        # A frame built from no records has no '_id' column, so the
        # conversion below would fail; return a well-formed empty frame.
        return pd.DataFrame(columns=['_id', 'botscore_cap_universal'])

    df_users = pd.DataFrame(users)
    # A single element-wise str() conversion of the user ids.
    df_users['_id'] = df_users['_id'].map(str)
    return df_users

#### Tweets

### Network analysis

#### -- Tweets

In [191]:
# Load the (tweet id, author) pairs of the tweets and preview the first rows.
df_tweets = get_tweets(db["tweets"])
df_tweets.head(n=3)

Number of tweets: 50000


Unnamed: 0,tweet_author,tweet_id
0,276977398,1191462058602192907
1,2906096735,1191481739606183937
2,1620996282,1191488966639443968


In [192]:
# Count fully duplicated rows. display() is needed because only the last
# expression of a cell is rendered automatically, so the count would
# otherwise be computed and thrown away.
display(df_tweets[df_tweets.duplicated()].count())
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
tweet_author    50000 non-null object
tweet_id        50000 non-null object
dtypes: object(2)
memory usage: 781.4+ KB


In [193]:
# Deduplicate the tweets frame (this cell belongs to the Tweets section;
# df_retweets is only created in a later cell).
df_tweets.drop_duplicates(keep='first', inplace=True)

#### -- Retweets

In [194]:
# Load the (tweet id, retweeter) pairs and preview the first rows.
df_retweets = get_retweets(db["retweets"])
df_retweets.head(n=3)

Number of retweets: 50000


Unnamed: 0,tweet_id,retweet_author
0,1193667913368121351,4460826197
1,1192581634291355649,1075150211725619200
2,1185312751301906433,3251522811


In [195]:
# Number of rows that repeat an earlier (tweet_id, retweet_author) pair.
df_retweets.loc[df_retweets.duplicated()].count()

tweet_id          32
retweet_author    32
dtype: int64

In [196]:
# Drop repeated rows in place (the first occurrence is kept by default),
# then show how many rows remain.
df_retweets.drop_duplicates(inplace=True)
df_retweets.count()

tweet_id          49968
retweet_author    49968
dtype: int64

#### -- Users

In [436]:
# Load the users that have a bot score and preview the first five rows.
users = get_users(db["users"])
display(users.head())

Number of users with botscore in DB: 206032


Unnamed: 0,_id,botscore_cap_universal
0,2266588688,0.805409
1,471028961,0.708983
2,2792368467,0.465839
3,180918124,0.346921
4,1184444845047386112,0.52224


In [442]:
# Number of distinct user ids (compare with the row count printed above).
users["_id"].unique().size

206032

#### -- Build directed, weighted retweet graph

In [197]:
# Attach to every retweet the author of the tweet it refers to by joining
# both frames on tweet_id; the tweet_id index is discarded afterwards.
retweets_by_tweet_id = df_retweets.set_index('tweet_id')
tweets_by_tweet_id = df_tweets.set_index('tweet_id')
df_edges = retweets_by_tweet_id.join(tweets_by_tweet_id).reset_index(drop=True)
df_edges.head()

Unnamed: 0,retweet_author,tweet_author
0,932603728334589952,95487654
1,791835137306726400,95487654
2,84226971,95487654
3,776509892349857792,95487654
4,50324949,95487654


In [198]:
# Collapse repeated (retweeter, author) pairs into one row per pair, with the
# number of occurrences stored in the 'weight' column.
df_edges = (
    df_edges
    .groupby(['retweet_author', 'tweet_author'])
    .size()
    .reset_index(name='weight')
)
df_edges.head()

Unnamed: 0,retweet_author,tweet_author,weight
0,781280,2904896141,1
1,1357911,149991703,1
2,1387941,402593346,1
3,1387941,2904896141,1
4,1768911,17987604,1


In [199]:
# Pairs that occur more than once.
df_edges.loc[df_edges['weight'] > 1]

Unnamed: 0,retweet_author,tweet_author,weight
150,000000000000000014764527,000000000000002904896141,2
659,000000000000000040900410,000000000000000050982086,2
694,000000000000000042577457,000000000000000087659574,2
1003,000000000000000056842556,000001007282317151305728,2
1027,000000000000000057955465,000001007282317151305728,2
...,...,...,...
48071,000001155446895793364992,000000000000003152296384,2
48255,000001160580320695148544,000000000000000020509689,2
48773,000001171750432236101633,000001007282317151305728,2
49199,000001180651674811207680,000000000000000233928703,2


In [200]:
# Summary of the weighted edge list: row count, non-null counts and dtypes.
df_edges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49705 entries, 0 to 49704
Data columns (total 3 columns):
retweet_author    49705 non-null object
tweet_author      49705 non-null object
weight            49705 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [373]:
# Directed graph with one edge per (tweet_author -> retweet_author) pair,
# carrying the pair's 'weight' as an edge attribute.
G = nx.from_pandas_edgelist(
    df_edges,
    source='tweet_author',
    target='retweet_author',
    edge_attr='weight',
    create_using=nx.DiGraph,
)

In [429]:
# Give every node a default bot score of -1, then overwrite it with the
# values from `users` (keyed by user id) where they are provided.
nx.set_node_attributes(G, -1, 'botscore_cap_universal')
node_attr = users.set_index('_id').to_dict(orient='index')
nx.set_node_attributes(G, node_attr)

In [430]:
# Keep only the nodes whose bot score lies strictly between 0 and 1; this
# drops the nodes still carrying the -1 default.
nodes = [
    node
    for node, score in G.nodes(data="botscore_cap_universal")
    if 0 < score < 1
]

G = G.subgraph(nodes)

## Graph properties

### Network composition

#### Bot-infected network

In [431]:
# Header followed by the networkx summary of the graph, one item per line.
print("ALL DATABASE", "--------------", nx.info(G), sep="\n")

ALL DATABASE
--------------
Name: 
Type: DiGraph
Number of nodes: 4883
Number of edges: 6194
Average in degree:   1.2685
Average out degree:   1.2685


In [None]:
# Per-node degree and centrality figures for G, summarised with describe().
# Betweenness is estimated from a sample of nodes: the sample size is capped
# at the node count (k may not exceed it) and the sampling is seeded so that
# re-running the cell reproduces the same numbers.
betweenness = nx.betweenness_centrality(
    G, k=min(1000, G.number_of_nodes()), normalized=True, seed=42
)
# Building the frame from node-keyed dicts aligns the columns by node instead
# of relying on every measure iterating the nodes in the same order.
df_describe = pd.DataFrame({
    'out-degree': dict(G.out_degree()),
    'in-degree': dict(G.in_degree()),
    'betweenness': betweenness,
    'closeness': nx.closeness_centrality(G),
})
df_describe.describe()

#### Legitimate network

In [427]:
# Keep only the nodes whose bot score lies strictly between 0 and 0.8.
nodes = [
    node
    for node, score in G.nodes(data="botscore_cap_universal")
    if 0 < score < 0.8
]

G_without_bots = G.subgraph(nodes)

In [428]:
print("graph without bots")
print("--------------")
print(nx.info(G_without_bots))

# Per-node degree and centrality figures for G_without_bots, summarised with
# describe(). Betweenness is estimated from a sample of nodes: the sample size
# is capped at the node count (k may not exceed it) and the sampling is seeded
# so that re-running the cell reproduces the same numbers.
betweenness = nx.betweenness_centrality(
    G_without_bots,
    k=min(1000, G_without_bots.number_of_nodes()),
    normalized=True,
    seed=42,
)
# Building the frame from node-keyed dicts aligns the columns by node instead
# of relying on every measure iterating the nodes in the same order.
df_describe = pd.DataFrame({
    'out-degree': dict(G_without_bots.out_degree()),
    'in-degree': dict(G_without_bots.in_degree()),
    'betweenness': betweenness,
    'closeness': nx.closeness_centrality(G_without_bots),
})
df_describe.describe()

graph without bots
--------------
Name: 
Type: DiGraph
Number of nodes: 3906
Number of edges: 3423
Average in degree:   0.8763
Average out degree:   0.8763


Unnamed: 0,out-degree,in-degree,betweenness,closeness
count,3906.0,3906.0,3906.0,3906.0
mean,0.876344,0.876344,6.559476e-09,0.000236
std,9.65968,0.803206,1.802836e-07,0.000223
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.000256
75%,0.0,1.0,0.0,0.000256
max,226.0,9.0,9.992312e-06,0.002305
