## Configuration

### Imports

In [1]:
# Utilities
from IPython.display import display
import os
import numpy as np
import time
import pandas as pd

# MongoDB functionality
from pymongo import MongoClient, InsertOne, UpdateOne
from bson import ObjectId
from collections.abc import MutableMapping

# Graphics
import matplotlib.pyplot as plt
from matplotlib.pyplot import text
import seaborn as sns
sns.set(font_scale=0.9)
sns.set_style("whitegrid")
sns.set_style({'font.family':'monospace'})
from mpl_toolkits.axes_grid1.inset_locator import inset_axes


# Network analysis
import networkx as nx

# Project root. Defaults to the original checkout location; set the
# BOTBUSTERS_ROOT_DIR environment variable to run the notebook from another
# machine without editing this cell.
ROOT_DIR = os.environ.get(
    "BOTBUSTERS_ROOT_DIR",
    "/home/mattia/javier/botbusters-spanish-general-elections-network-analysis/",
)

# Make the project root the working directory
os.chdir(ROOT_DIR)

# MongoDB parameters: local server, default port, `influence` database
mongoclient = MongoClient('localhost', 27017)
db = mongoclient.influence

### Support Functions

In [74]:
def flatten(d, parent_key='', sep='_'):
    """Collapses a nested mapping into a single-level dictionary

    Keys of nested mappings are prefixed with the key of their parent,
    e.g. {'a': {'b': 1}} becomes {'a_b': 1}.

    Keyword arguments:
    d -- mapping to flatten (e.g. a MongoDB document)
    parent_key -- prefix prepended to every key of d; empty at the top level
    sep -- separator placed between a parent key and a child key

    Returns a new dict. An empty nested mapping contributes no entries.
    """
    flat = {}
    for key, value in d.items():
        full_key = key if not parent_key else parent_key + sep + key
        if not isinstance(value, MutableMapping):
            flat[full_key] = value
            continue
        # nested mapping: recurse with the accumulated key as the new prefix
        flat.update(flatten(value, full_key, sep=sep))
    return flat

def make_objid(text):
    """Builds an ObjectId from a value

    The value is converted to a string and left-padded with zeros to
    24 characters before being handed to ObjectId.

    Keyword arguments:
    text -- value to be converted into Object ID

    Returns the ObjectId, or None when the value is blank or when the
    ObjectId cannot be built (the value and the error are printed).
    """
    raw = str(text)
    if raw.strip() == "":
        return None

    try:
        object_id = ObjectId(raw.rjust(24, "0"))
    except Exception as ex:
        # best effort: report the offending value and carry on
        print(raw, ex)
        return None
    return object_id

# Maximum number of documents fetched by get_retweets()
LIMIT = 1_000_000
    
def get_tweets(collection):
    """
    Loads the sampled tweets into a DataFrame

    collection - Tweets MongoDB collection

    Only documents with sampled == True are read, projected onto
    _id, tweet_id and user_id. Each document is flattened; user_id is cast
    to str and renamed to tweet_author.
    """
    query = {'sampled': True}
    projection = {'_id': True, 'tweet_id': True, 'user_id': True}
    documents = list(collection.find(query, projection))

    print("Number of tweets:", len(documents))

    records = [flatten(document) for document in documents]
    return (
        pd.DataFrame(records)
        # author ids are handled as strings throughout the notebook
        .assign(user_id=lambda frame: frame.user_id.astype(str))
        .rename(columns={'user_id': 'tweet_author'})
    )
    
    
def get_retweets(collection):
    """
    Gets retweets

    collection - Retweets MongoDB collection

    Reads at most LIMIT documents with sampled == True, projected onto
    tweet_id, user_id and date. Each document is flattened; user_id is cast
    to str and renamed to retweet_author, and date is parsed to datetimes.

    NOTE(review): no sort is applied before the limit, so which documents
    are returned when the collection holds more than LIMIT is up to the
    server - confirm this is acceptable.
    """
    cursor = collection.find(
        {'sampled': True},
        {'_id': False, 'tweet_id': True, 'user_id': True, 'date': True},
    ).limit(LIMIT)
    retweets = [flatten(r) for r in cursor]

    print("Number of retweets:", len(retweets))

    df_retweets = pd.DataFrame(retweets)
    # author ids are handled as strings throughout the notebook
    df_retweets['user_id'] = df_retweets['user_id'].astype(str)
    # pd.to_datetime instead of astype("datetime64"): the unit-less dtype
    # string is rejected with a TypeError by pandas >= 2.0
    df_retweets['date'] = pd.to_datetime(df_retweets['date'])
    return df_retweets.rename(columns={'user_id': 'retweet_author'})

def get_users(collection):
    """
    Gets sampled users that have a botscore

    collection - Users MongoDB collection

    Reads every document with sampled == True and botscore >= 0 (no limit),
    projected onto _id and botscore. Each document is flattened and _id is
    cast to str so that it can be matched against the string author ids of
    tweets and retweets.
    """
    users = list(collection.find(
        {'$and': [{'sampled': True}, {'botscore': {'$gte': 0}}]},
        {'_id': True, 'botscore': True},
    ))
    print("Number of sampled users with botscore in DB:", len(users))

    df_users = pd.DataFrame([flatten(u) for u in users])
    df_users['_id'] = df_users['_id'].astype(str)
    return df_users

def get_hashtags(collection):
    """
    Loads the hashtag documents into a DataFrame

    collection - Hashtags MongoDB collection

    Every document is read (no filter, no limit), projected onto
    tweet_id and hashtag. Returns one row per document.
    """
    projection = {'_id': False, 'tweet_id': True, 'hashtag': True}
    records = [document for document in collection.find({}, projection)]
    print("Number of hashtags in DB:", len(records))
    return pd.DataFrame(records)

## Data preparation

#### Getting users from DB...

In [3]:
# Sampled users with a botscore; `_id` is a string
users = get_users(db.users)
display(users.head())

Number of sampled users with botscore in DB: 477982


Unnamed: 0,_id,botscore
0,2266588688,0.16
1,471028961,0.03
2,2792368467,0.11
3,4068857357,0.01
4,6953012,0.03


#### Getting tweets from DB...

In [4]:
# Sampled tweets: one row per tweet with its author id
df_tweets = get_tweets(db.tweets)
df_tweets.head(n=3)

Number of tweets: 358190


Unnamed: 0,_id,tweet_author,tweet_id
0,77c4e81a-46a7-11ea-9505-02420a0000af,279465279,1191464087064109058
1,f4c064c6-4c4b-11ea-954b-02420a0000f2,1266667238,1192787272975945729
2,b84962c0-468a-11ea-9505-02420a0000af,331220696,1191467194519080960


In [5]:
# Keep only tweets written by users that have a botscore. The explicit copy
# detaches the result from the unfiltered frame, so that later in-place
# operations on `df_tweets` do not raise SettingWithCopyWarning.
df_tweets = df_tweets[df_tweets.tweet_author.isin(users._id)].copy()

In [6]:
# Per-column count of fully duplicated rows. It has to be displayed explicitly:
# only the last expression of a cell is rendered automatically.
display(df_tweets[df_tweets.duplicated()].count())
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 296557 entries, 0 to 358189
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   _id           296557 non-null  object
 1   tweet_author  296557 non-null  object
 2   tweet_id      296557 non-null  object
dtypes: object(3)
memory usage: 9.1+ MB


In [7]:
# Rebind instead of `inplace=True`: `df_tweets` is a filtered slice, and
# mutating it in place can raise SettingWithCopyWarning.
df_tweets = df_tweets.drop_duplicates(keep='first')

#### Getting retweets from DB...

In [75]:
# Sampled retweets (capped at LIMIT rows): retweeted tweet id, retweeter id and date
df_retweets = get_retweets(db.retweets)
df_retweets.head(n=3)

Number of retweets: 1000000


Unnamed: 0,tweet_id,retweet_author,date
0,1193667913368121351,4460826197,2019-11-10 23:15:43
1,1192581634291355649,1075150211725619200,2019-11-07 23:20:20
2,1190659978471911424,71190801,2019-11-02 16:54:42


In [76]:
# Keep only retweets made by users that have a botscore. The explicit copy
# detaches the result from the unfiltered frame, so that columns can be added
# to `df_retweets` later without raising SettingWithCopyWarning.
df_retweets = df_retweets[df_retweets.retweet_author.isin(users._id)].copy()

In [77]:
# Per-column count of fully duplicated retweet rows
df_retweets.loc[df_retweets.duplicated()].count()

tweet_id          0
retweet_author    0
date              0
dtype: int64

In [78]:
# Rebind instead of `inplace=True`: `df_retweets` is a filtered slice, and
# mutating it in place can raise SettingWithCopyWarning.
df_retweets = df_retweets.drop_duplicates(keep='first')
df_retweets.count()

tweet_id          698359
retweet_author    698359
date              698359
dtype: int64

## 1. Hashtag co-occurrence networks

#### Getting hashtags from DB...

In [12]:
# One row per (tweet, hashtag) occurrence
hashtags = get_hashtags(db.hashtags)
display(hashtags.head())

Number of hashtags in DB: 2603272


Unnamed: 0,tweet_id,hashtag
0,f7c581e4-4681-11ea-a6d9-02420a000681,LaContracampaña
1,f7c581e4-4681-11ea-a6d9-02420a000681,10N
2,f7c581e4-4681-11ea-a6d9-02420a000681,PoderMigrante
3,c677460e-468b-11ea-9505-02420a0000af,España
4,c677460e-468b-11ea-9505-02420a0000af,AhoraSí


In [13]:
# Keep only hashtags of the retained tweets. `hashtags.tweet_id` is matched
# against the `_id` column of `df_tweets`, not against its `tweet_id` column.
hashtags = hashtags.loc[hashtags['tweet_id'].isin(df_tweets['_id'])]
print(len(hashtags))

421749


#### Join hashtags with author

In [14]:
%%time
df_hashtags = hashtags.set_index('tweet_id').join(df_tweets.set_index('_id'))

CPU times: user 4min 37s, sys: 260 ms, total: 4min 37s
Wall time: 4min 37s


In [15]:
# Random spot check of the joined rows
df_hashtags.sample(n=5)

Unnamed: 0,hashtag,tweet_author,tweet_id
6859223e-46ec-11ea-a6d9-02420a000681,LaMatanza,121857577,1184199009084293122
a29462d0-468b-11ea-9505-02420a0000af,10N,763077164514349057,1185652743920410624
4d97ad1e-46db-11ea-9505-02420a0000af,PorTodoLoQueNosUne,1010973606589992967,1192908777261023232
366e1ece-46cd-11ea-9505-02420a0000af,ChileDesperto,97572937,1185676994538852353
f884a4e0-467a-11ea-9505-02420a0000af,ChicasVox,739752348436828161,1191481571842363393


#### Join hashtags with botscore

In [16]:
# Lookup table: user id (str) -> botscore
users_botscore = dict(zip(users['_id'], users['botscore']))

# Botscore of the author of the tweet each hashtag appears in
df_hashtags = df_hashtags.assign(
    botscore=df_hashtags['tweet_author'].map(users_botscore)
)
df_hashtags.sample(n=5)

Unnamed: 0,hashtag,tweet_author,tweet_id,botscore
8aec7cf0-4689-11ea-9505-02420a0000af,debatea5RTVE,720496988,1191483702582353921,0.02
27325c04-46c3-11ea-9505-02420a0000af,EvasionMasivaTodoElDia,1974003090,1185450345574649857,0.06
51ec2b24-4712-11ea-9505-02420a0000af,unboxingfranco,1288918003,1187301962674593792,0.03
1d48f8cc-46a2-11ea-9505-02420a0000af,DebateElectoral,2878658775,1191508032334782465,0.29
ad2257f8-4685-11ea-9505-02420a0000af,Elecciones10N,2462004186,1193491082635546624,0.16


In [35]:
# Discard the join key used as index and fall back to a plain 0..n-1 index
df_hashtags = df_hashtags.reset_index(drop=True)
df_hashtags.head()

Unnamed: 0,hashtag,tweet_author,tweet_id,botscore
0,10N,463236045,1192571547749367808,0.1
1,YoVoto,463236045,1192571547749367808,0.1
2,10N,253018975,1192713518421815296,0.06
3,10N,299235535,1192734686839222273,0.0
4,Vota,299235535,1192734686839222273,0.0


#### Building one hashtag co-occurrence graph per network: 70%, 90%, 100%

In [71]:
# Botscore thresholds delimiting the networks. With interpolation='lower' each
# threshold is a botscore actually observed in `users`.
# NOTE(review): NumPy 1.22 renamed the `interpolation` keyword to `method`;
# switch once the environment's NumPy version allows it.
prs = [70,90]
ps = np.percentile(a=users.botscore, q=prs, interpolation='lower')

percentile = {70: ps[0],
              90: ps[1],
              100: 1.01}   # meant to lie above every botscore; assumes scores are in [0, 1] - TODO confirm

df_concurrence = df_hashtags

# One graph per network, keyed by percentage, so that earlier graphs are not
# lost when `G` is rebound on the next iteration.
hashtag_graphs = {}

# filter tweets per author's botscore
# NOTE(review): rows below the current threshold are removed at the end of each
# iteration, so the networks are disjoint bands (below the 70th, between the
# 70th and 90th, from the 90th up), not cumulative ones - confirm this is intended.
for percentage in [70, 90, 100]:
    print("Network " + str(percentage) +"%")

    df_situation = df_concurrence[df_concurrence.botscore < percentile[percentage]]

    # Count pairs of hashtags appearing in the same tweet. Each pair is stored
    # in a canonical order: the graph is undirected, and if (a, b) and (b, a)
    # were kept as separate rows the second edge would overwrite the weight of
    # the first instead of adding to it.
    pair_counts = {}
    for _, tweet_hashtags in df_situation.groupby('tweet_id')['hashtag']:
        tags = tweet_hashtags.tolist()
        for ihast, first in enumerate(tags):
            for second in tags[ihast + 1:]:
                # key=str keeps the ordering well defined for non-string values
                pair = tuple(sorted((first, second), key=str))
                pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # edges from pair counts: weight = number of tweets' hashtag pairs observed
    df_edges = pd.DataFrame(
        [(hashtag1, hashtag2, weight) for (hashtag1, hashtag2), weight in pair_counts.items()],
        columns=['hashtag1', 'hashtag2', 'weight'],
    )

    G = nx.from_pandas_edgelist(df=df_edges, source='hashtag1', target='hashtag2', edge_attr='weight', create_using=nx.Graph)
    hashtag_graphs[percentage] = G
    #nx.write_graphml(G, f"graphs/3.hashtag-networks-{percentage}.graphml")

    df_concurrence = df_concurrence[df_concurrence.botscore >= percentile[percentage]]

    print("DONE!")

Network 70%
DONE!
Network 90%
DONE!
Network 100%
DONE!


## 2. Structural virality

#### Getting retweets + tweets, date and botscores

In [80]:
# Attach the retweeter's botscore. `assign` returns a new frame instead of
# writing into the filtered slice, which avoids SettingWithCopyWarning.
df_retweets = df_retweets.assign(botscore=df_retweets.retweet_author.map(users_botscore))
df_retweets.head(10)

Unnamed: 0,tweet_id,retweet_author,date,botscore
1,1192581634291355649,1075150211725619200,2019-11-07 23:20:20,0.09
3,1190659978471911424,2330405348,2019-11-04 20:59:19,0.19
5,1187536717663875074,1158948082157084674,2019-10-25 12:01:31,0.19
6,1184920265953808385,1259978400,2019-10-18 08:27:25,0.2
7,1191470697329102854,542905886,2019-11-04 21:42:20,0.06
8,1193641870984957953,480847309,2019-11-10 21:30:05,0.19
9,1193641870984957953,462360790,2019-11-10 21:31:27,0.03
11,1193641870984957953,895672877483151361,2019-11-10 22:36:05,0.02
12,1193641870984957953,856989403,2019-11-10 22:44:42,0.16
13,1193641870984957953,981470738,2019-11-12 02:44:40,0.21


In [81]:
## join each retweet with original tweet's botscore
# None of the cells above adds a `botscore` column to `df_tweets`, so derive it
# here from the author lookup instead of relying on leftover kernel state.
tweet_botscores = (
    df_tweets
    .assign(tweet_botscore=df_tweets.tweet_author.map(users_botscore))
    .set_index('tweet_id')['tweet_botscore']
)

# Store the result: the virality analysis reads `retweet_botscore` and
# `tweet_botscore` from `df_retweets`. The original `botscore` column is kept.
df_retweets = (
    df_retweets
    .drop(columns=['tweet_botscore'], errors='ignore')   # makes the cell safe to re-run
    .assign(retweet_botscore=lambda frame: frame['botscore'])
    .join(tweet_botscores, on='tweet_id')
)
df_retweets.drop(columns=['botscore']).head()

Unnamed: 0,tweet_id,retweet_author,date,retweet_botscore,tweet_botscore
1,1192581634291355649,1075150211725619200,2019-11-07 23:20:20,0.09,0.16
3,1190659978471911424,2330405348,2019-11-04 20:59:19,0.19,0.12
5,1187536717663875074,1158948082157084674,2019-10-25 12:01:31,0.19,0.14
6,1184920265953808385,1259978400,2019-10-18 08:27:25,0.2,0.14
7,1191470697329102854,542905886,2019-11-04 21:42:20,0.06,0.19


### 2.1. Retweet network cascade with timestamps

#### Three situations to compare. We sort users in ascending order of botscore:
##### 1. users < 70th percentile (legitimate network)
##### 2. users < 90th percentile (shady network)
##### 3. all network (bot-interfered network)

In [None]:
# Botscore thresholds delimiting the networks (see the three situations above).
# NOTE(review): NumPy 1.22 renamed the `interpolation` keyword to `method`;
# switch once the environment's NumPy version allows it.
prs = [70,90]
ps = np.percentile(a=users.botscore, q=prs, interpolation='lower')

percentile = {70: ps[0],
              90: ps[1],
              100: 1.01}   # meant to lie above every botscore; assumes scores are in [0, 1] - TODO confirm

# Expects `df_retweets` to carry `tweet_botscore` and `retweet_botscore` columns
df_virality = df_retweets


# filter tweets per author's botscore
for percentage_tweet in [70, 90, 100]:
    print("Content generated until " + str(percentage_tweet) +"% of the network")

    # retweets of content whose original author is below the tweet threshold
    df_situation = df_virality[df_virality.tweet_botscore < percentile[percentage_tweet]]

    #### calculate virality
    for percentage_retweet in [70, 90, 100]:
        print("Content retweeted until " + str(percentage_retweet) +"% of the network")

        # Threshold of the *retweet* loop (the tweet-loop variable was used here before)
        # NOTE(review): this filters `df_virality`, not `df_situation`, so the
        # tweet-author filter above is not applied to it - confirm which is intended.
        df_situation2 = df_virality[df_virality.retweet_botscore < percentile[percentage_retweet]]

        # TODO: virality computation not implemented yet; stop after the first combination
        break

    #df_virality = df_virality[df_virality.tweet_botscore >= percentile[percentage_tweet]]

    print("DONE!")
    # TODO: remove once the virality computation is in place
    break

## 3. Retweet cascades