In [7]:
from pymongo import MongoClient
from pprint import pprint
import igraph
import numpy as np
import scipy
import scipy.sparse as sp
from scipy.sparse.linalg import eigsh, eigs
from scipy.sparse import csgraph

import pandas as pd
import sys
import pymongo

try:
    client = MongoClient('localhost', 27017) # host, port
except ConnectionFailure as e:
    sys.stderr.write("Could not connect to MongoDB: %s" % e)
    sys.exit(1)

In [9]:
# get database with all tweets
#db = client.tweets_database


#db.filtered_tweets.drop()

# new database with cleaned tweets (without Retweets, in english)
english_tweets = client.tweets_database.english_tweets

client.tweets_database.collection_names()

['raw_tweets',
 'filtered_tweets_noRetweets_english',
 'english_tweets',
 'english_noRetweet_tweets',
 'users',
 'manually_labelled_tweets',
 'filtered_noRetweets_english_onlyPersonal']

In [4]:
#client.tweet_database.stats()

In [10]:

#pymongo.version
#client.tweets_database.english_tweets.create_index( [{ 'user.screen_name': 1 , 
#                                                               'retweeted_status.user.screen_name': 1,
#                                                               'number_of_weeks': 1}] )
client.tweets_database.english_tweets.create_index([('user.screen_name', 1), 
                                                             ('retweeted_status.user.screen_name', 1),
                                                             ('number_of_weeks', 1)], 
                                                              name='userName_retweetName_nWeek')


'userName_retweetName_nWeek'

In [11]:
users = set().union( [v['_id'] for v in english_tweets.aggregate( [ 
                                            {'$match': {'number_of_weeks' : 1}},
                                            {"$group" : { "_id" : "$user.screen_name" } }
                                           
                                         ] )
                      ],
                     [v['_id'] for v in english_tweets.aggregate( [ 
                                            {'$match': {'number_of_weeks' : 1}},
                                            {"$group" : { "_id" : "$retweeted_status.user.screen_name" } }
                                           
                                         ] )
                      ]
                   )
print(len(users))

2922


In [12]:

# create dictionary: each user name corresponds to one unique number between 0 and n_users

count = 0
userToInt_dict = {}
for user in users:
    userToInt_dict[user] = count
    count += 1
#pprint(userToInt_dict)

intToUser_dict = {v: k for k,v in userToInt_dict.items()}
#intToUser_dict


In [13]:
def calculate_score(M_sp, intToUser_dict):
    
    # get total number of nodes/users
    N = M_sp.get_shape()[0]
    
    # calculate connected components of sparse matrix
    connected_components = csgraph.connected_components(M_sp, directed=True)
    print("connected_components:", connected_components[0])
    
    # count number of nodes/users in each component
    countFrequenciesComponents = np.bincount(connected_components[1])
    #print("countFrequenciesComponents:", countFrequenciesComponents)

    unique, counts = np.unique(countFrequenciesComponents, return_counts=True)
    #print("Count occurences of groups", dict(zip(unique, counts)))
    
    # sort connected components from most connected to not connected at all
    sorted_labels = np.argsort(countFrequenciesComponents)[::-1]
    #print("sorted labels:", sorted_labels)
    
    # define vector containing all scores for each node
    v = pd.DataFrame(columns=["score"])

    for ind in sorted_labels:
        # when we see a single component of  size one we can break, 
        #since all other components after will be one too (array is sorted!)
        if countFrequenciesComponents[ind] != 1:
            #print("ind:", ind)
            #print("countfreq:", countFrequenciesComponents[ind])
            N_comp = countFrequenciesComponents[ind]

            # get list of indices of the current connected composent representing the users
            indices_user_comp = [i for i, comp in enumerate(connected_components[1]) if comp == ind]
            #print("length of indices_ user:", len(indices_user_comp))


            # get submatrix of current component
            #submatrix = M_sp.toarray()[connected_components[1] == ind, :][:, connected_components[1] == ind]
            submatrix = M_sp.tocsr()[connected_components[1] == ind, :][:, connected_components[1] == ind]
        
            #print("shape submatrix:", submatrix.shape)
            #print(submatrix)

            # k must be less than ndim(A)-1
            #eigenvalues, eigenvectors = np.linalg.eig(submatrix)
            if submatrix.get_shape()[0] > 10:
                eigenvalues, eigenvectors = eigs(submatrix, k=2, which='LM')
            else:
                eigenvalues, eigenvectors = np.linalg.eig(submatrix.toarray())


            # get index of maximum eigen value
            max_eigval_ind = np.argmax(eigenvalues)
            #print("max eigen value:", eigenvalues[max_eigval_ind], ", index:", max_eigval_ind)
            #print("max eigen vector:")
            #print(eigenvectors[:,max_eigval_ind])

            # normalise eigen vector for maximum eigen value s.t. sum of vector = 1
            max_eigvec_norm = eigenvectors[:,max_eigval_ind]/sum(eigenvectors[:,max_eigval_ind])
            #print("normalised max eigen vector:")
            #print(max_eigvec_norm)

            # weight-average vectors s.t. score of each node is weighted according to the size of the component
            # which it is found, thus ensuring that the scores are comparable and that we do not discard
            # information from the smaller components
            users_of_component = [intToUser_dict[user] for user in indices_user_comp]

            max_eigvec_norm_weighted = pd.DataFrame((N_comp/N)*max_eigvec_norm, index=users_of_component,
                                                    columns=["score"])
            #print("Weighted eig vec:")
            #print(max_eigvec_norm_weighted)
            #print("")

            # append scores from current connected composant to global vector v containing all scores
            v = pd.concat([v,max_eigvec_norm_weighted], axis=0)

        else:
            break
    
    return v

In [14]:
# dictionary in which we save the scores over the weeks
authority_dict = {}
hub_dict = {}


nWeek = 1
while(True):
    print("----------------WEEK {} -----------------:".format(nWeek))
    # get unique users and users of the retweets twittering in week 1
    #users_tweet = english_tweets.distinct('user.screen_name', {'number_of_weeks' : nWeek}) #user.id
    #users_retweet = english_tweets.distinct('retweeted_status.user.screen_name', 
    #                                                 {'number_of_weeks' : nWeek})

    #users = set().union(users_tweet, users_retweet)
    
    users = set().union( [v['_id'] for v in english_tweets.aggregate( [ 
                                                {'$match': {'number_of_weeks' : nWeek}},
                                                {"$group" : { "_id" : "$user.screen_name" } }

                                             ] )
                          ],
                         [v['_id'] for v in english_tweets.aggregate( [ 
                                                {'$match': {'number_of_weeks' : nWeek}},
                                                {"$group" : { "_id" : "$retweeted_status.user.screen_name" } }

                                             ] )
                          ]
                       )

    n_users = len(users)
    #print("number of users:", n_users)

    print("INFO: Number of nodes total:", n_users)

    
    # create dictionary: each user name corresponds to one unique number between 0 and n_users

    count = 0
    userToInt_dict = {}
    for user in users:
        userToInt_dict[user] = count
        count += 1

    intToUser_dict = {v: k for k,v in userToInt_dict.items()}
    #intToUser_dict
    print("INFO: Created dictionaries for conversion integer <-> user")
    print("\t Length of dict:", len(intToUser_dict))
    
    # break at last week when there is no tweet anymore
    if n_users < 20:
        break
      
    
    # create matrix users x users 
    #A_sp = sp.lil_matrix(np.zeros((n_users, n_users)))#.toarray()
    A_sp = sp.lil_matrix(sp.coo_matrix((n_users, n_users)))#.toarray()

    # fill retweet adjacency matrix with number of retweets
    tweets_per_week = english_tweets.find({'number_of_weeks' : nWeek}, {"user.screen_name":1, "retweeted_status.user.screen_name":1})
    #print("INFO: got tweets per week, size:", sys.getsizeof(tweets_per_week))

    i = 1
    for tweet in tweets_per_week:
        # only for retweets: add +1 when user of tweet retweeted other user
        if "retweeted_status" in tweet.keys():
            who_retweeted = userToInt_dict[tweet["user"]["screen_name"]]
            retweeted_from = userToInt_dict[tweet["retweeted_status"]["user"]["screen_name"]]
            A_sp[who_retweeted, retweeted_from] += 1

       
    print("INFO: number of non zero entries in adjacency matr:", A_sp.count_nonzero())        
    print("INFO: Memory usage of A (in KB):", A_sp.data.nbytes/1024)

    
    N = n_users
    

    # Co-citation network
    # ----------------------------------------------------------------
    C = A_sp.transpose() * A_sp

    score_authority = calculate_score(C, intToUser_dict)
    #print("INFO: Score calculated:")
    #print(score_authority)
    print("Authority score length: ", len(score_authority))
    
    
    for user, row in score_authority.iterrows():
        if user not in authority_dict.keys():
            authority_dict[user] = [(nWeek, row.score)]
        else:
            authority_dict[user].append((nWeek, row.score))
    
    
    # Bibliographic network
    # ------------------------------------------------------------------
#    B = A_sp * A_sp.transpose()
    
#    score_hub = calculate_score(B, intToUser_dict)
#    print("INFO: Score calculated")
    #print(score_authority)
#    print("\tlength: ", len(score_hub))

#    for user, row in score_hub.iterrows():
#        if user not in hub_dict.keys():
#            hub_dict[user] = [(nWeek, row.score)]
#        else:
#            hub_dict[user].append((nWeek, row.score))
    
    nWeek += 1
    print("")
    
    


In [8]:
#authority_dict

#print(datetime.datetime.now().strftime(format))
for k, v in authority_dict.items():
    s = k+";"
    for score in v:
        s += str(score[0])+";"+str(score[1])+";"

    s += "\n"
    print(s)


In [21]:
import datetime
format = "%Y-%m-%d_%H-%M-%S"

pathname = "authority_scores_week13_33__{}.csv".format(datetime.datetime.now().strftime(format))
with open(pathname,'w') as file:
    for k, v in authority_dict.items():
        s = k+";"
        for score in v:
            s += str(score[0])+";"+str(score[1])+";"

        s += "\n"
        file.write(s)

In [9]:
#authority_dict

In [29]:

authority_dict_true = authority_dict.copy()
with open("authority_scores_week1_12__2018-05-23_15-52-01.csv", 'r') as file:
    for line in file:
        line = line.split(";")
        key = line[0]
        value = line[1:-1]
        
        i = 0
        temp = []
        while i < len(value):
            temp.append((value[i], value[i+1]))
            i += 2 
        value = temp
        
        if key in authority_dict.keys():
            for tup in authority_dict[key]:
                value.append(tup)
            authority_dict[key] = value
        else:
            authority_dict[key] = value
            
        #print(key)
        #print(value)
        

  
        
        

In [10]:
#authority_dict


In [31]:
format = "%Y-%m-%d_%H-%M-%S"

pathname = "authority_scores__{}.csv".format(datetime.datetime.now().strftime(format))
with open(pathname,'w') as file:
    for k, v in authority_dict.items():
        s = k+";"
        for score in v:
            s += str(score[0])+";"+str(score[1])+";"

        s += "\n"
        file.write(s)

In [15]:
#calculate aggregated score
aggregated_scores = []
for key, values in authority_dict.items():
    print(key)
    print(values)
    sum_scores = 0
    for tup in values:
        if isinstance(tup[1], str) :
            sum_scores += np.complex(tup[1]).real
        else:
            sum_scores += np.array(tup[1]).real
        
    print(sum_scores)
    print(len(values))
    print("")

    aggregated_scores.append((key, sum_scores, len(values)))
    #break
aggregated_scores = sorted(aggregated_scores, key=lambda x: x[1], reverse=True)
aggregated_scores

In [14]:
# consider only scores of people with at least tweets of 10 weeks
#aggregated_scores_filtered = [tup for tup in aggregated_scores if tup[2] > 15]
aggregated_scores_filtered = [tup for tup in aggregated_scores]

#aggregated_scores_filtered

In [16]:
aggregated_scores_filtered[0]

In [17]:
import datetime
format = "%Y-%m-%d_%H-%M-%S"

pathname = "authority_scores_sorted_all_{}.csv".format(datetime.datetime.now().strftime(format))
with open(pathname,'w') as file:
    file.write("User;score;number_of_weeks\n")
    for tup1, tup2, tup3 in aggregated_scores_filtered:
        s = str(tup1)+";"+str(tup2)+";"+str(tup3)+";"+"\n"
        file.write(s)