# Download the data

In [31]:
import os
import requests
from gzip import GzipFile
from io import BytesIO

activity_url = r'http://snap.stanford.edu/data/higgs-activity_time.txt.gz'
social_network_url = r'http://snap.stanford.edu/data/higgs-social_network.edgelist.gz'
activity_data_path = r'data/higgs-activity_time.txt'
social_network_path = r'data/social_network.edgelist'


def download_gzipped(url, dest_path, overwrite=False, timeout=60):
    """Download a gzip-compressed file and store it decompressed at ``dest_path``.

    Parameters
    ----------
    url : str
        Location of the ``.gz`` resource.
    dest_path : str
        Where the decompressed bytes are written.
    overwrite : bool
        When False (default) an existing, non-empty ``dest_path`` is kept and
        nothing is downloaded, so re-running the notebook is cheap.
    timeout : float
        Seconds passed to ``requests.get`` as its timeout.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    already_cached = os.path.exists(dest_path) and os.path.getsize(dest_path) > 0
    if already_cached and not overwrite:
        return

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of trying to gunzip an HTML error page.
    response.raise_for_status()

    with GzipFile(fileobj=BytesIO(response.content), mode='rb') as gz_stream:
        payload = gz_stream.read()

    # Only touch the destination once the payload is fully in hand, so a
    # failed download never leaves a truncated or empty file behind.
    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(dest_path, 'wb') as out_file:
        out_file.write(payload)


# download the higgs-activity_time set
download_gzipped(activity_url, activity_data_path)

# download the higgs-social_network set
download_gzipped(social_network_url, social_network_path)

# Read in files to create graphs

### Create followers graph

In [49]:
import networkx as nx

# Empty directed graph for the follower network; its edges are added from
# edge_tuple_list in a later cell.
G_followers = nx.DiGraph()

In [47]:
# Read the follower edge list: the first two whitespace-separated fields of
# each line become one (source, target) pair.
with open(social_network_path, 'r') as f:
    # Split each line once (the file has millions of lines) and skip blank or
    # short lines instead of failing with IndexError.
    split_lines = (line.split() for line in f)
    edge_tuple_list = [
        (fields[0], fields[1]) for fields in split_lines if len(fields) >= 2
    ]

In [50]:
# Insert every (source, target) pair from the edge list as a directed edge.
G_followers.add_edges_from(edge_tuple_list)

In [51]:
# Report the size of the follower graph, one figure per line.
print(
    f"Number of Nodes: {G_followers.number_of_nodes()}",
    f"Number of Edges: {G_followers.number_of_edges()}",
    sep="\n",
)

Number of Nodes: 456626
Number of Edges: 14855842


### Create graphs for retweet(RT), reply(RE), and mention(MT)

In [69]:
# Load the activity log: every line is turned into a tuple of its
# whitespace-separated fields.
with open(activity_data_path, 'r') as activity_file:
    activity_tuple_list = list(map(tuple, map(str.split, activity_file)))

In [80]:
from collections import defaultdict
# Maps the interaction code (4th field of an activity record) to a list of
# [source, target] pairs. Re-create it before re-running the fill loop,
# otherwise the pairs are appended a second time.
activity_type_dict = defaultdict(list)

In [81]:
# Bucket each record's [source, target] pair under its interaction code;
# the timestamp (3rd field) is not used.
for record in activity_tuple_list:
    source, target, interaction = record[0], record[1], record[3]
    activity_type_dict[interaction].append([source, target])

In [82]:
# Show which interaction codes actually occur in the data.
activity_type_dict.keys()

dict_keys(['MT', 'RE', 'RT'])

In [88]:
# One directed graph per interaction type, each filled from its bucket of
# [source, target] pairs.
# NOTE: a DiGraph stores at most one edge per ordered user pair, so repeated
# interactions between the same two users collapse into a single edge and
# degrees count distinct counterparts rather than individual interactions.
G_MT = nx.DiGraph()
G_MT.add_edges_from(activity_type_dict['MT'])

G_RE = nx.DiGraph()
G_RE.add_edges_from(activity_type_dict['RE'])

G_RT = nx.DiGraph()
G_RT.add_edges_from(activity_type_dict['RT'])

# Notes about data from source

Data saved in `higgs-activity_time.txt` (downloaded to `activity_data_path` above) has the format `userA userB timestamp interaction`.

> Interaction can be RT (retweet), MT (mention) or RE (reply). Each link is directed. The user IDs in this dataset corresponds to the ones adopted to anonymize the social structure, thus the datasets (1) - (5) can be used together for complex analysis involving structure and dynamics.

The above is good info: we'll use the user IDs to link the datasets together after verifying that all the necessary users exist in every source.

> Note 1: the direction of links depends on the application, in general. For instance, if one is interested in building a network of how information flows, then the direction of RT should be reversed when used in the analysis. Nevertheless, the choice is left to the researcher and his/her own interpretation of the data, whereas we just provide the observed actions, i.e., who retweets/mentions/replies/follows whom.

This I'm less sure about. I have avoided Twitter and only have a vague idea of how it works, but I _think_ I follow what's happening here. For RT, I'm interpreting each row of the data above as "userA retweeted userB", which I think is the appropriate reading for the questions below.

## Question 1: Which user had the most of each interaction type (retweet vs reply vs mention)?

In [175]:
# handy function to get top users

def get_top_users(G, degree='sum', n=10):
    """Rank the nodes of ``G`` by degree and return the ``n`` highest.

    Parameters
    ----------
    G : DiGraph
        networkx DiGraph whose nodes are users
    degree : str
        Which degree count to rank by: `sum` (``G.degree``), `in`
        (``G.in_degree``) or `out` (``G.out_degree``)
    n : int
        Maximum number of entries to return

    Returns
    -------
    list
        ``(user, degree)`` pairs sorted by degree, largest first, cut to the
        first ``n`` entries

    Raises
    ------
    ValueError
        If ``degree`` is not one of `sum`, `in`, `out`
    """
    # Look up the attribute name first so only the requested view is touched.
    view_names = (('sum', 'degree'), ('in', 'in_degree'), ('out', 'out_degree'))
    for label, attribute in view_names:
        if degree == label:
            degree_view = getattr(G, attribute)
            break
    else:
        raise ValueError("Degree is not one of ['sum', 'in', 'out']")

    ranked = list(degree_view)
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

With a DiGraph, the `.degree` [property is the sum of in-degree and out-degree of each node](https://networkx.github.io/documentation/stable/tutorial.html#directed-graphs).  I believe it fits in this case since we're looking at which users had the "most of each interaction type", which I'm interpreting to be either the user performed the action themselves (userA RT userB) or someone RT something of theirs (userB RT userA)

Further note (mostly to myself), here is how I think the following should be interpreted:

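**Out-degree:** number of distinct users this user retweeted

**In-degree:** number of distinct users who retweeted this user's tweets (a `DiGraph` keeps a single edge per pair of users, so repeat retweets by the same person are counted once)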

The numbers below support this interpretation. For example, it's unlikely that, in the 7-day period over which this data was collected, user `88` personally retweeted 14,060 different users; it's far more plausible that user `88`'s tweets were retweeted by that many users.

In [176]:
# Rank users by out-degree in the retweet graph (the source side of RT edges).
get_top_users(G_RT, degree='out')

[('38535', 134),
 ('181190', 84),
 ('81405', 66),
 ('64911', 49),
 ('54301', 49),
 ('27705', 48),
 ('53508', 42),
 ('232850', 41),
 ('492', 38),
 ('62391', 38)]

In [177]:
# Rank users by in-degree in the retweet graph (the target side of RT edges).
get_top_users(G_RT, degree='in')

[('88', 14060),
 ('14454', 6190),
 ('677', 5613),
 ('1988', 4335),
 ('349', 2802),
 ('283', 2039),
 ('3571', 1980),
 ('6948', 1959),
 ('14572', 1692),
 ('68278', 1689)]

### Top 10 users by mention (MT)

In [178]:
# Rank users by total degree (in-degree + out-degree) in the mention graph.
get_top_users(G_MT, degree='sum')

[('88', 11960),
 ('677', 3920),
 ('2417', 2538),
 ('59195', 1604),
 ('3998', 1592),
 ('7533', 1530),
 ('383', 1358),
 ('1988', 1191),
 ('13813', 1067),
 ('519', 805)]

### Top users by reply (RE)

In [179]:
# Rank users by total degree (in-degree + out-degree) in the reply graph.
get_top_users(G_RE, degree='sum')

[('677', 1213),
 ('88', 1071),
 ('220', 470),
 ('3549', 219),
 ('317', 168),
 ('349', 144),
 ('1988', 105),
 ('7690', 102),
 ('3369', 92),
 ('16460', 83)]

### Top users by retweet (RT)

In [180]:
# Rank users by total degree (in-degree + out-degree) in the retweet graph.
get_top_users(G_RT, degree='sum')

[('88', 14063),
 ('14454', 6190),
 ('677', 5621),
 ('1988', 4337),
 ('349', 2803),
 ('283', 2039),
 ('3571', 1981),
 ('6948', 1959),
 ('14572', 1692),
 ('68278', 1689)]

## Question 2: Who are the top five users with the highest ratio of retweets to followers?

In [192]:
# Per-user in-degree in the retweet graph, keyed by user id.
user_retweet_counts = {user: count for user, count in G_RT.in_degree}

In [193]:
# Per-user in-degree in the follower graph, used below as the follower count.
# The graph built earlier in this notebook is named G_followers; the bare
# name G is never defined.
# NOTE(review): this assumes edges point follower -> followed; confirm against
# the dataset description.
user_followers_counts = dict(G_followers.in_degree)

Before trying to calculate the ratio, do a quick sanity check on their counts

In [194]:
# Both dicts were built from an in-degree view over every node, so these sets
# hold all users present in each graph, including users whose count is 0.
retweet_set = set(user_retweet_counts.keys())
followers_set = set(user_followers_counts.keys())
print(f"Number of users with retweets: {len(retweet_set)}")
print(f"Number of users with followers: {len(followers_set)}")

Number of users with retweets: 256491
Number of users with followers: 456626


It looks like not every user in the follower graph appears in the retweet graph (which makes sense). Are there any users in the retweet graph that are missing from the follower graph?

In [195]:
# True exactly when every user in the retweet graph also exists in the
# follower graph.
retweet_set.issubset(followers_set)

True

Alright, so now I feel confident we can calculate the ratio for all users with retweets

In [196]:
# Retweet-to-follower ratio per user; users with zero followers are left out
# to avoid dividing by zero.
ratio_counts = {
    user: retweets / user_followers_counts[user]
    for user, retweets in user_retweet_counts.items()
    if user_followers_counts[user] != 0
}

In [197]:
# Ten highest ratios, largest first.
sorted(
    ratio_counts.items(),
    key=lambda user_ratio: user_ratio[1],
    reverse=True,
)[:10]

[('250519', 103.6),
 ('272834', 64.0),
 ('132690', 60.0),
 ('14454', 42.689655172413794),
 ('86851', 37.0),
 ('231650', 34.333333333333336),
 ('8135', 31.17241379310345),
 ('308781', 25.0),
 ('220468', 22.0),
 ('294416', 21.103448275862068)]

Surprisingly, `14454` is the only user in the top 5 by ratio who also appears in the list of most-retweeted users above.

In [189]:
# Follower count of user 14454, read straight from the degree view. The
# follower graph is named G_followers (G is never defined), and indexing the
# view avoids building a dict of every node for a single lookup.
G_followers.in_degree['14454']

145

In [190]:
# Retweet in-degree of user 14454; indexing the degree view avoids building a
# dict of every node for a single lookup.
G_RT.in_degree['14454']

6190

In [191]:
# Derive the ratio from the graphs rather than retyping the counts, so it
# cannot drift from the values shown above (6190 retweeters / 145 followers).
G_RT.in_degree['14454'] / G_followers.in_degree['14454']

42.689655172413794