In [1]:
import numpy as np 
import pandas as pd 
import pickle
from os import listdir
import re
from tqdm import *
import networkx as nx

# Import files and extract data

In [2]:
# os.listdir returns entries in arbitrary order; sort so that the derived
# influencer/column order is the same on every run and every machine.
files = sorted(listdir('C:/Users/Jessica/PycharmProjects/Influenza/data/combined/'))

In [3]:
# get handles of influencers scraped
# Strip only a leading "followers_" and a trailing literal ".pickle". The dot is
# escaped and both parts are anchored so that handles which themselves contain
# "followers_" or end in something like "_pickle" are left intact.
influencers = [re.sub(r'^followers_|\.pickle$', '', f) for f in files]

In [4]:
# get union of all influencers' followers
# Accumulate directly into a set instead of converting list <-> set on every file.
follower_union = set()
for file in files:
    file_name = 'C:/Users/Jessica/PycharmProjects/Influenza/data/combined/' + file
    # NOTE: pickle.load can execute arbitrary code -- only read files you produced yourself.
    # The handle gets its own name so the loop variable is not rebound inside the loop.
    with open(file_name, 'rb') as handle:
        follower_union.update(pickle.load(handle))
# Downstream cells take len() of this and build sets from it, so expose it as a list.
all_followers = list(follower_union)
print(len(all_followers))

385341


# Clean data and construct adjacency matrix

In [5]:
# function that returns dictionary of int booleans whether influencer has follower in union of all followers
def get_bool_match(file, followers=None,
                   data_dir='C:/Users/Jessica/PycharmProjects/Influenza/data/combined/'):
    """Return a ``{user: 0 or 1}`` membership map for one influencer's follower file.

    Parameters
    ----------
    file : str
        Name of a pickle file inside ``data_dir`` that holds an iterable of followers.
    followers : collection, optional
        The universe of users to report on. Defaults to the notebook-level
        ``all_followers``.
    data_dir : str, optional
        Directory that contains ``file``.

    Returns
    -------
    dict
        Exactly one entry per distinct user in ``followers``: 1 if that user is
        listed in the file, 0 otherwise. Users that appear in the file but not in
        ``followers`` are reported via the printed warning and are NOT added to
        the result (a symmetric difference would have added them with value 0).
    """
    import os  # local import: the notebook only pulls ``listdir`` from os at the top

    if followers is None:
        followers = all_followers
    universe = set(followers)  # built once; membership tests below are O(1)

    # NOTE: pickle.load can execute arbitrary code -- only read files you produced yourself.
    with open(os.path.join(data_dir, file), 'rb') as handle:
        own_followers = set(pickle.load(handle))

    joint = {user: int(user in own_followers) for user in universe}

    # Same sanity check as before: fires when the file contains users outside the
    # universe, or when ``followers`` itself contains duplicates.
    if (own_followers - universe) or len(joint) != len(followers):
        print(file + 'Error: Length of total not equals to length of followers')
    return joint

In [6]:
# initialise occurrence matrix
# One row per follower, one column named after the first influencer, rows ordered by follower.
occur_mat = (
    pd.DataFrame.from_dict(get_bool_match(files[0]), orient='index')
    .rename(columns={0: influencers[0]})
    .sort_index()
)

In [7]:
# fill occurrence matrix
# Restart from the first influencer's column so that re-running this cell does not
# merge every column a second time (which would create "_x"/"_y" duplicates).
occur_mat = occur_mat[[influencers[0]]]
for file, influencer in zip(files[1:], influencers[1:]):
    occur_col = (
        pd.DataFrame.from_dict(get_bool_match(file), orient='index')
        .rename(columns={0: influencer})
        .sort_index()
    )
    # Index-on-index merge (inner join): only followers present on both sides survive.
    occur_mat = occur_mat.merge(occur_col, left_index=True, right_index=True)
    if occur_mat.shape[0] != len(all_followers):
        print('occurrence matrix nrow not equals to length of followers for ' + file)

In [8]:
# get co-occurrence/adjacency matrix
# occur_mat holds 0/1 flags (followers x influencers), so entry (a, b) of the product
# is the number of followers shared by a and b; the diagonal is each influencer's own count.
co_occur_mat = occur_mat.T.dot(occur_mat)
# Float copy: fractional similarity scores are written into this frame afterwards, and
# assigning floats into integer columns is a deprecated implicit upcast in recent pandas.
adj_mat = co_occur_mat.astype(float)

In [9]:
# calculate jaccard similarity for adjacency matrix
# J(a, b) = |A & B| / (|A| + |B| - |A & B|).
# |A| is the DIAGONAL of the co-occurrence matrix (an influencer's own follower count).
# A column sum is the total overlap with every influencer, not the set size, so it
# must not be used as |A|.
# NOTE: true Jaccard scores are larger than scores normalised by column sums; any
# weight cut-off applied to the resulting edge list should be chosen with that in mind.
shared = co_occur_mat.values.astype(float)
follower_counts = np.diag(shared)
union_sizes = follower_counts[:, None] + follower_counts[None, :] - shared
# Pairs whose union is empty get similarity 0 instead of a division-by-zero NaN.
jaccard = np.divide(shared, union_sizes, out=np.zeros_like(shared), where=union_sizes > 0)
np.fill_diagonal(jaccard, 0)  # no self-loops in the graph
adj_mat = pd.DataFrame(jaccard, index=co_occur_mat.index, columns=co_occur_mat.columns)

In [10]:
# Long-format edge list: zero weights become NaN so that stack() drops them,
# leaving one (source, target, weight) row per non-zero matrix entry.
adj_df = (
    adj_mat
    .replace(to_replace=0, value=np.nan)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'source', 'level_1': 'target', 0: 'weight'})
)

# Graph network

In [11]:
from community import community_louvain
# Community detection runs on the full weighted graph (every non-zero similarity).
G_tmp = nx.from_pandas_edgelist(adj_df, source='source', target='target',
                                edge_attr=['weight'], create_using=nx.Graph())
# Louvain is randomised; fix the seed so community ids (used as node colours) are reproducible.
partition = community_louvain.best_partition(G_tmp, random_state=42)

In [12]:
# Only edges whose similarity exceeds this cut-off are drawn.
WEIGHT_THRESHOLD = 0.0075
# spring_layout starts from random positions; seed it so the picture is reproducible.
LAYOUT_SEED = 42

G = nx.from_pandas_edgelist(adj_df[adj_df['weight'] > WEIGHT_THRESHOLD],
                            source='source', target='target',
                            edge_attr=['weight'], create_using=nx.Graph())
layt = nx.spring_layout(G, dim=3, seed=LAYOUT_SEED)

In [14]:
import plotly.offline as py
import plotly.graph_objs as go

In [15]:
# set graph labels for plotting
# Node coordinates are listed in G.nodes order, and G only contains influencers that
# keep at least one edge after thresholding. Hover labels and community colours must
# follow that same order, otherwise they end up attached to the wrong markers.
labels = list(G.nodes)
group = [partition[node] for node in labels]

In [16]:
# 3D network graph drawing code taken from https://plot.ly/python/3d-network-graph/
# Node coordinates, one list per axis, in G.nodes order.
Xn = [layt[node][0] for node in G.nodes]
Yn = [layt[node][1] for node in G.nodes]
Zn = [layt[node][2] for node in G.nodes]

# Edge coordinates: start point, end point, then None so each edge is a separate segment.
Xe, Ye, Ze = [], [], []
for e in G.edges:
    start, end = layt[e[0]], layt[e[1]]
    Xe.extend([start[0], end[0], None])
    Ye.extend([start[1], end[1], None])
    Ze.extend([start[2], end[2], None])

# Edges: thin grey line segments without hover text.
trace1 = go.Scatter3d(
    x=Xe,
    y=Ye,
    z=Ze,
    mode='lines',
    line={'color': 'rgb(125,125,125)', 'width': 1},
    hoverinfo='none',
)

# Nodes: one marker per influencer, coloured by `group`, hover text taken from `labels`.
trace2 = go.Scatter3d(
    x=Xn,
    y=Yn,
    z=Zn,
    mode='markers',
    name='influencers',
    marker={
        'symbol': 'circle',
        'size': 6,
        'color': group,
        'colorscale': 'Viridis',
        'line': {'color': 'rgb(50,50,50)', 'width': 0.5},
    },
    text=labels,
    hoverinfo='text',
)

# Shared axis style: hide background, axis line, zero line, grid, tick labels and title.
axis = {
    'showbackground': False,
    'showline': False,
    'zeroline': False,
    'showgrid': False,
    'showticklabels': False,
    'title': '',
}

# Caption shown in the lower-left corner of the figure (paper coordinates).
source_note = {
    'showarrow': False,
    'text': "Data source: Instagram",
    'xref': 'paper',
    'yref': 'paper',
    'x': 0,
    'y': 0.1,
    'xanchor': 'left',
    'yanchor': 'bottom',
    'font': {'size': 14},
}

layout = go.Layout(
    title="Network of Influencers",
    width=1000,
    height=1000,
    showlegend=False,
    # each axis gets its own copy of the shared style
    scene={'xaxis': dict(axis), 'yaxis': dict(axis), 'zaxis': dict(axis)},
    margin={'t': 100},
    hovermode='closest',
    annotations=[source_note],
)

In [19]:
# Switch plotly to offline notebook rendering before anything is drawn.
py.init_notebook_mode(connected=True)

# Assemble the figure from the edge trace followed by the node trace, then render inline.
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Influenza')