In [1]:
import os
import os.path as path

import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import networkx as nx
from cdlib import algorithms

In [2]:
# Fix the seeds up front: the algorithms used below to label nodes are randomized.
seed = 42
# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# does not change string hashing in the already-running kernel; it is only
# inherited by processes spawned afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
# Seed both global generators; SOURCE does not show which one the
# community-detection library draws from, so neither is left unseeded.
random.seed(seed)      # stdlib generator (imported above, previously never seeded)
np.random.seed(seed)   # NumPy legacy global generator

## Load datasets

In [3]:
# Dataset to analyse; this is the name of a sub-directory of DATASETS_PATH.
# Names mentioned for this setting: "brexit", "abortion".
DATASET = 'abortion'

# <parent directory>/datasets/balanced_exposure/<DATASET>
DATASETS_PATH = path.join(path.pardir, path.join("datasets", "balanced_exposure"))
DATASET_PATH = path.join(DATASETS_PATH, DATASET)

In [4]:
# Read the tab-separated, header-less edge list: one row per (User, Follower)
# pair with two numeric columns Pa and Pb
# (presumably the per-side probabilities -- confirm against the dataset docs).
PROBS_PATH = path.join(DATASET_PATH, f"{DATASET}_network_heterogeneous.txt")
edge_columns = ['User', 'Follower', 'Pa', 'Pb']
df = pd.read_csv(PROBS_PATH, sep='\t', names=edge_columns)

In [5]:
# load seeds
def _read_seeds(filename):
    """Return the whitespace-stripped lines of a seed file in DATASET_PATH.

    The file is opened in a ``with`` block so the handle is closed as soon as
    it has been read (the previous bare ``open(...).readlines()`` leaked it).
    """
    with open(path.join(DATASET_PATH, filename)) as seed_file:
        return [line.strip() for line in seed_file]


# NOTE(review): the original comments called side 1 "pro Hillary" and side 2
# "pro Trump"; with DATASET = 'abortion' those labels look stale -- confirm
# what the two sides mean for this dataset.
A = _read_seeds("side1_seeds.txt")  # side 1 seeds
B = _read_seeds("side2_seeds.txt")  # side 2 seeds

In [6]:
# Tag both endpoints of every edge with the seed set they belong to:
# 'A' if the node is an A seed, otherwise 'B' if it is a B seed, otherwise 'NA'.
# (A node listed in both seed files is tagged 'A', because A is tested first.)

# source endpoint -> Group1
A1 = df['User'].isin(A)
B1 = df['User'].isin(B)
df['Group1'] = np.where(A1, 'A', np.where(B1, 'B', 'NA'))

# follower endpoint -> Group2
A2 = df['Follower'].isin(A)
B2 = df['Follower'].isin(B)
df['Group2'] = np.where(A2, 'A', np.where(B2, 'B', 'NA'))

In [7]:
# Sanity check: for each side print whether the seed list is duplicate-free,
# followed by its length.
a_is_unique = len(set(A)) == len(A)
b_is_unique = len(set(B)) == len(B)
print('A:', a_is_unique, len(A), ', B:', b_is_unique, len(B))

A: True 52 , B: True 65


In [8]:
# Build the undirected User--Follower graph and find its biggest connected component.
G = nx.from_pandas_edgelist(df, 'User', 'Follower')
# connected_components() is a generator that is not ordered by size, so next()
# would just return whichever component is produced first. Pick the largest
# one explicitly.
biggest_component = max(nx.connected_components(G), key=len)

# discard the rest
G = G.subgraph(biggest_component)
# Both endpoints of an edge always lie in the same component, so testing
# either endpoint selects exactly the edges inside the component.
df = df[df.User.isin(biggest_component) | df.Follower.isin(biggest_component)]
# keep only the seeds that survived the restriction
A = [a for a in A if a in biggest_component]
B = [b for b in B if b in biggest_component]
len(A), len(B)

(52, 65)

In [9]:
# Expand the A seeds into a larger community (between 200 and 500 nodes) with
# cdlib's lemon algorithm, hoping to find edges between A and B afterwards.
# NOTE(review): the output saved with this cell is a MemoryError (a dense
# 279505 x 279505 float64 array could not be allocated), yet later cells read
# commA and newdf -- presumably from an earlier kernel state. This cell needs
# a smaller input graph (or another algorithm) to survive Restart & Run All.
commA = algorithms.lemon(G, A, min_com_size=200, max_com_size=500)
community_a = commA.communities[0]

# Label the new nodes on a copy so that df keeps the seed-only labels.
newdf = df.copy()
user_in_a = newdf.User.isin(community_a)
follower_in_a = newdf.Follower.isin(community_a)
newdf.loc[user_in_a, 'Group1'] = 'A'
newdf.loc[follower_in_a, 'Group2'] = 'A'

# community size, and how many of its members are not original A seeds
print(len(community_a), len(set(community_a) - set(A)))

MemoryError: Unable to allocate 582. GiB for an array with shape (279505, 279505) and data type float64

In [11]:
# Expand the B seeds into a community of 200 to 500 nodes with cdlib's lemon algorithm.
commB = algorithms.lemon(G, B, min_com_size=200, max_com_size=500)
community_b = commB.communities[0]

# Label the community members as 'B' on both edge endpoints. These assignments
# overwrite any label newdf already holds for the same nodes.
newdf.loc[newdf.User.isin(community_b), 'Group1'] = 'B'
newdf.loc[newdf.Follower.isin(community_b), 'Group2'] = 'B'

# community size, and how many of its members are not original B seeds
print(len(community_b), len(set(community_b).difference(B)))

301 294


In [12]:
# Intersection of the two communities (should be empty).
cA = set(commA.communities[0])
cB = set(commB.communities[0])
inters = cB.intersection(cA)

# Drop every shared node from both sides, then expose the results as lists.
# NOTE(review): this only changes cA / cB; the Group1 / Group2 labels already
# written to newdf are not touched by this cell.
cA -= inters
cB -= inters
cA, cB = list(cA), list(cB)

In [13]:
# Are there any edges between A and B now?
# Show every edge whose two endpoints both carry a label ('A' or 'B').
both_endpoints_labeled = newdf.Group1.ne('NA') & newdf.Group2.ne('NA')
newdf[both_endpoints_labeled]

Unnamed: 0,User,Follower,Pa,Pb,Group1,Group2
1,003a04f8c2054b7,patriot18d,0.001160,0.003320,B,B
2,003a04f8c2054b7,pamela_moore13,0.000143,0.004143,B,B
3,003a04f8c2054b7,jbaker31826004,0.000604,0.015884,B,B
5,003a04f8c2054b7,pyrrhop,0.002796,0.011916,B,B
7,003a04f8c2054b7,skinnypresident,0.000667,0.002827,B,B
...,...,...,...,...,...,...
844834,healthandcents,trevor90666770,0.002307,0.015187,B,B
844837,healthandcents,martucci_peter,0.002149,0.048229,B,B
844838,healthandcents,ejhirschberger,0.000226,0.005506,B,B
844866,healthandcents,drmartyfox,0.001571,0.011651,B,B


In [14]:
def _nodes_in_group(edges, group):
    """Return the distinct nodes of `edges` labelled `group` on either endpoint."""
    users = edges.loc[edges.Group1 == group, 'User'].tolist()
    followers = edges.loc[edges.Group2 == group, 'Follower'].tolist()
    return list(set(users + followers))


# edges with at least one labelled endpoint
labeled = newdf[(newdf.Group1 != 'NA') | (newdf.Group2 != 'NA')]
newA = _nodes_in_group(labeled, 'A')
newB = _nodes_in_group(labeled, 'B')
newNA = _nodes_in_group(labeled, 'NA')

G2 = nx.from_pandas_edgelist(labeled, 'User', 'Follower')
# Take the largest component explicitly: connected_components() is not
# ordered by size, so next() would return whichever component comes first.
biggest_component2 = max(nx.connected_components(G2), key=len)

# save biggest component of new graph
# (the previous code used `biggest_component`, the component of the original
# graph G, here, so `biggest_component2` was computed but never applied)
G2 = G2.subgraph(biggest_component2)
newdf = newdf[newdf.User.isin(biggest_component2) & newdf.Follower.isin(biggest_component2)]

In [15]:
# Keep only the edges whose two endpoints are both labelled ('A' or 'B').
has_user_label = newdf.Group1.ne('NA')
has_follower_label = newdf.Group2.ne('NA')
newdf = newdf.loc[has_user_label & has_follower_label]

In [16]:
# Number of distinct (User, Group1) pairs per group, i.e. how many different
# edge sources carry each label.
user_labels = newdf[['User', 'Group1']].drop_duplicates()
user_labels.groupby('Group1').count()

Unnamed: 0_level_0,User
Group1,Unnamed: 1_level_1
A,356
B,375


In [17]:
# Sizes of the two seed lists.
# NOTE(review): the output stored with this cell, (84, 99), does not match the
# (52, 65) shown when A and B were last assigned above -- it looks like
# left-over state from another run; re-check after Restart & Run All.
tuple(map(len, (A, B)))

(84, 99)

In [18]:
# Write the labelled edge list to the current working directory, without the index column.
output_file = 'new_graph.csv'
newdf.to_csv(output_file, index=False)