In [2]:
import numpy as np
import pandas as pd
import os.path as path

## Load datasets

In [5]:
# Dataset selection: DATASET names both the dataset sub-folder and the prefix of its network file.
DATASET = 'uselections'  # other datasets: "brexit", "abortion"
# Folder holding all datasets; a relative path that starts at the parent of the working directory.
DATASETS_PATH = path.join(path.pardir, "datasets", "balanced_exposure")
# Folder containing the files of the selected dataset.
DATASET_PATH = path.join(DATASETS_PATH, DATASET)

In [6]:
# Load the edge list with per-edge probabilities.
# The file is tab-separated and has no header row, so column names are supplied here:
# one row per (User, Follower) pair with two probability columns, Pa and Pb.
PROBS_PATH = path.join(DATASET_PATH, f"{DATASET}_network_heterogeneous.txt")
df = pd.read_csv(
    PROBS_PATH,
    sep='\t',
    names=['User', 'Follower', 'Pa', 'Pb'],
    # Keep identifiers as text. The seed lists are read as stripped strings, so if
    # pandas inferred an integer dtype for purely numeric identifiers, the later
    # `isin` membership tests would silently match nothing.
    dtype={'User': str, 'Follower': str},
)

In [7]:
# Load the seed users of each side: one identifier per line, surrounding whitespace stripped.
# Context managers make sure both file handles are closed once the lists are built.
with open(path.join(DATASET_PATH, "side1_seeds.txt")) as seeds_file:
    A = [line.strip() for line in seeds_file]  # A = pro Hillary
with open(path.join(DATASET_PATH, "side2_seeds.txt")) as seeds_file:
    B = [line.strip() for line in seeds_file]  # B = pro Trump
# Displayed as the cell output: number of seeds on each side.
len(A), len(B)

(85, 99)

In [5]:
# Tag both endpoints of every edge with the side they seed for:
# 'A' if the label is in seed list A, otherwise 'B' if it is in B, otherwise 'NA'.
# Conditions are tested in order, so a label present in both lists is tagged 'A'.
# NOTE(review): A and B hold strings; `isin` only matches if User/Follower are
# strings as well — confirm the dtypes of `df`.
A1, B1 = df.User.isin(A), df.User.isin(B)
df['Group1'] = np.select([A1, B1], ['A', 'B'], default='NA')

A2, B2 = df.Follower.isin(A), df.Follower.isin(B)
df['Group2'] = np.select([A2, B2], ['A', 'B'], default='NA')

In [14]:
# Build the node table: one row per distinct (Label, Group) pair found on either
# end of an edge, with a sequential integer 'Id' as the leading column.
as_user = df[['User', 'Group1']].rename(columns={'User': 'Label', 'Group1': 'Group'})
as_follower = df[['Follower', 'Group2']].rename(columns={'Follower': 'Label', 'Group2': 'Group'})
nodes = (
    pd.concat([as_user, as_follower])
    .drop_duplicates()
    .reset_index(drop=True)  # renumber the surviving rows 0..n-1
    .rename_axis('Id')       # name that fresh index ...
    .reset_index()           # ... and promote it to the first column
)

In [15]:
# Write the node table (Id, Label, Group) to 'all_nodes.csv' in the working directory,
# without the DataFrame index.
# NOTE: the group value 'NA' is written as literal text; pandas.read_csv treats 'NA'
# as a missing value by default, so pass keep_default_na=False if this file is read back.
nodes.to_csv('all_nodes.csv', index=False)

In [16]:
# Edge table in Gephi's Source/Target naming. Edges point follower -> user,
# so 'Follower' becomes the source and 'User' the target.
gephi_format = {'User': 'Target', 'Follower': 'Source'}
edge_endpoints = df[['User', 'Follower']]
edges = edge_endpoints.rename(columns=gephi_format).reset_index(drop=True)

In [23]:
# Replace node labels in the edge table by their integer node Id.
# The lookup is built straight from the Label and Id columns of `nodes`, so it uses
# the stored Id values instead of assuming that Id equals the row position.
replace_dict = dict(zip(nodes['Label'].tolist(), nodes['Id'].tolist()))
# Series.map yields NaN for any value that is not a key of the dict. This cell is
# therefore not idempotent: running it again on already-mapped columns would blank
# them out unless the cell that builds `edges` is re-run first.
edges['Target'] = edges['Target'].map(replace_dict)
edges['Source'] = edges['Source'].map(replace_dict)

In [70]:
#edges = pd.read_csv('all_edges.csv')

In [36]:
# Write the edge table (Target, Source) to 'all_edges.csv' in the working directory,
# without the DataFrame index.
edges.to_csv('all_edges.csv', index=False)

In [38]:
# Sanity check: every row position of `nodes` must occur as an endpoint of at least one edge.
unique_nodes = set(nodes.index.tolist())
# np.unique flattens its input, so both edge columns are pooled into one set of values.
nodes_in_edges = set(np.unique(edges.values))

assert unique_nodes <= nodes_in_edges