In [1]:
import sys
import os
import os.path as path

import random
from scipy.stats import beta
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

sys.path.insert(1, path.join(path.abspath(path.pardir)))
import sims
from platform_opt import *

In [2]:
# remove non-determinism: fix the seeds so that repeated runs give the same numbers
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes - confirm.
seed = 42
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = f'{seed}'

## Load datasets

In [3]:
# load the labeled graph
# The rest of the notebook reads the columns `User`, `Group1` and `Follower`
# from this frame (one row per (user, follower) pair).
df = pd.read_csv("csv/abortion_new_graph.csv")

In [4]:
# one row per user together with its group; the graph repeats a user once per follower
users = df.loc[:, ['User', 'Group1']].drop_duplicates()
# number of users in each group
users.groupby('Group1').count()

Unnamed: 0_level_0,User
Group1,Unnamed: 1_level_1
A,196
B,228


## Parameters in the real network

In [5]:
# fraction of the users that belongs to each group
users.groupby('Group1').count().div(users['User'].count())

Unnamed: 0_level_0,User
Group1,Unnamed: 1_level_1
A,0.462264
B,0.537736


## Propagation of the articles

In [6]:
# GLOBAL VARIABLES
# number of repetitions of every experiment; the trial number is used as random seed
TRIALS = 100
# number of time steps of each propagation (default of `steps` in `propagate`)
T      = 10
# number of seed users of a network run (default of `size` in `propagate`);
# the model simulations are run with 2 * M
M      = 69


# indices
# Signed indices used to build the keys of `beta_dist`:
# Gr maps a group label to its index, Pr maps an article to its index and
# OPr maps an article to the index of the other article.
Gr = {'A': -1, 'B': 1}
Pr = {'Pa': -1, 'Pb': 1}
OPr = {'Pa': 1, 'Pb': -1}

In [7]:
# UTIL METHODS

## plotting
def plot(df):
    '''Draw a seaborn line plot of column `n` against column `t`.

    Returns whatever `sns.lineplot` returns, so the caller can keep
    customising the figure.
    '''
    return sns.lineplot(data=df, x='t', y='n')


def plotgen(g):
    '''Plot the dataframe built from the generator.

    `g` must yield `(t, nA, nB)` triples. The original version forwarded the
    wide frame to `plot`, which reads a column `n` that the frame does not
    have. The counts are therefore reshaped to long form (one row per
    `(t, group)`), and one line per group is drawn.

    Returns the object returned by `sns.lineplot`.
    '''
    wide = pd.DataFrame(g, columns=['t', 'nA', 'nB'])
    # long form: column `group` holds 'nA'/'nB', column `n` holds the count
    long = wide.melt(id_vars='t', value_vars=['nA', 'nB'],
                     var_name='group', value_name='n')
    return sns.lineplot(x='t', y='n', hue='group', data=long)


## other
def filter_list(l, blacklist=()):
    '''Filter out from a list the elements in `blacklist`.

    Parameters
    ----------
    l : iterable
        Elements to filter; their order is preserved.
    blacklist : container, optional
        Elements to drop. Any object supporting `in` works; pass a `set` for
        fast lookups. The default is an immutable empty tuple, which avoids
        the shared mutable default argument pitfall.

    Returns
    -------
    list
        The elements of `l` that are not in `blacklist`.
    '''
    return [e for e in l if e not in blacklist]

### Propagation with the model

In [17]:
# columns (and their order) written to every results CSV, for both the model and the network runs
SAVED_COLS = ['Setting', 'Exposure', 'Theta_A', 'Seed', 'Article', 't', 'N', 'B', 'A',]

In [8]:
# model parameters
epsilon=0.1
# group proportions handed to the model runs (used instead of the `pi` returned by `get_params`)
new_pi = {-1: 0.5, 1: 0.5}
# exposure settings; the keys are the signed group indices (-1 is used for group A, 1 for group B)
theta_opt = {1: 0, -1: 1}
theta_half = {1: 0.5, -1: 0.5}

# NOTE(review): the parameters are loaded under the 'twitter_uselections' key
# although this notebook reads the abortion graph - confirm this is intended.
pi, beta_dist, P, v, c, q = sims.get_params('twitter_uselections')
# `opt_constrained` is presumably provided by the star import of `platform_opt`.
# NOTE(review): `exposure_e=0.1` repeats the literal instead of reusing `epsilon` - confirm.
theta_fair = opt_constrained(pi, q, T, epsilon, c, v, beta_dist, exposure_e=0.1)

In [9]:
# columns of the model output; later cells treat the first letter as the group
# (Aa/Ab are melted into `A`, Ba/Bb into `B`) and the second letter as the article
group_cols = ['Aa', 'Ab', 'Ba', 'Bb']
# labels of the exposure settings, paired below with theta_fair, theta_opt and theta_half
exps = ['fair', 'opt', 'half']

def run_model_sim(theta, T, pi, M, P, beta_dist, v, c, q, seed=0):
    '''Run one seeded model simulation and return its counts per time step.

    The numpy global generator is re-seeded with `seed`, then `sims.runModel`
    is called with the remaining arguments. The second element of its result
    is read as four sequences (one per entry of `group_cols`); each one is
    right-padded with zeros up to `T` steps.

    Returns an integer DataFrame with the columns of `group_cols` plus `t`.
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    counts = np.zeros((T, 4))
    series = sims.runModel(theta, T, pi, M, P, beta_dist, v, c, q)[1]
    for col, values in enumerate(series):
        # a run that stops early is completed with zeros
        padding = [0] * (T - len(values))
        counts[:, col] = np.array(values + padding).reshape(-1)

    result = pd.DataFrame(counts, columns=group_cols)
    result.loc[:, 't'] = range(T)
    return result.astype(int)

In [10]:
# run model simulations
# The per-trial frames are collected in a list and concatenated once at the end.
# Growing `model_results` with `pd.concat` inside the loop copies all previous
# rows on every iteration, and starting from an empty frame can turn the numeric
# columns into `object` dtype (recent pandas versions warn about it).
model_cols = ['Exposure', 'Theta_A', 'Size', 'Seed', 't'] + group_cols
model_frames = []
for exp, theta in zip(exps, [theta_fair, theta_opt, theta_half]):
    # NOTE(review): the model runs with 2 * M players while `Size` records M - confirm intended
    params = theta, T, new_pi, 2 * M, P, beta_dist, v, c, q
    for trial in range(TRIALS):
        # the trial number doubles as the random seed of the run
        num_players = run_model_sim(*params, seed=trial).assign(
            Exposure=exp, Theta_A=theta[-1], Size=M, Seed=trial)
        model_frames.append(num_players)

# same column order as before; `pd.concat` rejects an empty list, so keep an
# empty frame when there is nothing to concatenate (e.g. TRIALS == 0)
model_results = (pd.concat(model_frames)[model_cols] if model_frames
                 else pd.DataFrame(columns=model_cols))

In [None]:
# Reshape the model output to the layout of the network results: one row per
# (run, t, article) with the counts of group A and group B side by side.
ids = [col for col in model_results.columns if col not in group_cols]
dfA = model_results.melt(id_vars=ids, value_vars=['Aa', 'Ab']).rename(columns={'value': 'A'})
dfB = model_results.melt(id_vars=ids, value_vars=['Ba', 'Bb']).rename(columns={'value': 'B'})
# both melts produce the rows in the same order, so the A counts line up with the B counts
dfB['A'] = dfA['A']
# the second letter of 'Ba'/'Bb' identifies the article
dfB['Article'] = dfB['variable'].str[1]

# NOTE: this overwrites `model_results`, so the cell can only be run once per simulation run
model_results = dfB.drop(columns='variable')
model_results['N'] = model_results['A'] + model_results['B']
model_results['Setting'] = 'model'
model_results[SAVED_COLS].to_csv('results/abortion_results_model.csv', index=False)

### Propagation with the labeled network

In [12]:
def total_num_players(G):
    '''Return the number of distinct values in the `User` column of `G`.'''
    return len(G['User'].drop_duplicates())


def group_num_players(G):
    '''Return `(nA, nB)`: the number of distinct users of group A and of group B in `G`.

    `G` has one row per (user, follower) pair, so users are de-duplicated
    before they are counted.
    '''
    members = G[['User', 'Group1']].drop_duplicates()
    return tuple(members.query(condition).shape[0]
                 for condition in ('Group1 == "A"', 'Group1 == "B"'))


def sample_seeds(df, group, size):
    '''Return the rows of `df` that belong to a random sample of users of `group`.

    `size` users are drawn (without replacement, using the pandas default
    random state) as the first users to see the article. Every row of a
    sampled user is returned, i.e. all of its (user, follower) pairs.
    '''
    # a user appears once per follower, so de-duplicate before sampling
    unique_users = df[['User', 'Group1']].drop_duplicates()
    in_group = unique_users.query(f'Group1 == "{group}"')
    seeds = in_group.sample(size).User
    # keep the rows of the seed users
    return df[df.User.isin(seeds)]


def propagate(df, group='A', take_one=True, norepetitions=True, article='Pa', theta=0, steps=T, size=M, seed=0, func=group_num_players):
    '''Evaluates a function on each step of the propagation of an article through the network.

    Generator. For every step `t` in `range(steps)` it yields the tuple
    `(group, article[-1:], theta, size, seed, t) + func(G)`, where `G` holds
    the rows of `df` whose `User` is exposed to the article at that step.
    Once `G` is empty the remaining steps are still yielded, with `func`
    evaluated on the empty frame.

    Parameters
    ----------
    df : DataFrame
        Graph with the columns `User`, `Group1` and `Follower`.
    group : str
        Group ('A' or 'B') the seed users are sampled from. For 'B' the
        value of `theta` is replaced by `1 - theta` (also in the yielded tuples).
    take_one : bool
        If True, keep a single randomly chosen row (follower) per user at each step.
    norepetitions : bool
        If True, users that were already exposed are not exposed again.
    article : str
        'Pa' or 'Pb'; key into the module-level `Pr` / `OPr` tables.
    theta : float
        Fraction of the exposed users whose sharing probability is drawn for
        `article` (flag `showa == 1`); the others use the opposite article.
    steps, size : int
        Number of steps and number of seed users. The defaults are the values
        of `T` and `M` at the time this function is defined.
    seed : int
        Seed of the numpy global random generator.
    func : callable
        Called with `G`; must return a tuple.

    Reads the module-level `beta_dist`, `Gr`, `Pr` and `OPr`.
    '''
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this assignment presumably does not affect the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # users that have already been exposed to the article
    blacklist = set([])
    if group == 'B':
        theta = 1 - theta
    G = sample_seeds(df, group, size).copy()
    for t in range(steps):
        yield (group, article[-1:], theta, size, seed, t,) + func(G)
        
        # advance the propagation by one step (nothing to do once nobody is exposed)
        if not G.empty:
            blacklist.update(set(G.User.tolist()))
            newG = G.copy()
            if take_one:
                # get one friend (follower row) for each user
                newG = newG.groupby('User').sample(1)
                # What happens if two different users are followed by the same person?
                # It's more likely that the article will be shared.
            # Note: we assume that if someone follows n users sharing the article, there are n toss coins for sharing it.
            nusers = len(newG.User.unique())
            
            # fraction theta of users are shown article a
            # (exactly int(theta * nusers) users get the flag, chosen at random)
            na = int(theta * nusers)
            showa = np.zeros(nusers)
            showa[:na] = 1
            np.random.shuffle(showa)
            # factorize numbers the users in order of first appearance, so every
            # row of the same user receives the same flag
            newG.loc[:, 'showa'] = showa[pd.factorize(newG.User)[0]]
            # after sorting, the users appear in the order A/showa=0, A/showa=1,
            # B/showa=0, B/showa=1, which is the order in which `probs` is built below
            newG = newG.sort_values(by=['Group1', 'showa', 'User'])
            
            # draw one sharing probability per user from the beta distribution of
            # its (group, article) pair; showa == 0 uses the opposite article
            # A 0
            sza = len(newG[newG.Group1 == 'A'].User.unique())
            szaa = len(newG[(newG.Group1 == 'A') & (newG.showa == 0)].User.unique())
            a0 = np.random.beta(*beta_dist[(Gr['A'], OPr[article])], size=szaa)
            # A 1
            a1 = np.random.beta(*beta_dist[(Gr['A'], Pr[article])], size=sza - szaa)
            
            # B 0
            szb = len(newG[newG.Group1 == 'B'].User.unique())
            szba = len(newG[(newG.Group1 == 'B') & (newG.showa == 0)].User.unique())
            b0 = np.random.beta(*beta_dist[(Gr['B'], OPr[article])], size=szba)
            # B 1
            b1 = np.random.beta(*beta_dist[(Gr['B'], Pr[article])], size=szb - szba)
            probs = np.concatenate((a0, a1, b0, b1), axis=0)
            # fails if some user has a `Group1` value other than 'A' or 'B'
            assert len(probs) == nusers
            
            # toss a coin for each exposed user to determine whether or not they will share the article
            shared = np.random.binomial(1, p=probs).astype(bool)
            # factorize on the sorted frame matches the order of `probs`
            newG.loc[:, 'Shared'] = shared[pd.factorize(newG.User)[0]]

            # the followers (in the kept rows) of the users who shared are exposed next
            newG = newG[newG.Shared].Follower.unique()
            if norepetitions:
                # drop the users that were already exposed in a previous step
                newG = filter_list(newG, blacklist=blacklist)
            G = df[df.User.isin(newG)].copy()

In [13]:
def repeat_propagation(df, trials, **kwargs):
    '''Chain the results of `trials` propagations, using the trial number as seed.

    `kwargs` are forwarded unchanged to `propagate`.
    '''
    for trial in range(trials):
        yield from propagate(df, seed=trial, **kwargs)

In [14]:
# run experiments
COLS = ['Group', 'Article', 'Theta_A', 'Size', 'Seed', 't', 'A', 'B']

# fair (constrained)
dfa0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', theta=theta_fair[-1], article='Pa'), columns=COLS)
dfb0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', theta=theta_fair[1], article='Pb'), columns=COLS)
df0 = pd.concat((dfa0, dfb0))
df0['Exposure'] = 'fair'

# half exposure
dfahalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', theta=theta_half[-1], article='Pa'), columns=COLS)
dfbhalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', theta=theta_half[1], article='Pb'), columns=COLS)
dfhalf = pd.concat((dfahalf, dfbhalf))
dfhalf['Exposure'] = 'half'

# optimal (unconstrained)
dfaopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', theta=theta_opt[-1], article='Pa'), columns=COLS)
dfbopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', theta=theta_opt[1], article='Pb'), columns=COLS)
dfopt = pd.concat((dfaopt, dfbopt))
dfopt['Exposure'] = 'opt'

network_results = pd.concat((df0, dfhalf, dfopt))
network_results['N'] = network_results.A + network_results.B
network_results['Setting'] = 'network'
network_results[SAVED_COLS].to_csv('results/abortion_results_network.csv', index=False)

In [18]:
# network and model results in a single file, with the same columns
pd.concat(
    [network_results[SAVED_COLS], model_results[SAVED_COLS]]
).to_csv('results/abortion_results.csv', index=False)

### Drop Assumption 1: take only one follower

In [22]:
# run experiments
COLS = ['Group', 'Article', 'Theta_A', 'Size', 'Seed', 't', 'A', 'B']

# fair (constrained)
dfa0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', take_one=False, theta=theta_fair[-1], article='Pa'), columns=COLS)
dfb0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', take_one=False, theta=theta_fair[1], article='Pb'), columns=COLS)
df0 = pd.concat((dfa0, dfb0))
df0['Exposure'] = 'fair'

# half exposure
dfahalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', take_one=False, theta=theta_half[-1], article='Pa'), columns=COLS)
dfbhalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', take_one=False, theta=theta_half[1], article='Pb'), columns=COLS)
dfhalf = pd.concat((dfahalf, dfbhalf))
dfhalf['Exposure'] = 'half'

# optimal (unconstrained)
dfaopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', take_one=False, theta=theta_opt[-1], article='Pa'), columns=COLS)
dfbopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', take_one=False, theta=theta_opt[1], article='Pb'), columns=COLS)
dfopt = pd.concat((dfaopt, dfbopt))
dfopt['Exposure'] = 'opt'

network_results = pd.concat((df0, dfhalf, dfopt))
network_results['N'] = network_results.A + network_results.B
network_results['Setting'] = 'network'
network_results[SAVED_COLS].to_csv('results/abortion_results_network_not_takeone.csv', index=False)

In [23]:
# network (take_one=False) and model results in a single file
pd.concat(
    [network_results[SAVED_COLS], model_results[SAVED_COLS]]
).to_csv('results/abortion_results_not_takeone.csv', index=False)

### Drop Assumption 2: the same user cannot appear more than once

In [24]:
# run experiments
COLS = ['Group', 'Article', 'Theta_A', 'Size', 'Seed', 't', 'A', 'B']

# fair (constrained)
dfa0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', norepetitions=False, theta=theta_fair[-1], article='Pa'), columns=COLS)
dfb0 = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', norepetitions=False, theta=theta_fair[1], article='Pb'), columns=COLS)
df0 = pd.concat((dfa0, dfb0))
df0['Exposure'] = 'fair'

# half exposure
dfahalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', norepetitions=False, theta=theta_half[-1], article='Pa'), columns=COLS)
dfbhalf = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', norepetitions=False, theta=theta_half[1], article='Pb'), columns=COLS)
dfhalf = pd.concat((dfahalf, dfbhalf))
dfhalf['Exposure'] = 'half'

# optimal (unconstrained)
dfaopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='A', norepetitions=False, theta=theta_opt[-1], article='Pa'), columns=COLS)
dfbopt = pd.DataFrame(repeat_propagation(df, TRIALS, group='B', norepetitions=False, theta=theta_opt[1], article='Pb'), columns=COLS)
dfopt = pd.concat((dfaopt, dfbopt))
dfopt['Exposure'] = 'opt'

network_results = pd.concat((df0, dfhalf, dfopt))
network_results['N'] = network_results.A + network_results.B
network_results['Setting'] = 'network'
network_results[SAVED_COLS].to_csv('results/abortion_results_network_no_repetitions.csv', index=False)

In [25]:
# network (norepetitions=False) and model results in a single file
pd.concat(
    [network_results[SAVED_COLS], model_results[SAVED_COLS]]
).to_csv('results/abortion_results_no_repetitions.csv', index=False)