# Political News and the Fair Exposure Problem under Homophily

## All required libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import beta
from matplotlib import pyplot as plt
import re

import random
import platform_opt
from players import *

## Important Parameters

- $T$: Number of timesteps
- $M$: Finite mass of user
- $g \in \{A, B\}$: Political Affliation/ Preference
- $\pi_g \in (0, 1)$: Fraction of users from group $g$ 
- $t \in \{1, \cdots , T\}$:  Discrete Time T $\leq $ M
- $s \in \{a, b\}$: Article sources afliated with $A, B$
- $\theta_{A}, \theta_B \in [0, 1]$: Fraction of users shown an article
- $P_{g,s}$: Probability of users of group $g$ who like an article from source $s$
- $c > 0$: Cost user incur for clicking an article
- $v > 0$: Utility users get for liking an article
- $q_{g} > 0.5$: Probability of in-group propagation 

## Load and clean the dataset:  [Replication Data](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AAI7VA)

### Dataset description borrowed from ReadMe.md of [Replication Data](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/AAI7VA)

#### Top500 

- domain ---> s: domain
- l1 ---> $P_{A}$ : proportion of shares with liberal alignment from s
- r1 ---> $P_{B}$: proportion of shares with conservative alignment from s

The top 500 most shared domains have each been shared at least 44,000 times. 
- T = ?
- |a| < |b|

This set of domains covers over 80% of the URLs classified as hard content that

#### Homophily

- viewer ---> A / B: affiliation of an individual in the sampling frame of the study.
- friend ---> A / B : affiliation of friend.
- frac_of_friends --->: bucketed (in increments of 0.01) fraction of friends with the given affiliation.
- density --->: proportion of friends within the given bucket.



In [2]:
hard_news = ['nytimes.com', 'huffingtonpost.com', 'news.yahoo.com','cnn.com','foxnews.com','video.foxnews.com',
  'foxnewsinsider.com', 'npr.org','theblaze.com','abcnews.go.com', 'msnbc.MSN.com', 'dailycaller.com',
  'examiner.com', 'guardian.co.uk', 'cbsnews.com','politico.com','forbes.com', 'dailykos.com', 'reuters.com',
  'salon.com','motherjones.com', 'latimes.com','slate.com', 'online.wsj.com', 'realclearpolitics.com', 'theatlantic.com',
  'bbc.co.uk', 'businessinsider.com', 'video.msnbc.msn.com', 'thehill.com', 'townhall.com', 'content.usatoday.com',
  'money.cnn.com', 'breitbart.com', 'aljazeera.com', 'thedailybeast.com', 'telegraph.co.uk', 'bloomberg.com',
  'maddowblog.msnbc.com','thenation.com','glennbeck.com', 'talkingpointsmemo.com', 'nationalreview.com','usnews.nbcnews.com',
  'politicalticker.blogs.cnn.com','aclu.org','sfgate.com','chicagotribune.com', 'thegatewaypundit.com','nydailynews.com', 'politifact.com',
  'csmonitor.com', 'rushlimbaugh.com', 'pbs.org', 'usatoday.com','newamerican.com', 'thenewcivilrightsmovement.com', 'front.moveon.org','thinkprogress.org', 'addictinginfo.org',
  'alternet.org', 'rawstory.com',  'infowars.com','newsmax.com', 'rt.com','politicususa.com','truth-out.org', 'commondreams.org',
  'tpnn.com',  'democracynow.org', 'americanthinker.com', 'advocate.com','lifenews.com', 'aclj.org', 'readersupportednews.org',
  'humanevents.com', 'prisonplanet.com', 'godfatherpolitics.com', 'newsbusters.org',
  'nypost.com', 'lifesitenews.com', 'mediamatters.org']

In [3]:
# load the datasets
data_path = "../Datasets/ReplicationforExposure/"
homophily_data = pd.read_csv(data_path+"homophily_density", sep=",", quotechar='"', encoding="utf8")
top_500_data = pd.read_csv(data_path+"top500", sep=",", quotechar='"', encoding="utf8", )

In [4]:
# rename homophily columns
col_names=['Viewer', 'Friend', 'Frac_of_Friends', 'Density']
homophily_data.columns = col_names

In [5]:
# Drop moderates from the homophily data
h = homophily_data.loc[~((homophily_data['Viewer']=='Moderates') | (homophily_data['Friend']=='Moderates'))]
# replace Liberals with A
# replace conservatives with B
homophily = h.copy()
homophily['Viewer'] = homophily.replace(['Liberals'],'A')
homophily['Viewer'] = homophily.replace(['Conservatives'],'B')
homophily['Friend'] = homophily.replace(['Liberal friends'],'A')
homophily['Friend'] = homophily.replace(['Conservatives'],'B')

In [6]:
homophily.shape

(565, 4)

In [7]:
group_A = homophily[homophily.Viewer=='A'][:-36]
group_B = homophily[homophily.Viewer=='B'][:-29]
groups = pd.concat([group_A, group_B], ignore_index=True)

In [8]:
# remove very liberal, moderates, and very conservatives from top500 data
domains = top_500_data.drop(['l2', 'avg_align', 'n', 'r2'], axis=1)
# replace liberal with A and conservative with B
domains.columns = ['s', '$P_{A}$', '$P_{B}$']

In [9]:
for dom in hard_news:
    domains.replace(re.compile('.*'+dom+'.*'), 'a', inplace=True)

domains['s'].mask(domains['s'] != 'a', 'b', inplace=True)


In [14]:
domains.shape

(500, 3)

In [15]:
data = pd.concat([domains, groups], axis=1)
dataset = data.sample(frac=1).reset_index(drop=True)
dataset

Unnamed: 0,s,$P_{A}$,$P_{B}$,Viewer,Friend,Frac_of_Friends,Density
0,b,0.4480,0.0515,A,A,0.00,2.252525e-02
1,b,0.0116,0.7375,B,B,0.75,8.962681e-04
2,b,0.1501,0.0041,A,A,0.77,1.564143e-02
3,a,0.0003,0.1266,B,B,0.51,9.802047e-03
4,b,0.0006,0.5611,B,B,0.12,1.706593e-03
...,...,...,...,...,...,...,...
495,a,0.0033,0.1678,B,B,0.17,1.842591e-03
496,b,0.1902,0.1653,B,B,0.01,7.852262e-03
497,b,0.2861,0.0960,A,A,0.32,1.402257e-02
498,b,0.1559,0.0411,A,A,0.85,2.628016e-07


### ToDo

- Based on today's homophily discussion decide whether to continue with this or no
- Fill the dataset and rename appropriately
- Graph of mass of articles shown over time, with Paa and Pbb
- Finish the baseline setting, variable parameters, etc
- Pefrom optimization 
