<a href="https://colab.research.google.com/github/ldocarvalho/portals-bias/blob/main/portals_bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Data

In [1]:
import pandas as pd

In [2]:
df_names = ["Source","Title","Content","Bias"]
df_news_aggregator = pd.read_csv('https://raw.githubusercontent.com/ldocarvalho/portals-bias/main/Dataset/News-Aggregator-14/news-aggregator.csv?token=AKX52J7EV5Y2TFAPAPLJ3Q3BYDMV2', names = df_names, skiprows = 1, header = None)
df_news_aggregator

Unnamed: 0,Source,Title,Content,Bias
0,wsj.com,"EU Week Ahead March 10-14: Bank Resolution, Tr...",The European Union’s to-do list isn’t long thi...,right-center
1,wsj.com,ECB's Noyer not Happy With Euro Strength -- Up...,News Corp is a network of leading companies in...,right-center
2,seattlepi.com,"EBay, Icahn talk smack",Follow @csaid EBay has come out swinging again...,left-center
3,techcrunch.com,EBay Asks Shareholders To Vote Against PayPal ...,The war of words between eBay and its new shar...,left-center
4,valuewalk.com,Carl Icahn: Donahoe Cost eBay Investors More T...,Carl Icahn has again posted an open letter on ...,right-center
...,...,...,...,...
1795,newsone.com,NewsOne Minute: Chris Rock Interviews White Pe...,Chris Rock Interviews White People at a Monste...,left-center
1796,twitchy.com,Chart-topping songwriter Diane Warren: Hope so...,Texas cheerleader and big-game hunter Kendall ...,right
1797,twitchy.com,Reality star Joanna Krupa would love to see ‘t...,Model Joanna Krupa is known for her stints on ...,right
1798,austinchronicle.com,Elaine Stritch (1925-2014),"During her seven decades as a ""Broadway Baby"" ...",left


In [5]:
df_news_july = pd.read_csv('https://raw.githubusercontent.com/ldocarvalho/portals-bias/main/Dataset/News-July-19/news-july.csv?token=AKX52J5KQ3NIC6K7PWM7XPLBYDNKA', names = df_names, skiprows = 1, header = None)
df_news_july

Unnamed: 0,Source,Title,Content,Bias
0,berkeleyside,"ArchivesJune 10, 2019",When the homeowner did not voluntarily repair ...,left-center
1,berkeleyside,Berkeley homeowner caught in costly code viola...,"Finger-pointing, confusion, worry and mistrust...",left-center
2,berkeleyside,"A small-business success story, Yalis Caf cele...","Yali's Caf 1920 Oxford St. (at Berkeley Way), ...",left-center
3,berkeleyside,Shop Talk: Bluemercury; Airport Home Appliance...,BLUEMERCURY A new business is opening in the o...,left-center
4,berkeleyside,New vision plan imagines 3-mile greenway from ...,Imagine a bike and pedestrian path stretching ...,left-center
...,...,...,...,...
4955,theblaze,Bill Cosbys official Instagram account shares ...,The official Instagram account of actor Bill C...,right
4956,theblaze,WATCH: Students condemn racist remarks they be...,Students at Marymount University in Virginia b...,right
4957,theblaze,Republicans propose constitutional amendment t...,President Donald Trump is supporting a propose...,right
4958,theblaze,108 GOP lawmakers sign letter asking Trump to ...,More than a hundred Republican members of Cong...,right


### 2. Text Pre-processing

In [6]:
def to_lower(text):
  return text.lower()

In [7]:
import re

def remove_numbers(text):
  return re.sub(r'\d+', '', text)

In [8]:
import string
import numpy as np

def remove_puctuation(text):
  symbols = "!\"#$%&()'*+-.,—/:;<=>?@[\]^_`{|}~\n“‘"
  for i in symbols:
    text = text.replace(i, '')
  return text

In [9]:
def preprocess(data):
    data = to_lower(data)
    data = remove_puctuation(data)
    data = remove_numbers(data)
    return data

In [10]:
data = df_news_aggregator.iloc[3]['Content']
data = preprocess(data)
data

'the war of words between ebay and its new shareholder carl icahn continues to rage on today ebay issued a notice of an upcoming annual meeting in which the company asked shareholders flat out to vote against icahn’s proposal to spin off paypal meanwhile icahn issued yet another open letter against current ebay management in which he accused ceo john donahoe of inexcusable incompetence” that cost stockholders  billion carl icahn has also proposed adding two employees from the icahn group to ebay’s board icahn currently owns  of ebay as of february  ebay has also asked shareholders to reject that proposal on the subject of new board members ebay notes the board of directors does not endorse any icahn group nominee or the icahn proposal and unanimously recommends that you vote on the white proxy card or voting instruction form for all’ of the nominees proposed by the board of directors and against’ the icahn proposal” michael r jacobson company secretary writes in the sec filing the boar

### 3. Readability

#### Utils

In [11]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.2-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.0 MB/s 
[?25hCollecting pyphen
  Downloading pyphen-0.12.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 28.5 MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.12.0 textstat-0.7.2


In [12]:
import spacy
from textstat.textstat import textstatistics,legacy_round

def break_sentences(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	return list(doc.sents)

def word_count(text):
	sentences = break_sentences(text)
	words = 0
	for sentence in sentences:
		words += len([token for token in sentence])
	return words

def sentence_count(text):
	sentences = break_sentences(text)
	return len(sentences)

def avg_sentence_length(text):
	words = word_count(text)
	sentences = sentence_count(text)
	average_sentence_length = float(words / sentences)
	return average_sentence_length

def syllables_count(word):
	return textstatistics().syllable_count(word)

def avg_syllables_per_word(text):
	syllable = syllables_count(text)
	words = word_count(text)
	ASPW = float(syllable) / float(words)
	return legacy_round(ASPW, 1)

def difficult_words(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)

	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [str(token) for token in sentence]

	diff_words_set = set()
	
	for word in words:
		syllable_count = syllables_count(word)
		if word not in nlp.Defaults.stop_words and syllable_count >= 2:
			diff_words_set.add(word)

	return len(diff_words_set)

def poly_syllable_count(text):
	count = 0
	words = []
	sentences = break_sentences(text)
	for sentence in sentences:
		words += [token for token in sentence]
	

	for word in words:
		syllable_count = syllables_count(word)
		if syllable_count >= 3:
			count += 1
	return count

#### Flesch-Kincaid

In [13]:
def flesch_reading_ease(text):
	FRE = 206.835 - float(1.015 * avg_sentence_length(text)) -\
		float(84.6 * avg_syllables_per_word(text))
	return legacy_round(FRE, 2)

#### Dale-Chall

In [14]:
def dale_chall_readability_score(text):
  words = word_count(text)
  difficult_words_count = difficult_words(text)
  count = words - difficult_words_count
  
  if words > 0:
    per = float(count) / float(words) * 100
  
  diff_words = 100 - per
  raw_score = (0.1579 * diff_words) + (0.0496 * avg_sentence_length(text))
  
  if diff_words > 5:	
    raw_score += 3.6365
  
  return legacy_round(raw_score, 2)

#### Gunning Fog

In [100]:
def gunning_fog_score(text):
	per_diff_words = ((difficult_words(text) / word_count(text)) * 100) + 5
	grade = 0.4 * (avg_sentence_length(text) + per_diff_words)
	return grade

#### SMOG

In [16]:
def smog_index(text):
	if sentence_count(text) >= 3:
		poly_syllab = poly_syllable_count(text)
		SMOG = (1.043 * (30*(poly_syllab / sentence_count(text)))**0.5) \
				+ 3.1291
		return legacy_round(SMOG, 1)
	else:
		return 0

#### Tests

In [17]:
text_to_test = df_news_aggregator.loc[1000]['Content']

In [18]:
flesch_reading_ease(text_to_test)

91.57

In [19]:
dale_chall_readability_score(text_to_test)

6.84

In [101]:
gunning_fog_score(text_to_test)

16.121428571428574

### 4. n-grams

In [22]:
import re

def generate_ngrams(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [23]:
unigram = generate_ngrams(text_to_test, 1)
bigram = generate_ngrams(text_to_test, 2)

### 5. Information Theory Approach to detect media bias

#### Testes News July

##### 5.1.3.1. Computar importância de todos os termos e descartar os de menor frequência

In [24]:
corpus_news_aggregator_left = ""
corpus_news_aggregator_right = ""

for index, row in df_news_july.iterrows():
    text = row['Content']
    if 'left' in row['Bias']:
        corpus_news_aggregator_left += preprocess(text)
    else: 
        corpus_news_aggregator_right += preprocess(text)

In [25]:
bag_of_words_news_aggregator_left = corpus_news_aggregator_left.split(' ')

In [26]:
bag_of_words_news_aggregator_right = corpus_news_aggregator_right.split(' ')

In [27]:
bag_of_words = set(bag_of_words_news_aggregator_left).union(set(bag_of_words_news_aggregator_right))

In [28]:
number_of_words_left = dict.fromkeys(bag_of_words, 0)
for word in bag_of_words_news_aggregator_left:
    number_of_words_left[word] += 1
df_left = pd.DataFrame(list(number_of_words_left.items()),columns = ['word','occ-left'])

number_of_words_right = dict.fromkeys(bag_of_words, 0)
for word in bag_of_words_news_aggregator_right:
    number_of_words_right[word] += 1
df_right = pd.DataFrame(list(number_of_words_right.items()),columns = ['word','occ-right'])

df_words = pd.merge(df_left, df_right, on='word')
df_words

Unnamed: 0,word,occ-left,occ-right
0,,39579,35280
1,vocabulary,3,7
2,contestnewport,1,0
3,windsorwest,1,0
4,premarital,1,0
...,...,...,...
94556,misfire,0,1
94557,magkaroon,1,0
94558,civilunions,1,0
94559,metro,59,60


In [29]:
index = df_words[ df_words['occ-left'] + df_words['occ-right'] < 10 ].index
df_words.drop(index, inplace = True)
df_words

Unnamed: 0,word,occ-left,occ-right
0,,39579,35280
1,vocabulary,3,7
13,vatican,20,63
16,viii,1,9
17,initial,103,101
...,...,...,...
94536,translator,10,1
94537,abruptly,15,10
94545,infiltrated,8,2
94547,coroner,11,2


##### 5.1.3.2. Construção da função de probabilidade do termo t sobre as classes

In [30]:
pd.options.mode.chained_assignment = None  # default='warn'

df_words['perc-left'] = 0.0
df_words['perc-right'] = 0.0

for i in df_words.index:
  total = df_words['occ-left'][i] + df_words['occ-right'][i]
  df_words['perc-left'][i] = round(float(df_words['occ-left'][i] / total), 3)
  df_words['perc-right'][i] = round(float(df_words['occ-right'][i] / total), 3)

df_words

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right
0,,39579,35280,0.529,0.471
1,vocabulary,3,7,0.300,0.700
13,vatican,20,63,0.241,0.759
16,viii,1,9,0.100,0.900
17,initial,103,101,0.505,0.495
...,...,...,...,...,...
94536,translator,10,1,0.909,0.091
94537,abruptly,15,10,0.600,0.400
94545,infiltrated,8,2,0.800,0.200
94547,coroner,11,2,0.846,0.154


##### 5.1.3.3. Calcular a entropia de cada termo

In [31]:
df_words['shannon-entropy'] = 0.0
df_words

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right,shannon-entropy
0,,39579,35280,0.529,0.471,0.0
1,vocabulary,3,7,0.300,0.700,0.0
13,vatican,20,63,0.241,0.759,0.0
16,viii,1,9,0.100,0.900,0.0
17,initial,103,101,0.505,0.495,0.0
...,...,...,...,...,...,...
94536,translator,10,1,0.909,0.091,0.0
94537,abruptly,15,10,0.600,0.400,0.0
94545,infiltrated,8,2,0.800,0.200,0.0
94547,coroner,11,2,0.846,0.154,0.0


In [32]:
import numpy
import sys
numpy.seterr(divide = 'warn') 

def calculate_shannon_entropy(probs):
  sum = 0.0
  for prob in probs:
    if prob != 0:
      sum -= prob * numpy.log(prob)
  return sum / numpy.log(2)

In [33]:
df_words['shannon-entropy'] = 0.0

for i in df_words.index:
  shannon_entropy = round(calculate_shannon_entropy([df_words['perc-left'][i], df_words['perc-right'][i]]), 3)
  df_words['shannon-entropy'][i] = shannon_entropy

df_words

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right,shannon-entropy
0,,39579,35280,0.529,0.471,0.998
1,vocabulary,3,7,0.300,0.700,0.881
13,vatican,20,63,0.241,0.759,0.797
16,viii,1,9,0.100,0.900,0.469
17,initial,103,101,0.505,0.495,1.000
...,...,...,...,...,...,...
94536,translator,10,1,0.909,0.091,0.440
94537,abruptly,15,10,0.600,0.400,0.971
94545,infiltrated,8,2,0.800,0.200,0.722
94547,coroner,11,2,0.846,0.154,0.620


In [34]:
df_trump = df_words.loc[df_words['word'] == 'trump']
df_trump

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right,shannon-entropy
43229,trump,3951,2443,0.618,0.382,0.959


In [35]:
df_soros = df_words.loc[df_words['word'] == 'soros']
df_soros

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right,shannon-entropy
87643,soros,4,38,0.095,0.905,0.453


##### 5.1.3.4. Selecionar vocabulário de referência

In [36]:
df_reference_vocabulary = df_words.nsmallest(10000, ['shannon-entropy'])
df_reference_vocabulary

Unnamed: 0,word,occ-left,occ-right,perc-left,perc-right,shannon-entropy
73,osp,0,22,0.000,1.000,0.000
82,rebootcamp,0,16,0.000,1.000,0.000
121,rutgers,10,0,1.000,0.000,0.000
155,shreves,0,21,0.000,1.000,0.000
156,modulebodytextall,0,22,0.000,1.000,0.000
...,...,...,...,...,...,...
82332,awkward,22,14,0.611,0.389,0.964
83310,chanted,7,11,0.389,0.611,0.964
84748,livelihood,11,7,0.611,0.389,0.964
85167,randomly,11,7,0.611,0.389,0.964


##### 5.1.4 Modelagem da função de probabilidade de todos os termos t ∈ VR

In [37]:
df_news_july

Unnamed: 0,Source,Title,Content,Bias
0,berkeleyside,"ArchivesJune 10, 2019",When the homeowner did not voluntarily repair ...,left-center
1,berkeleyside,Berkeley homeowner caught in costly code viola...,"Finger-pointing, confusion, worry and mistrust...",left-center
2,berkeleyside,"A small-business success story, Yalis Caf cele...","Yali's Caf 1920 Oxford St. (at Berkeley Way), ...",left-center
3,berkeleyside,Shop Talk: Bluemercury; Airport Home Appliance...,BLUEMERCURY A new business is opening in the o...,left-center
4,berkeleyside,New vision plan imagines 3-mile greenway from ...,Imagine a bike and pedestrian path stretching ...,left-center
...,...,...,...,...
4955,theblaze,Bill Cosbys official Instagram account shares ...,The official Instagram account of actor Bill C...,right
4956,theblaze,WATCH: Students condemn racist remarks they be...,Students at Marymount University in Virginia b...,right
4957,theblaze,Republicans propose constitutional amendment t...,President Donald Trump is supporting a propose...,right
4958,theblaze,108 GOP lawmakers sign letter asking Trump to ...,More than a hundred Republican members of Cong...,right


In [38]:
df_portals_content_names = ["Source", "Content", "Bias"]
df_portals_content = pd.DataFrame(columns = df_portals_content_names)
df_portals_content

Unnamed: 0,Source,Content,Bias


In [39]:
last_source = "berkeleyside"
df_portals_content = df_portals_content.append({'Source': last_source, 'Content': "", "Bias": "left"}, ignore_index=True)

for index, row in df_news_july.iterrows():
  source = row['Source']
  bias = ""
  if "left" in row['Bias']:
    bias = "left"
  else:
    bias = "right"
  if source != last_source:
    df_portals_content = df_portals_content.append({'Source': source, 'Content': "", "Bias": bias}, ignore_index=True)
    last_source = source

df_portals_content

Unnamed: 0,Source,Content,Bias
0,berkeleyside,,left
1,migrationpolicy,,left
2,deadline,,left
3,washingtonpress,,left
4,thehindubusinessline,,right
...,...,...,...
243,sun-sentinel,,right
244,tasnimnews,,right
245,nccivitas,,right
246,mexiconewsdaily,,left


In [40]:
for index, row in df_portals_content.iterrows():
  source = row['Source']
  content = ""
  rows_1 = df_news_july[df_news_july['Source'] == source]
  for index_1, row_1 in rows_1.iterrows():
    content += preprocess(row_1['Content'])
  df_portals_content.at[index,'Content'] = content

In [41]:
df_portals_content

Unnamed: 0,Source,Content,Bias
0,berkeleyside,when the homeowner did not voluntarily repair ...,left
1,migrationpolicy,mpis us immigration policy program analyzes us...,left
2,deadline,fifty years agoon may to be exactunited arti...,left
3,washingtonpress,k shares share this story a day after the unpr...,left
4,thehindubusinessline,a total of five girls of power generating comp...,right
...,...,...,...
243,sun-sentinel,renee steele rosomoff of hollywood florida for...,right
244,tasnimnews,at the moment we are forming bilateral meeting...,right
245,nccivitas,february by ray nothstine simple share butto...,right
246,mexiconewsdaily,a federal judge today ordered three former pem...,left


In [42]:
df_portals_names = ["Source", "Bias"]
df_portals = pd.DataFrame(columns = df_portals_names)
df_portals

Unnamed: 0,Source,Bias


In [43]:
from collections import Counter

for index, row in df_portals_content.iterrows():
  df_portals.at[index, 'Source'] = row['Source']
  bias = ""
  if "left" in row['Bias']:
    bias = "left"
  else:
    bias = "right"
  df_portals.at[index, 'Bias'] = bias
  portal_content = row['Content']
  for index_1, row_1 in df_reference_vocabulary.iterrows():
    word = row_1['word']
    if row['Source'] == "berkeleyside":
      df_portals[word] = 0.2
    #counter = Counter(portal_content)
    frequency = portal_content.count(word)
    df_portals.at[index, word] = frequency

In [44]:
df_portals

Unnamed: 0,Source,Bias,osp,rebootcamp,rutgers,shreves,modulebodytextall,zullo,snowe,legasov,vollmann,namemodulesnewslettermodule,infothetyeeca,ignatius,kochs,guatemalans,chastity,technion,addservicegoogletagpubads,rfk,duluth,targettype,quintas,haidar,alshabaab,protagonists,lovebugs,siteid,lazarous,quinta,federallyinsured,biodegradable,hibakusha,rhs,sirota,cull,churchmilitantcom,pint,caketo,jacaranda,...,drastic,elijah,discriminated,vocal,betting,displays,imagined,pitching,delegate,accepted,proxies,sandy,redistribution,entities,mockery,innocence,features,planted,categorically,negligence,beverages,brushed,logically,boy,idiots,prominence,ramirez,hygiene,cookie,tying,cake,contends,competed,expulsion,multitude,awkward,chanted,livelihood,randomly,pale
0,berkeleyside,left,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,migrationpolicy,left,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,deadline,left,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,washingtonpress,left,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
4,thehindubusinessline,right,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,sun-sentinel,right,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,tasnimnews,right,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0
245,nccivitas,right,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246,mexiconewsdaily,left,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [45]:
for index, row in df_portals.iterrows():
  sum_row = row[2:].sum()
  for name, value in row.iteritems():
    if type(value) == float:
      df_portals.at[index, name] = value / sum_row

df_portals

Unnamed: 0,Source,Bias,osp,rebootcamp,rutgers,shreves,modulebodytextall,zullo,snowe,legasov,vollmann,namemodulesnewslettermodule,infothetyeeca,ignatius,kochs,guatemalans,chastity,technion,addservicegoogletagpubads,rfk,duluth,targettype,quintas,haidar,alshabaab,protagonists,lovebugs,siteid,lazarous,quinta,federallyinsured,biodegradable,hibakusha,rhs,sirota,cull,churchmilitantcom,pint,caketo,jacaranda,...,drastic,elijah,discriminated,vocal,betting,displays,imagined,pitching,delegate,accepted,proxies,sandy,redistribution,entities,mockery,innocence,features,planted,categorically,negligence,beverages,brushed,logically,boy,idiots,prominence,ramirez,hygiene,cookie,tying,cake,contends,competed,expulsion,multitude,awkward,chanted,livelihood,randomly,pale
0,berkeleyside,left,0.000140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000047,0.000023,0.000000,0.000047,0.0,0.000000,0.0,0.000023,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000047,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000023
1,migrationpolicy,left,0.000149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000037,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000037,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000037,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000037,0.0,0.000000
2,deadline,left,0.000042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000042,0.000042,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000504,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000042,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,washingtonpress,left,0.000117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000029,0.0,0.0,0.000029,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000059,0.0,0.000029,0.0,0.0,0.0,0.0,0.000000,0.000029,0.000000,0.0,0.000000,0.0,0.0,0.000029,0.0,0.0,0.0,0.000000,0.000000,0.000029,0.000029,0.0,0.000000,0.0,0.000088
4,thehindubusinessline,right,0.000171,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000029,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000029,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000029,0.000000,0.0,0.000029,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,sun-sentinel,right,0.000240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000240,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000120,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
244,tasnimnews,right,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000054,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000054,0.000216,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.003018
245,nccivitas,right,0.000914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000022,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000022,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
246,mexiconewsdaily,left,0.000057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000057,0.000057,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000057,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000057,0.0,0.000057


##### 5.1.4 Modelagem da função de probabilidade das classes de viés

In [46]:
df_portals_reference_names = ["Bias", "Content"]
df_portals_reference = pd.DataFrame(columns = df_portals_reference_names)
df_portals_reference = df_portals_reference.append({"Bias": "left", 'Content': corpus_news_aggregator_left}, ignore_index=True)
df_portals_reference = df_portals_reference.append({"Bias": "right", 'Content': corpus_news_aggregator_right,}, ignore_index=True)
df_portals_reference

Unnamed: 0,Bias,Content
0,left,when the homeowner did not voluntarily repair ...
1,right,a total of five girls of power generating comp...


In [47]:
df_portals_prob_names = ["Bias"]
df_portals_reference_prob = pd.DataFrame(columns = df_portals_prob_names)
df_portals_reference_prob

Unnamed: 0,Bias


In [48]:
from collections import Counter

for index, row in df_portals_reference.iterrows():
  df_portals_reference_prob.at[index, 'Bias'] = row['Bias']
  portal_content = row['Content']
  for index_1, row_1 in df_reference_vocabulary.iterrows():
    word = row_1['word']
    if row['Bias'] == "left":
      df_portals_reference_prob[word] = 0.2
    frequency = portal_content.count(word)
    df_portals_reference_prob.at[index, word] = frequency

df_portals_reference_prob

Unnamed: 0,Bias,osp,rebootcamp,rutgers,shreves,modulebodytextall,zullo,snowe,legasov,vollmann,namemodulesnewslettermodule,infothetyeeca,ignatius,kochs,guatemalans,chastity,technion,addservicegoogletagpubads,rfk,duluth,targettype,quintas,haidar,alshabaab,protagonists,lovebugs,siteid,lazarous,quinta,federallyinsured,biodegradable,hibakusha,rhs,sirota,cull,churchmilitantcom,pint,caketo,jacaranda,aung,...,drastic,elijah,discriminated,vocal,betting,displays,imagined,pitching,delegate,accepted,proxies,sandy,redistribution,entities,mockery,innocence,features,planted,categorically,negligence,beverages,brushed,logically,boy,idiots,prominence,ramirez,hygiene,cookie,tying,cake,contends,competed,expulsion,multitude,awkward,chanted,livelihood,randomly,pale
0,left,896.0,0.0,10.0,0.0,0.0,0.0,19.0,10.0,19.0,0.0,16.0,0.0,26.0,0.0,0.0,0.0,0.0,10.0,0.0,11.0,0.0,10.0,10.0,10.0,13.0,0.0,17.0,2.0,12.0,24.0,0.0,0.0,22.0,24.0,0.0,156.0,20.0,0.0,68.0,...,43.0,7.0,11.0,61.0,16.0,23.0,39.0,22.0,47.0,96.0,7.0,24.0,12.0,110.0,12.0,14.0,150.0,17.0,7.0,7.0,7.0,11.0,45.0,624.0,7.0,11.0,8.0,11.0,56.0,18.0,104.0,7.0,7.0,13.0,8.0,24.0,52.0,23.0,11.0,341.0
1,right,703.0,16.0,0.0,21.0,22.0,32.0,0.0,0.0,0.0,22.0,0.0,20.0,0.0,19.0,29.0,11.0,23.0,2.0,35.0,0.0,10.0,0.0,0.0,0.0,0.0,46.0,0.0,51.0,0.0,0.0,12.0,35.0,0.0,9.0,12.0,61.0,0.0,80.0,0.0,...,29.0,12.0,8.0,38.0,14.0,14.0,24.0,33.0,35.0,61.0,11.0,16.0,8.0,50.0,7.0,23.0,95.0,16.0,11.0,11.0,11.0,7.0,24.0,343.0,12.0,7.0,11.0,7.0,59.0,15.0,69.0,11.0,11.0,9.0,39.0,19.0,13.0,16.0,7.0,182.0


In [49]:
for index, row in df_portals_reference_prob.iterrows():
  sum_row = row[1:].sum()
  for name, value in row.iteritems():
    if type(value) == float:
      df_portals_reference_prob.at[index, name] = value / sum_row

df_portals_reference_prob

Unnamed: 0,Bias,osp,rebootcamp,rutgers,shreves,modulebodytextall,zullo,snowe,legasov,vollmann,namemodulesnewslettermodule,infothetyeeca,ignatius,kochs,guatemalans,chastity,technion,addservicegoogletagpubads,rfk,duluth,targettype,quintas,haidar,alshabaab,protagonists,lovebugs,siteid,lazarous,quinta,federallyinsured,biodegradable,hibakusha,rhs,sirota,cull,churchmilitantcom,pint,caketo,jacaranda,aung,...,drastic,elijah,discriminated,vocal,betting,displays,imagined,pitching,delegate,accepted,proxies,sandy,redistribution,entities,mockery,innocence,features,planted,categorically,negligence,beverages,brushed,logically,boy,idiots,prominence,ramirez,hygiene,cookie,tying,cake,contends,competed,expulsion,multitude,awkward,chanted,livelihood,randomly,pale
0,left,0.000169,0.0,2e-06,0.0,0.0,0.0,4e-06,2e-06,4e-06,0.0,3e-06,0.0,5e-06,0.0,0.0,0.0,0.0,1.8909e-06,0.0,2e-06,0.0,2e-06,2e-06,2e-06,2e-06,0.0,3e-06,3.781801e-07,2e-06,5e-06,0.0,0.0,4e-06,5e-06,0.0,2.9e-05,4e-06,0.0,1.3e-05,...,8e-06,1e-06,2e-06,1.2e-05,3e-06,4e-06,7e-06,4e-06,9e-06,1.8e-05,1e-06,5e-06,2e-06,2.1e-05,2e-06,3e-06,2.8e-05,3e-06,1e-06,1e-06,1e-06,2e-06,9e-06,0.000118,1e-06,2e-06,2e-06,2e-06,1.1e-05,3e-06,2e-05,1e-06,1e-06,2e-06,2e-06,5e-06,1e-05,4e-06,2e-06,6.4e-05
1,right,0.000176,4e-06,0.0,5e-06,6e-06,8e-06,0.0,0.0,0.0,6e-06,0.0,5e-06,0.0,5e-06,7e-06,3e-06,6e-06,5.003338e-07,9e-06,0.0,3e-06,0.0,0.0,0.0,0.0,1.2e-05,0.0,1.275851e-05,0.0,0.0,3e-06,9e-06,0.0,2e-06,3e-06,1.5e-05,0.0,2e-05,0.0,...,7e-06,3e-06,2e-06,1e-05,4e-06,4e-06,6e-06,8e-06,9e-06,1.5e-05,3e-06,4e-06,2e-06,1.3e-05,2e-06,6e-06,2.4e-05,4e-06,3e-06,3e-06,3e-06,2e-06,6e-06,8.6e-05,3e-06,2e-06,3e-06,2e-06,1.5e-05,4e-06,1.7e-05,3e-06,3e-06,2e-06,1e-05,5e-06,3e-06,4e-06,2e-06,4.6e-05


##### 5.1.5 Computar dissimilaridade entre portais e classes de viés

###### Utils

In [50]:
from math import log2
from math import sqrt
from numpy import asarray
 
def kl_divergence(p, q):
	return sum(p[i] * log2(p[i]/q[i]) if q[i] != 0 and p[i] != 0 else p[i] for i in range(len(p)))

'''
def js_divergence(p, q):
	m = 0.5 * (p + q)
	return 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m)
'''

def js_divergence(p, q):
  # calculate_shannon_entropy(probs)
  m = 0.5 * (p + q)
  a = (calculate_shannon_entropy(p) + calculate_shannon_entropy(q)) / 2
  b = calculate_shannon_entropy(m)
  return a - b
 
p = asarray([0.10, 0.40, 0.50])
q = asarray([0.80, 0.15, 0.05])

js_pq = js_divergence(p, q)
print('JS(P || Q) divergence: %.3f bits' % js_pq)

js_qp = js_divergence(q, p)
print('JS(Q || P) divergence: %.3f bits' % js_qp)

JS(P || Q) divergence: -0.420 bits
JS(Q || P) divergence: -0.420 bits


###### Dissimilaridade de portais

In [51]:
df_names = ["Source", "Divergence-Left", "Divergence-Right", "Bias"]
df_portals_divergence = pd.DataFrame(columns = df_names)
df_portals_divergence

Unnamed: 0,Source,Divergence-Left,Divergence-Right,Bias


In [52]:
for index, row in df_portals.iterrows():
  source = row['Source']
  bias = row['Bias']
  values = row.iloc[2:].values
  div_left = 0.0
  div_right = 0.0
  for index_1, row_1 in df_portals_reference_prob.iterrows():
    if index_1 == 0:
      values_left = row_1.iloc[1:].values
      div_left = js_divergence(values, values_left)
    else:
      values_right = row_1.iloc[1:].values
      div_right = js_divergence(values, values_right)
  df_portals_divergence = df_portals_divergence.append({'Source': source, 'Divergence-Left': div_left, 'Divergence-Right': div_right, "Bias": bias}, ignore_index=True)

df_portals_divergence

Unnamed: 0,Source,Divergence-Left,Divergence-Right,Bias
0,berkeleyside,-0.057572,-0.057251,left
1,migrationpolicy,-0.097006,-0.097514,left
2,deadline,-0.067894,-0.066433,left
3,washingtonpress,-0.060350,-0.059444,left
4,thehindubusinessline,-0.100337,-0.094737,right
...,...,...,...,...
243,sun-sentinel,-0.097275,-0.091164,right
244,tasnimnews,-0.076542,-0.073146,right
245,nccivitas,-0.067241,-0.062980,right
246,mexiconewsdaily,-0.075912,-0.074011,left


##### 5.1.6 Classificação

In [53]:
X = df_portals_divergence.drop(['Source', 'Bias'], axis=1)
y = df_portals_divergence['Bias']

print(X)

     Divergence-Left  Divergence-Right
0          -0.057572         -0.057251
1          -0.097006         -0.097514
2          -0.067894         -0.066433
3          -0.060350         -0.059444
4          -0.100337         -0.094737
..               ...               ...
243        -0.097275         -0.091164
244        -0.076542         -0.073146
245        -0.067241         -0.062980
246        -0.075912         -0.074011
247        -0.073935         -0.069058

[248 rows x 2 columns]


In [54]:
print(y)

0       left
1       left
2       left
3       left
4      right
       ...  
243    right
244    right
245    right
246     left
247    right
Name: Bias, Length: 248, dtype: object


In [86]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

cv = LeaveOneOut()
model = SVC(kernel='rbf', probability=True, C=1)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.871 (0.335)
