<a href="https://colab.research.google.com/github/ldocarvalho/portals-bias/blob/main/portals_bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dados

In [5]:
import pandas as pd

In [9]:
df_names = ["Source","Title","Content","Bias"]
df_news_aggregator = pd.read_csv('https://raw.githubusercontent.com/ldocarvalho/portals-bias/main/Dataset/News-Aggregator-14/news-aggregator.csv?token=AKX52J7EV5Y2TFAPAPLJ3Q3BYDMV2', names = df_names, skiprows = 1, header = None)
df_news_aggregator

Unnamed: 0,Source,Title,Content,Bias
0,wsj.com,"EU Week Ahead March 10-14: Bank Resolution, Tr...",The European Union’s to-do list isn’t long thi...,right-center
1,wsj.com,ECB's Noyer not Happy With Euro Strength -- Up...,News Corp is a network of leading companies in...,right-center
2,seattlepi.com,"EBay, Icahn talk smack",Follow @csaid EBay has come out swinging again...,left-center
3,techcrunch.com,EBay Asks Shareholders To Vote Against PayPal ...,The war of words between eBay and its new shar...,left-center
4,valuewalk.com,Carl Icahn: Donahoe Cost eBay Investors More T...,Carl Icahn has again posted an open letter on ...,right-center
...,...,...,...,...
1795,newsone.com,NewsOne Minute: Chris Rock Interviews White Pe...,Chris Rock Interviews White People at a Monste...,left-center
1796,twitchy.com,Chart-topping songwriter Diane Warren: Hope so...,Texas cheerleader and big-game hunter Kendall ...,right
1797,twitchy.com,Reality star Joanna Krupa would love to see ‘t...,Model Joanna Krupa is known for her stints on ...,right
1798,austinchronicle.com,Elaine Stritch (1925-2014),"During her seven decades as a ""Broadway Baby"" ...",left


In [10]:
df_news_july = pd.read_csv('https://raw.githubusercontent.com/ldocarvalho/portals-bias/main/Dataset/News-July-19/news-july.csv?token=AKX52J5KQ3NIC6K7PWM7XPLBYDNKA', names = df_names, skiprows = 1, header = None)
df_news_july

Unnamed: 0,Source,Title,Content,Bias
0,berkeleyside,"ArchivesJune 10, 2019",When the homeowner did not voluntarily repair ...,left-center
1,berkeleyside,Berkeley homeowner caught in costly code viola...,"Finger-pointing, confusion, worry and mistrust...",left-center
2,berkeleyside,"A small-business success story, Yalis Caf cele...","Yali's Caf 1920 Oxford St. (at Berkeley Way), ...",left-center
3,berkeleyside,Shop Talk: Bluemercury; Airport Home Appliance...,BLUEMERCURY A new business is opening in the o...,left-center
4,berkeleyside,New vision plan imagines 3-mile greenway from ...,Imagine a bike and pedestrian path stretching ...,left-center
...,...,...,...,...
4955,theblaze,Bill Cosbys official Instagram account shares ...,The official Instagram account of actor Bill C...,right
4956,theblaze,WATCH: Students condemn racist remarks they be...,Students at Marymount University in Virginia b...,right
4957,theblaze,Republicans propose constitutional amendment t...,President Donald Trump is supporting a propose...,right
4958,theblaze,108 GOP lawmakers sign letter asking Trump to ...,More than a hundred Republican members of Cong...,right


### Text Preprocessing

In [None]:
def to_lower(text):
  return text.lower()

In [None]:
def to_upper(text):
  return text.upper()

In [None]:
import re

def remove_numbers(text):
  return re.sub(r'\d+', '', text)

In [None]:
import string

def remove_puctuation(text):
  return text.translate(string.maketrans('',''), string.punctuation)

In [None]:
def remove_whitespace(text):
  return text.strip()

In [None]:
from nltk.tokenize import word_tokenize

def remove_stopword(text):
  stop_words = set(stopwords.words('english'))
  tokens = word_tokenize(text)
  return [i for i in tokens if not i in stop_words]

In [None]:
import re

def remove_urls(text):
  no_url_text = re.compile(r'https?://\S+|www\.\S+')
  return no_url_text.sub(r'', text)

In [None]:
import re

def remove_html(text):
    no_html_text = re.compile('<.*?>')
    return no_html_text.sub(r'', text)

In [None]:
import re

def tokenization(text):
  tokens = re.split('W+',text)
  return tokens

In [None]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

def stemming(text):
  stem_text = [porter_stemmer.stem(word) for word in text]
  return stem_text

### Readability

#### Utils

In [25]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.2-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.4 MB/s 
[?25hCollecting pyphen
  Downloading pyphen-0.11.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 46.2 MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.11.0 textstat-0.7.2


In [86]:
import spacy
from textstat.textstat import textstatistics,legacy_round

def break_in_sentences(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	return list(doc.sents)
 
def number_of_sentences(text):
	sentences = break_in_sentences(text)
	return len(sentences)

def number_of_words(text):
	text_sentences = break_in_sentences(text)
	number_of_words = 0
	for sentence in text_sentences:
		number_of_words += len([token for token in sentence])
	return number_of_words
 
def number_of_syllables(word):
	return textstatistics().syllable_count(word)

def average_sentence_length(text):
	n_words = number_of_words(text)
	n_sentences = number_of_sentences(text)
	average_sentence_length = n_words/n_sentences
	return float(average_sentence_length)

def average_number_of_syllables_per_word(text):
  n_syllables = number_of_syllables(text)
  n_words = number_of_words(text)
  ASPW = float(n_syllables) / float(n_words)
  return legacy_round(ASPW, 1)

def number_of_difficult_words(text):
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)

	words = []
	sentences = break_in_sentences(text)
	for sentence in sentences:
		words += [str(token) for token in sentence]

	diff_words_set = set()
	
	for word in words:
		syllable_count = number_of_syllables(word)
		if word not in nlp.Defaults.stop_words and syllable_count >= 2:
			diff_words_set.add(word)

	return len(diff_words_set)

def number_of_polysyllable_words(text):
	count = 0
	words = []
	sentences = break_in_sentences(text)
 
	for sentence in sentences:
		words += [token for token in sentence]

	for word in words:
		syllable_count = number_of_syllables(word)
		if syllable_count >= 3:
			count += 1
      
	return count

#### Flesch-Kincaid

In [42]:
def flesch_reading_index(text):
	FRE = 206.835 - float(1.015 * average_sentence_length(text)) - float(84.6 * average_number_of_syllables_per_word(text))
	return legacy_round(FRE, 2)

#### Dale-Chall

In [48]:
def dale_chall_index(text):  
  n_words = number_of_words(text)
  n_difficult_words = number_of_difficult_words(text)
  
  difficult_words_percentage = 0

  if n_words > 0:
		 difficult_words_percentage = float(n_difficult_words) / float(n_words) * 100
	
  DCI = (0.1579 * difficult_words_percentage) + (0.0496 * average_sentence_length(text)) + 3.6365
		
  return legacy_round(DCI, 2)

#### Gunning Fog

In [29]:
def gunning_fog(text):
    difficult_words_percentage = (number_of_difficult_words(text) / number_of_words(text) * 100) 
    GFI = 0.4 * (average_sentence_length(text) + difficult_words_percentage)
    return GFI

#### Tests

In [24]:
text_to_test = df_news_aggregator.loc[1000]['Content']

In [43]:
flesch_reading_index(text_to_test)

100.03

In [49]:
dale_chall_index(text_to_test)

6.84

In [50]:
gunning_fog(text_to_test)

14.121428571428574

#### Dataset features

In [83]:
df_features_news_aggregator_names = ["Flesch-Kincaid", "Dale-Chall", "Gunning Fog", "Bias"]
df_features_news_aggregator = pd.DataFrame(columns = df_features_news_aggregator_names)

In [84]:
for index, row in df_news_aggregator[:3].iterrows():
  text = row['Content']
  bias = row['Bias']
  flesch_kincaid = flesch_reading_index(text)
  dale_chall = dale_chall_index(text)
  # gunning_fog = gunning_fog(text)
  df_features_news_aggregator = df_features_news_aggregator.append({'Flesch-Kincaid': flesch_kincaid, 'Dale-Chall': dale_chall, 'Gunning Fog': 0, 'Bias': bias}, ignore_index=True)

In [85]:
df_features_news_aggregator

Unnamed: 0,Flesch-Kincaid,Dale-Chall,Gunning Fog,Bias
0,94.4,8.53,0,right-center
1,99.91,10.47,0,right-center
2,99.13,8.6,0,left-center


### n-grams

In [61]:
import re

def generate_ngrams(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

In [63]:
generate_ngrams(text_to_test, 1)

['nvidia',
 'is',
 'only',
 'just',
 'starting',
 'to',
 'put',
 'out',
 'cards',
 'that',
 'run',
 'on',
 'its',
 'new',
 'maxwell',
 'architiecture',
 'but',
 'its',
 'eyes',
 'are',
 'already',
 'on',
 'the',
 'future',
 'today',
 'at',
 'its',
 'annual',
 'gpu',
 'technology',
 'conference',
 'nvidia',
 'announced',
 'its',
 'next',
 'far',
 'future',
 'architecture',
 'pascal',
 'not',
 'due',
 'to',
 'hit',
 'until',
 '2016',
 'pascal',
 'is',
 'faaaaar',
 'from',
 'making',
 'its',
 'way',
 'into',
 'anything',
 'you',
 'hope',
 'to',
 'own',
 'but',
 'when',
 'it',
 'does',
 'it',
 'll',
 'be',
 'a',
 'screamer',
 'pascal',
 'will',
 'be',
 'faster',
 'smaller',
 'and',
 'more',
 'efficient',
 'than',
 'anything',
 'out',
 'there',
 'now',
 'but',
 'with',
 'two',
 'years',
 'before',
 'release',
 'duh',
 'it',
 'had',
 'better',
 'be',
 'advertisement',
 'as',
 'for',
 'how',
 'it',
 'will',
 'do',
 'that',
 'nvidia',
 'is',
 'pointing',
 'to',
 'a',
 'few',
 'key',
 'technolo

In [62]:
generate_ngrams(text_to_test, 2)

['nvidia is',
 'is only',
 'only just',
 'just starting',
 'starting to',
 'to put',
 'put out',
 'out cards',
 'cards that',
 'that run',
 'run on',
 'on its',
 'its new',
 'new maxwell',
 'maxwell architiecture',
 'architiecture but',
 'but its',
 'its eyes',
 'eyes are',
 'are already',
 'already on',
 'on the',
 'the future',
 'future today',
 'today at',
 'at its',
 'its annual',
 'annual gpu',
 'gpu technology',
 'technology conference',
 'conference nvidia',
 'nvidia announced',
 'announced its',
 'its next',
 'next far',
 'far future',
 'future architecture',
 'architecture pascal',
 'pascal not',
 'not due',
 'due to',
 'to hit',
 'hit until',
 'until 2016',
 '2016 pascal',
 'pascal is',
 'is faaaaar',
 'faaaaar from',
 'from making',
 'making its',
 'its way',
 'way into',
 'into anything',
 'anything you',
 'you hope',
 'hope to',
 'to own',
 'own but',
 'but when',
 'when it',
 'it does',
 'does it',
 'it ll',
 'll be',
 'be a',
 'a screamer',
 'screamer pascal',
 'pascal w