In [1]:
import pandas as pd
import numpy as np
import os, json, requests, nltk
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords

## Datasets

In [2]:
data = {'BuzzFeed': 'datasets/Data/BuzzFeed', 'PolitiFact': 'datasets/Data/PolitiFact'}

## Escolhendo o dataset

In [3]:
def get_newsids_dataframe(currentProvider):
    file_news_ids = os.path.join(os.getcwd(), data[currentProvider], 'News.txt')
    news_ids = pd.read_csv(file_news_ids, sep='\t', header=None, names=['NewsName'])
    news_ids.index += 1
    news_ids['NewsIndex'] = news_ids.index
    return news_ids

news_ids = get_newsids_dataframe('BuzzFeed')
print('Tamanho NewsIds: ', news_ids.shape)
news_ids.head()

Tamanho NewsIds:  (182, 2)


Unnamed: 0,NewsName,NewsIndex
1,BuzzFeed_Real_1,1
2,BuzzFeed_Real_2,2
3,BuzzFeed_Real_3,3
4,BuzzFeed_Real_4,4
5,BuzzFeed_Real_5,5


In [4]:
def get_userids_dataframe(currentProvider):
    file_user_ids = os.path.join(os.getcwd(), data[currentProvider], 'User.txt')
    user_ids = pd.read_csv(file_user_ids, sep='\t', header=None, names=['UserName'])
    user_ids.index += 1
    user_ids['UserIndex'] = user_ids.index
    return user_ids

user_ids = get_userids_dataframe('BuzzFeed')
print('Tamanho UserIds: ', user_ids.shape)
user_ids.head()

Tamanho UserIds:  (15257, 2)


Unnamed: 0,UserName,UserIndex
1,98d2b98ce305174e2f6c10b8f8a1a9d5,1
2,a273d0fd07c18a884ce2aa425813eb06,2
3,ac091e92df9e854a07563ffb397925d4,3
4,d2ded2de054f2ceb43dff7f80fc46774,4
5,3f2b23abf0e842f6bc97eed85596ff50,5


In [5]:
def get_newsuser_dataframe(currentProvider):
    news_user_file = os.path.join(os.getcwd(), data[currentProvider], currentProvider+'NewsUser.txt')
    news_user = pd.read_csv(news_user_file, sep='\t', header=None, names=['NewsIndex', 'UserIndex', 'Count'])
    return news_user
    
news_user = get_newsuser_dataframe('BuzzFeed')
print('Tamanho NewsUser: ', news_user.shape)
news_user.head()                                                                                   

Tamanho NewsUser:  (22779, 3)


Unnamed: 0,NewsIndex,UserIndex,Count
0,45,1,1
1,127,2,1
2,115,3,1
3,180,3,1
4,140,4,1


In [6]:
def get_useruser_dataframe(currentProvider):
    user_user_file = os.path.join(os.getcwd(), data[currentProvider], currentProvider+'UserUser.txt')
    user_user = pd.read_csv(user_user_file, sep='\t', header=None, names=['UserIndex', 'Follows'])
    return user_user

user_user = get_useruser_dataframe('BuzzFeed')
print('Tamanho UserUser: ', user_user.shape)
user_user.head()

Tamanho UserUser:  (634750, 2)


Unnamed: 0,UserIndex,Follows
0,48,1
1,899,1
2,6781,1
3,10097,1
4,100,2


## Resumo dos dados - tamanho das tabelas

In [7]:
summary = {"news": [get_newsids_dataframe('BuzzFeed').shape[0], get_newsids_dataframe('PolitiFact').shape[0]],
           "users": [get_userids_dataframe('BuzzFeed').shape[0], get_userids_dataframe('PolitiFact').shape[0]],
           "newsuser": [get_newsuser_dataframe('BuzzFeed').shape[0], get_newsuser_dataframe('PolitiFact').shape[0]],
           "useruser": [get_useruser_dataframe('BuzzFeed').shape[0], get_useruser_dataframe('PolitiFact').shape[0]] }


In [8]:
pd.DataFrame(summary, index=['BuzzFeed', 'PolitiFact'])

Unnamed: 0,news,newsuser,users,useruser
BuzzFeed,182,22779,15257,634750
PolitiFact,240,32791,23865,574744


In [9]:
def get_textfolder(newsname):
    return 'FakeNewsContent' if 'Fake' in newsname else 'RealNewsContent'

In [10]:
def get_newspublisher(newsdict):
    if 'meta_data' not in newsdict or 'og' not in newsdict['meta_data'] or 'site_name' not in newsdict['meta_data']['og']: 
        if 'source' in newsdict:
            return newsdict['source']
        return None
    return newsdict['meta_data']['og']['site_name']

In [11]:
def get_newsdict(provider, newsname):
    textfolder = get_textfolder(newsname)
    textdatafile = os.path.join(os.getcwd(), 'datasets/Data',provider, textfolder, newsname+'-Webpage.json')
    with open(textdatafile, 'r') as f:
        newsdict = json.load(f)
    return newsdict

## Exemplo de Noticia

In [12]:
news_ids = get_newsids_dataframe('BuzzFeed')
randomnews = news_ids.get_value(np.random.randint(0, news_ids.shape[0]), 'NewsName')

textnews = get_newsdict('BuzzFeed', randomnews)

pd.set_option('max_colwidth', 500)
example_news = {"title": [textnews['title']],
                "text": [textnews["text"]]}
pd.DataFrame(example_news)

Unnamed: 0,text,title
0,"Obama To UN: ‘Giving Up Liberty, Enhances Security In America…’ [VIDEO]\n\nFreedom is the bedrock that this nation was built upon. The problem is, liberals hate freedom. Oh, sure — they pay lip service to its importance, but ultimately, they want people to be subservient to a liberal-controlled government. They may not want a tyrannical king, but a tyrannical government of socialist-loving politicians pushing government-controlled everything is A-OK with them. They can’t ever say this out lo...","Obama To UN: ‘Giving Up Liberty, Enhances Security In America…’ [VIDEO]"


In [13]:
textnews

{'authors': ['Cassy Fiano'],
 'canonical_link': 'http://rightwingnews.com/barack-obama/obama-un-giving-liberty-enhances-security-america/',
 'images': ['http://rightwingnews.com/wp-content/uploads/2016/09/obama-un.jpg',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343883-featured-90x90.png',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343898-featured-90x90.jpg',
  'http://pixel.quantserve.com/pixel/p-RS38vHsezmL8w.gif',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343804-featured-214x123.jpg',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343804-featured-90x90.jpg',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343898-featured-214x123.jpg',
  'https://www.facebook.com/tr?ev=6026286623931&cd[value]=0.00&cd[currency]=USD&noscript=1',
  'http://rightwingnews.com/wp-content/uploads/wordpress-popular-posts/343798-featured-90x90.jpg',
  'http://rightwingnews.com/wp

textnews['meta_data']['og']['site_name']

In [14]:
sourcebias_cache = {}
def get_source_bias(source):
    if source in sourcebias_cache:
        return sourcebias_cache[source]
    
    search_url = 'https://mediabiasfactcheck.com/?s=' + source
    try:
        r = requests.get(search_url)
        if r.status_code != 200:
            print('GetSourceBias StatusCode:', r.status_code)
            return None
    except Exception as ex:
        print('Erro no request de source bias')
        print(ex)
        raise(ex)
    
    soup = BeautifulSoup(r.content, 'html.parser')
    articles = soup.find_all('article')
    if len(articles)==0:
        print('GetSource Bias: Nenhum article encontrado.')
        sourcebias_cache[source] = 'UNDEFINED BIAS'
        return sourcebias_cache[source]
    
    biases = {'LEFT BIAS': -1,
              'LEAST BIASED': 0,
              'RIGHT BIAS': 1
             }
    
    result = articles[0].find('div', class_='mh-excerpt').text
    #print(result)
    
    for key in biases:
        if key in result:
            sourcebias_cache[source] = biases[key]
            return sourcebias_cache[source]
    
    sourcebias_cache[source] = 'UNDEFINED BIAS'
    return sourcebias_cache[source]    

get_source_bias('http://cnn.it')

## Calculo de bias das sources (BuzzFeed)

provider = 'BuzzFeed'
buzznews = []
for i, row in get_newsids_dataframe(provider).iterrows():
    textfolder = get_textfolder(row['NewsName'])
    textdatafile = os.path.join(os.getcwd(), 'datasets/Data',provider, textfolder, row['NewsName']+'-Webpage.json')
    with open(textdatafile, 'r') as f:
        newsdict = json.load(f)
        
    simpler_news = {
                    "name": row['NewsName'],
                    "title": newsdict['title'],
                    "text": newsdict['text'],
                    "source": newsdict['source'] if 'source' in newsdict else None,
                    "bias": get_source_bias(newsdict['source']) if 'source' in newsdict else None
                   }    
    buzznews.append(simpler_news)
pd.DataFrame(buzznews)['bias'].value_counts()

## Vocabulario (Buzzfeed)

In [23]:
vocabulary = {}

In [24]:
def tokenize_text(text):
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for s in sentences:
        tokens.extend(nltk.word_tokenize(s))
    return tokens

def filter_stopwords(tokens):
    return [t for t in tokens if t not in stopwords.words('english')]

def make_lowercase(tokens):
    return [t.lower() for t in tokens]

provider = 'BuzzFeed'

news_ids = get_newsids_dataframe(provider)
vocab_counter = Counter()
for i, row in news_ids.iterrows():
    #pegando noticia do arquivo
    newsdict = get_newsdict(provider, row['NewsName'])
        
    #processamento basico
    newstext = ' '.join([newsdict['title'], newsdict['text']])
    news_tokens = make_lowercase(filter_stopwords(tokenize_text(newstext)))
    for token in news_tokens:
        vocab_counter[token] += 1
vocabulary[provider] = list(vocab_counter.keys())
print(provider, 'vocabulary size:', len(vocabulary[provider]) )

BuzzFeed vocabulary size: 11148


## Matriz X -> news x terms e y -> labels (Buzzfeed) 

In [25]:
X = np.zeros((news_ids.shape[0], len(vocabulary[provider])))
y = np.zeros(news_ids.shape[0])    

In [26]:
news_ids = get_newsids_dataframe(provider)
for i, (index, row) in enumerate(news_ids.iterrows()):
    #pegando noticia do arquivo
    textfolder = get_textfolder(row['NewsName'])
    textdatafile = os.path.join(os.getcwd(), 'datasets/Data',provider, textfolder, row['NewsName']+'-Webpage.json')
    with open(textdatafile, 'r') as f:
        newsdict = json.load(f)
        
    #processamento basico
    newstext = ' '.join([newsdict['title'], newsdict['text']])
    news_tokens = make_lowercase(filter_stopwords(tokenize_text(newstext)))
    
    xrow = np.zeros(X.shape[1])
    for token in news_tokens:
        xrow[vocabulary[provider].index(token)] += 1
    
    X[i,:] = xrow 
    y[i] = 1 if 'Fake' in row['NewsName'] else -1

## Matriz A -> user x user adjancy (Buzzfeed)

In [34]:
user_ids = get_userids_dataframe('BuzzFeed')
user_user = get_useruser_dataframe('BuzzFeed')

A = np.zeros((user_ids.shape[0], user_ids.shape[0]))

for i, (index, row) in enumerate(user_user.iterrows()):
    user_a_zero_indexed = row['UserIndex'] - 1
    user_b_zero_indexed = row['Follows'] -1
    A[user_a_zero_indexed, user_b_zero_indexed] = 1

## Matriz B -> publisher x news (Buzzfeed)

In [35]:
news_ids = get_newsids_dataframe(provider)
publisher_list = set()
for i, (index, row) in enumerate(news_ids.iterrows()):
    newsdict = get_newsdict(provider, row['NewsName'])
    newspublisher = get_newspublisher(newsdict)
    if newspublisher is not None:
        publisher_list.add(newspublisher)
    
publisher_list = list(publisher_list)

B = np.zeros((len(publisher_list), news_ids.shape[0]))
for i, (index, row) in enumerate(news_ids.iterrows()):
    newsdict = get_newsdict(provider, row['NewsName'])
    newspublisher = get_newspublisher(newsdict)
    if newspublisher is not None:
        B[ publisher_list.index(newspublisher), i ] = 1

## Matriz W -> user x news (Buzzfeed)

In [36]:
news_user = get_newsuser_dataframe('BuzzFeed')
W = np.zeros((user_ids.shape[0], news_ids.shape[0]))
for i, (index, row) in enumerate(news_user.iterrows()):
    user_zero_indexed = row['UserIndex'] - 1
    news_zero_indexed = row['NewsIndex'] - 1
    if row['Count'] > 0:
        W[user_zero_indexed, news_zero_indexed] = 1

## Matriz Y -> sign(A) (Buzzfeed)

In [37]:
Y = np.sign(A)

In [38]:
Y

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [40]:
np.array_equal(A, Y)

True