In [None]:
"""
This notebook gets labels from a csv file from ASU and scrapes the news articles linked by 
this data set to generate a dataset with text from news articles and labels from the csv file
"""

In [7]:
import pandas as pd
import pickle
from bs4 import BeautifulSoup
import requests
import time

In [8]:
# Converts the csv file to a dataframe
df_fake = pd.read_csv('politifact_fake.csv')

In [10]:
df_fake.head()

Unnamed: 0,id,news_url,title,tweet_ids
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...


In [6]:
df_real = pd.read_csv('politifact_real.csv')

In [17]:
# Taking the links from the data frame, we go to the websites and scrape the articles
# which are saved into a list
fake_polifact = []
for index, row in df_fake.iterrows():
    try:
        url = row['news_url']
        if url[0:7] not in ['https:/', 'http://']:
            url = 'https://' + url
        source_code = requests.get(url)
        soup = BeautifulSoup(source_code.text, 'html5lib')
        visible = soup.find_all(text=True)
        text = ''
        for tag in visible:
            if tag.parent.name in ['p', 'a']:
                text += ' ' + tag
        time.sleep(1)
        print(index)
        fake_polifact.append(text)
    except:
        print('Not a valid URL', row['id'])


Not a valid URL politifact15014
Not a valid URL politifact15156
Not a valid URL politifact14745
3
4
Not a valid URL politifact14404
6
7
8
9
10
11
12
Not a valid URL politifact14233
Not a valid URL politifact14890
15
16
17
18
19
Not a valid URL politifact15052
Not a valid URL politifact15309
22
Not a valid URL politifact13836
24
Not a valid URL politifact14755
Not a valid URL politifact13806
Not a valid URL politifact15164
Not a valid URL politifact14388
29
Not a valid URL politifact14544
Not a valid URL politifact13677
32
Not a valid URL politifact14310
Not a valid URL politifact15130
Not a valid URL politifact14694
36
37
38
Not a valid URL politifact14258
40
Not a valid URL politifact15251
42
43
44
45
46
47
48
Not a valid URL politifact13887
50
51
Not a valid URL politifact13589
53
54
Not a valid URL politifact13565
Not a valid URL politifact15354
Not a valid URL politifact14426
Not a valid URL politifact14565
59
60
61
Not a valid URL politifact14879
63
Not a valid URL politifact13982

In [20]:
# Saves the list into a dataframe
df_poli = pd.DataFrame(fake_polifact, columns = ['text'])

In [73]:
# Remove articles that are too short.  Possibly not a real article
# or did not get access
df_poli2 = df_poli[df_poli['text'].str.len() >200]

In [47]:
df_poli2.head()

Unnamed: 0,text
1,About Contact Privacy Policy Disclaimer \n\t ...
2,Adrenalin Boost Alternative Health Android An...
3,\n \n \n RELIGION MIND \n home Buddhism Chri...
4,\n \n Home Politics President Trump The Supre...
5,Before It's News Featured Economy Health Be...


In [74]:
# Generates a column 'true' where all the values are 0 because all the articles
# in this dataframe have been labelled fake
df_poli2['true'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_poli2['true'] = 0


In [62]:
# Cleans up text in dataframe, like removing extraneous characters, punctuation, etc.
import re
import string

cleanup = lambda x: re.sub("U.S", 'us', x)
cleanup2 = lambda x: re.sub("\n", '', x)
cleanup3 = lambda x: re.sub("\t", '', x)
apostrophe = lambda x: re.sub("\'", '', x)
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

#df['text'] = df['Text'].map(us).map(apostrophe).map(alphanumeric).map(punc_lower)
df_poli2['text'] = df_poli2['text'].map(cleanup).map(cleanup2).map(cleanup3).map(apostrophe).map(alphanumeric)

In [76]:
# Saves the cleaned up dataframe into a pickle file
with open('politifact.pickle', 'wb') as to_write:
    pickle.dump(df_poli2, to_write)