In [1]:
# Scrapiing fake and real news data from a website (politifact.com)

In [2]:
# libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib.request
import time

In [34]:
# lists to store the scraped data
authors = []
dates = []
statements = []
sources = []
targets = []

In [35]:
# function to scrape the website
def scrape_website(page_number):
    page_num = str(page_number)
    URL = 'https://www.politifact.com/factchecks/list/?page=' + page_num
    webpage = requests.get(URL)
    soup = BeautifulSoup(webpage.text, 'html.parser')
    
    # location of the info
    statement_footer = soup.find_all('footer', 
                            attrs = {'class':'m-statement__footer'}) # location of the author
    statement_quote = soup.find_all('div',
                            attrs = {'class':'m-statement__quote'}) # location of the statement
    statement_meta = soup.find_all('div', 
                            attrs = {'class':'m-statement__meta'}) # location of the source
    target = soup.find_all('div',
                            attrs = {'class':'m-statement__meter'}) # location of the target (score)
    
    for i in statement_footer:
        link1 = i.text.strip()
        name_and_date = link1.split()
        first_name = name_and_date[1]
        last_name = name_and_date[2]
        full_name = first_name + ' ' + last_name
        month = name_and_date[4]
        day = name_and_date[5]
        year = name_and_date[6]
        date = month + ' ' + day + ' ' + year
        dates.append(date)
        authors.append(full_name)
        
    for i in statement_quote:
        link2 = i.find_all('a')
        statement_text = link2[0].text.strip()
        statements.append(statement_text)
        
    for i in statement_meta:
        link3 = i.find_all('a')
        source_text = link3[0].text.strip()
        sources.append(source_text)
        
    for i in target:
        link4 = i.find('div', attrs = {'class':'c-image'}).find('img').get('alt')
        targets.append(link4)

In [36]:
# n-1 webpage(s) loop
n = 2 # for 1 page
for i in range(1,n):
    scrape_website(i)

In [41]:
# check array length
len(dates) == len(authors) == len(statements) == len(targets) == len(sources)


True

In [38]:
# dataframe
data = pd.DataFrame(columns = ['author','statement','source','date','target'])
data['author'] = authors
data['statement'] = statements
data['source'] = sources
data['date'] = dates
data['target'] = targets

# data
data

Unnamed: 0,author,statement,source,date,target
0,Tom Kertscher,"“If I don’t take the vaccine, I’m at risk for ...",Facebook posts,"March 31, 2021",false
1,Jon Greenberg,Calls COVID-19 vaccine a “non-FDA-approved exp...,Instagram posts,"March 31, 2021",barely-true
2,Gabrielle Settles,A map shows “all the cargo ships backed up by ...,Facebook posts,"March 31, 2021",false
3,Noah Y.,Says H.R. 1 gives immigrants illegally in the ...,Instagram posts,"• March 31,",false
4,Andy Nguyen,"“From Europe: 3,964 people have died from adve...",Bloggers,"March 31, 2021",false
5,Louis Jacobson,"If the voting bill H.R. 1 is passed, “absentee...",Facebook posts,"March 31, 2021",false
6,Amy Sherman,U.S. Rep. Matt Gaetz was the lone vote in the ...,Facebook posts,"March 31, 2021",true
7,Madison Czopek,Video says COVID-19 vaccines are “weapons of m...,Facebook posts,"March 31, 2021",pants-fire
8,Michael Majchrowicz,Image of expansive pit shows a lithium mine.,Viral image,"March 31, 2021",false
9,Samantha Putterman,Says Ben Shapiro said on Twitter that his “red...,Facebook posts,"March 31, 2021",pants-fire


In [42]:
# function to get a binary number from the target
# info must be fully true
def getBinaryNumTarget(text):
    if text == 'true':
        return 1
    else:
        return 0

In [43]:
# function to get a binary label true(REAL) or false(FAKE) from the target
# info must be fully true
def getBinaryTarget(text):
    if text == 'true':
        return 'REAL'
    else:
        return 'FAKE'

In [44]:
# two new columns on the df
data['BinaryTarget'] = data['target'].apply(getBinaryTarget)
data['BinaryNumTarget'] = data['target'].apply(getBinaryNumTarget)

In [45]:
data

Unnamed: 0,author,statement,source,date,target,BinaryTarget,BinaryNumTarget
0,Tom Kertscher,"“If I don’t take the vaccine, I’m at risk for ...",Facebook posts,"March 31, 2021",false,FAKE,0
1,Jon Greenberg,Calls COVID-19 vaccine a “non-FDA-approved exp...,Instagram posts,"March 31, 2021",barely-true,FAKE,0
2,Gabrielle Settles,A map shows “all the cargo ships backed up by ...,Facebook posts,"March 31, 2021",false,FAKE,0
3,Noah Y.,Says H.R. 1 gives immigrants illegally in the ...,Instagram posts,"• March 31,",false,FAKE,0
4,Andy Nguyen,"“From Europe: 3,964 people have died from adve...",Bloggers,"March 31, 2021",false,FAKE,0
5,Louis Jacobson,"If the voting bill H.R. 1 is passed, “absentee...",Facebook posts,"March 31, 2021",false,FAKE,0
6,Amy Sherman,U.S. Rep. Matt Gaetz was the lone vote in the ...,Facebook posts,"March 31, 2021",true,REAL,1
7,Madison Czopek,Video says COVID-19 vaccines are “weapons of m...,Facebook posts,"March 31, 2021",pants-fire,FAKE,0
8,Michael Majchrowicz,Image of expansive pit shows a lithium mine.,Viral image,"March 31, 2021",false,FAKE,0
9,Samantha Putterman,Says Ben Shapiro said on Twitter that his “red...,Facebook posts,"March 31, 2021",pants-fire,FAKE,0


In [46]:
# storing the data to a csv file
data.to_csv('fact_checker.csv')