# Data Scraping Thesis

In [1]:
import concurrent
import concurrent.futures
import glob
import logging
import os
import pickle
from random import randint
from time import sleep

import pandas as pd
import requests
from lxml import html
from newsplease import NewsPlease



## Functions and Dictionaries

The two lists of dictionaries below hold the information needed to loop over the news pages. In the first list (`pages`), each entry specifies the name of the news outlet, the base URL of its paginated article index, the number of index pages, the XPath to the article titles and the XPath to the article links. On some sites the links contain only part of the full URL; for those, the missing prefix is stored under the `prefix` key (an empty string otherwise). On some sites two XPaths are needed to retrieve all relevant titles and links; in those cases the title and link paths are each recorded as a list.

The second list (`outlets_instructions`) records, for each outlet, whether the article text and date can be extracted with newsplease. If they cannot, the entry provides the XPath of the text (`xpath`) and/or of the date (`date`); a value of `None` means newsplease is used.

In [2]:
# XPath expressions that several sections of the same site have in common.
_CANAL10_ITEMS = "//div[@class='item card-type-a child']//h2/a"
_CANAL13_TITLES = "//a[@class='card-title']//h3"
_CANAL13_LINKS = "//a[@class='card-title']"
_NUEVAYA_ITEMS = ("//div[@class='vc_column tdi_52 wpb_column vc_column_container tdc-column td-pb-span9']"
                  "//h3[@class='entry-title td-module-title']//a")
_CORPORACION_ITEMS = "//h3[@class='mh-loop-title']/a"
_CONFIDENCIAL_ITEMS = "//h2[@class='archive-titles']/a"
_NOTICIAS100_PREFIX = "https://100noticias.com.ni"
_NOTICIAS100_TITLES = ("//div[@class='col-md-6 m-bottom-10']//a//h5",
                       "//div[@class='col-6 col-md-4']/a//h5")
_NOTICIAS100_LINKS = ("//div[@class='col-md-6 m-bottom-10']//a",
                      "//div[@class='col-6 col-md-4']/a")

# One row per outlet section, in the order:
# (name, url, pages, titlepath, prefix, linkpath)
_OUTLET_ROWS = [
    ('Canal2', 'https://canal2tv.com/category/nacionales/page/', 132,
     "//div[@class='post-container']//a[@class='post-title']/h2", "",
     "//div[@class='post-container']//a[@class='post-title']"),
    ('Canal4', 'https://www.canal4.com.ni/nicaragua/page/', 1565,
     "//div[@class='tg-col-control']//h3/a", "",
     "//div[@class='tg-col-control']//h3/a"),
    ('Canal6', 'https://canal6.com.ni/category/nacionales/page/', 307,
     "//figure[@class='figure']//a", "",
     "//figure[@class='figure']//a"),
    ('Canal10', 'https://www.canal10.com.ni/category/nacionales/page/', 1234,
     _CANAL10_ITEMS, "", _CANAL10_ITEMS),
    ('Canal10_Accion_10', 'https://www.canal10.com.ni/category/accion-10/page/', 4014,
     _CANAL10_ITEMS, "", _CANAL10_ITEMS),
    ('Canal13_politica', 'https://www.vivanicaragua.com.ni/category/politica/page/', 445,
     _CANAL13_TITLES, "", _CANAL13_LINKS),
    ('Canal13_economia', 'https://www.vivanicaragua.com.ni/category/economia/page/', 363,
     _CANAL13_TITLES, "", _CANAL13_LINKS),
    ('Canal13_sociales', 'https://www.vivanicaragua.com.ni/category/sociales/page/', 2997,
     _CANAL13_TITLES, "", _CANAL13_LINKS),
    ('Canal14', 'https://www.vostv.com.ni/nacionales/?page=', 669,
     "//section[@class='secondary-news']//h3", 'https://www.vostv.com.ni',
     "//section[@class='secondary-news']//div[@class='figure-cap']/a[1]"),
    ('Radio la Primerisima', 'https://radiolaprimerisima.com/noticias-generales/page/', 797,
     "//div[@class='post_title']//a/span[1]", "",
     "//div[@class='post_title']//a"),
    ('La Nueva Radio Ya', 'https://nuevaya.com.ni/nacionales/page/', 1430,
     _NUEVAYA_ITEMS, "", _NUEVAYA_ITEMS),
    ('Radio 800', 'https://radio800ni.com/category/nacionales/page/', 81,
     "//h2[@class='post-title']/a", "",
     "//h2[@class='post-title']/a"),
    ('Radio Nicaragua', 'https://radionicaragua.com.ni/category/nacionales/page/', 2161,
     "//figcaption/a/h2", "",
     "//figcaption/a"),
    ('Radio Corporacion_nacional', 'https://radio-corporacion.com/blog/archivos/category/nacional/page/', 584,
     _CORPORACION_ITEMS, "", _CORPORACION_ITEMS),
    ('Radio Corporacion_politica', 'https://radio-corporacion.com/blog/archivos/category/politica/page/', 264,
     _CORPORACION_ITEMS, "", _CORPORACION_ITEMS),
    ('Radio Corporacion_eco', 'https://radio-corporacion.com/blog/archivos/category/eco/page/', 116,
     _CORPORACION_ITEMS, "", _CORPORACION_ITEMS),
    ('Confidencial_politica', 'https://www.confidencial.com.ni/politica/page/', 355,
     _CONFIDENCIAL_ITEMS, "", _CONFIDENCIAL_ITEMS),
    ('Confidencial_economia', 'https://www.confidencial.com.ni/economia/page/', 168,
     _CONFIDENCIAL_ITEMS, "", _CONFIDENCIAL_ITEMS),
    ('Confidencial_nacion', 'https://www.confidencial.com.ni/nacion/page/', 637,
     _CONFIDENCIAL_ITEMS, "", _CONFIDENCIAL_ITEMS),
    ('100% Noticias_nacionales', 'https://100noticias.com.ni/nacionales/?page=', 747,
     list(_NOTICIAS100_TITLES), _NOTICIAS100_PREFIX, list(_NOTICIAS100_LINKS)),
    ('100% Noticias_economia', 'https://100noticias.com.ni/economia/?page=', 73,
     list(_NOTICIAS100_TITLES), _NOTICIAS100_PREFIX, list(_NOTICIAS100_LINKS)),
    ('100% Noticias_politica', 'https://100noticias.com.ni/politica/?page=', 114,
     list(_NOTICIAS100_TITLES), _NOTICIAS100_PREFIX, list(_NOTICIAS100_LINKS)),
]

# Index pages to crawl: one dictionary per outlet section. 'url' is completed
# with the page number, 'pages' is the number of index pages, 'titlepath' and
# 'linkpath' are XPaths (a list when two XPaths are needed) and 'prefix' is
# prepended to every extracted link.
pages = [
    dict(zip(('name', 'url', 'pages', 'titlepath', 'prefix', 'linkpath'), row))
    for row in _OUTLET_ROWS
]

# Per-outlet extraction instructions for the individual articles.
#   'name'  : file (inside data/) holding the outlet's combined links and titles
#   'xpath' : XPath of the article text, or None to use newsplease
#   'date'  : XPath of the publication date, or None to use newsplease
# Every entry carries all three keys so that consumers can index them directly.
outlets_instructions = [
    {'name': 'Canal2-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Canal6-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Canal10-links-titles.pkl',
     'xpath': None,
     'date': "//div[@class='date']"},
    # The 'date' key was missing here, which made outlet['date'] raise KeyError.
    {'name': 'Canal13-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Confidencial-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Radio Corporacion-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Radio la Primerisima-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': 'Radio Nicaragua-links-titles.pkl',
     'xpath': None,
     'date': None},
    {'name': '100% Noticias-links-titles.pkl',
     'xpath': "//div[@class='story-body']",
     'date': "//div[@class='story-meta top-meta text-center']/span[2]"},
    {'name': 'Canal14-links-titles.pkl',
     'xpath': "//div[@class='story-body']",
     'date': "//ul[@class='story-meta m-bottom-20']/li[2]"},
    {'name': 'Canal4-links-titles.pkl',
     'xpath': "//span[@style='color: #000000;']",
     'date': None},
]

### Functions

In [3]:
def download_main_pages(outlet):
    '''Download every index page of one outlet and store the parsed pages.

    outlet: one entry of `pages`; the keys 'name', 'url' and 'pages' are used.
        The page number (1 .. outlet['pages']) is appended to outlet['url'].

    The parsed lxml trees are collected in a list and pickled to
    html/{name}-html.pkl, which is the location scrape_articles reads from.
    Pages that cannot be downloaded or parsed are logged and skipped, so the
    stored list can be shorter than outlet['pages'].
    Uses the module-level `logger`, which must be configured before the call.
    '''
    htmllist = []
    headers = requests.utils.default_headers()
    headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    baseurl = outlet['url']
    name = outlet['name']

    logger.info(f'Working on {name}.')

    # start looping through pages
    for i in range(1, outlet['pages'] + 1):
        # report on status at every ten pages
        if i % 10 == 0:
            logger.info(f"Status {name}: page {i}")
        try:
            url = baseurl + str(i)
            # Without a timeout a stalled connection would block this worker forever.
            response = requests.get(url, headers=headers, timeout=30)
            # Treat HTTP errors as failures instead of parsing the error page.
            response.raise_for_status()
            htmllist.append(html.fromstring(response.text))
        except Exception as e:
            logger.error(f"Error with {name} at page {i}:")
            logger.error(e)
        # random pause between requests to avoid hammering the server
        sleep(randint(3, 6))

    # NOTE(review): this pickles lxml trees; confirm they survive a pickle
    # round-trip with the installed lxml, otherwise store the page source instead.
    os.makedirs('html', exist_ok=True)
    with open(f'html/{name}-html.pkl', 'wb') as f:
        pickle.dump(htmllist, f)
        
def scrape_articles(outlet):
    '''Extract all article links and titles from the stored index pages of one outlet.

    outlet: one entry of `pages`; the keys 'name', 'prefix', 'linkpath' and
        'titlepath' are used. 'linkpath' and 'titlepath' may each be a single
        XPath or a list of XPaths whose results are concatenated in order.

    Reads html/{name}-html.pkl and writes [links, titles] (a list of two lists)
    to data/{name}-links-titles.pkl. Pages that raise during extraction are
    reported and skipped.
    '''
    name = outlet['name']
    prefix = outlet['prefix']

    def as_list(paths):
        # A single XPath and a list of XPaths are handled by the same loop.
        return paths if isinstance(paths, list) else [paths]

    linkpaths = as_list(outlet['linkpath'])
    titlepaths = as_list(outlet['titlepath'])

    with open(f'html/{name}-html.pkl', 'rb') as f:
        htmllist = pickle.load(f)

    linklist = []
    titlelist = []
    # start looping through pages; `position` is the index in the stored list,
    # which can differ from the site's page number if downloads were skipped
    for position, tree in enumerate(htmllist):
        try:
            links = [prefix + node.attrib['href']
                     for path in linkpaths for node in tree.xpath(path)]
            titles = [node.text
                      for path in titlepaths for node in tree.xpath(path)]
        except Exception as e:
            print(f"Error with {name} at stored page {position}:")
            print(e)
            continue
        if len(links) != len(titles):
            # links and titles are paired by position later on
            print(f"Warning for {name} at stored page {position}: "
                  f"{len(links)} links but {len(titles)} titles")
        linklist.extend(links)
        titlelist.extend(titles)

    combined = [linklist, titlelist]
    os.makedirs('data', exist_ok=True)
    with open(f'data/{name}-links-titles.pkl', 'wb') as f:
        pickle.dump(combined, f)

def download_articles(outlet):
    '''Download every article of one outlet and store the results.

    outlet: one entry of `outlets_instructions`; the keys 'name' and 'xpath'
        are used. outlet['name'] is the file inside data/ that holds
        [links, titles] for the outlet.

    When outlet['xpath'] is None the article is fetched with newsplease and
    the resulting article object is stored; otherwise the page is fetched with
    requests and the parsed lxml tree is stored. A failed download is logged
    and stored as None so that the saved list stays index-aligned with the
    list of links. The list is pickled to html/{name}-articles-html.pkl.
    Uses the module-level `logger`, which must be configured before the call.
    '''
    headers = requests.utils.default_headers()
    headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    htmllist = []
    name = outlet["name"].split("-")[0]
    use_newsplease = outlet['xpath'] is None

    # The link/title files are written into data/, so read them from there.
    with open(f'data/{outlet["name"]}', "rb") as f:
        articles = pickle.load(f)
    logger.info(f'Working on {outlet["name"]}.')

    for url in articles[0]:
        try:
            if use_newsplease:
                article = NewsPlease.from_url(url)
            else:
                # Without a timeout a stalled connection would block this worker forever.
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                article = html.fromstring(response.text)
        except Exception as e:
            logger.error(f"{outlet['name']}-{url}: {e}")
            article = None
        htmllist.append(article)
        # random pause between requests to avoid hammering the server
        sleep(randint(1, 4))

    os.makedirs('html', exist_ok=True)
    with open(f'html/{name}-articles-html.pkl', 'wb') as f:
        pickle.dump(htmllist, f)

def _extract_field(page, xpath, attribute):
    '''Return one field (text or date) of a stored article.

    If an XPath is given and `page` supports XPath queries, the text content
    of all matching nodes is joined with single spaces. Otherwise the field is
    read from the attribute `attribute` of `page` (the newsplease case).
    '''
    if xpath is not None and hasattr(page, 'xpath'):
        return " ".join(node.text_content() for node in page.xpath(xpath))
    return getattr(page, attribute)


def scrape_text_date(outlet):
    '''Extract text and publication date from the stored articles of one outlet.

    outlet: one entry of `outlets_instructions`; the keys 'name', 'xpath' and
        'date' are used ('date' may be absent, which is treated as None).

    Reads html/{name}-articles-html.pkl and the link/title file
    data/{outlet["name"]}, then writes [links, titles, texts, dates] to
    data/{outlet["name"]}_full.pkl. The text comes from outlet['xpath'] and the
    date from outlet['date']; when the respective entry is None the newsplease
    attributes `maintext` and `date_publish` are used instead. Entries that
    are None or that raise during extraction yield None for that field.
    '''
    name = outlet["name"].split("-")[0]
    with open(f"html/{name}-articles-html.pkl", "rb") as f:
        htmllist = pickle.load(f)

    # The link/title files are written into data/, so read them from there.
    with open(f'data/{outlet["name"]}', "rb") as f:
        articles = pickle.load(f)
    link_list = articles[0]
    title_list = articles[1]

    # (xpath, newsplease attribute) for each of the two extracted fields
    text_source = (outlet['xpath'], 'maintext')
    date_source = (outlet.get('date'), 'date_publish')

    text_list = []
    date_list = []
    for position, page in enumerate(htmllist):
        for (xpath, attribute), target in ((text_source, text_list),
                                           (date_source, date_list)):
            value = None
            if page is not None:
                try:
                    value = _extract_field(page, xpath, attribute)
                except Exception as e:
                    print(f"{outlet['name']}-entry {position}: {e}")
            target.append(value)

    combined = [link_list, title_list, text_list, date_list]
    with open(f'data/{outlet["name"]}_full.pkl', 'wb') as f:
        pickle.dump(combined, f)

## Download the HTML of the main pages and scrape them

In [4]:
# Spot-check one entry of the outlet configuration.
pages[4]

{'name': 'Canal10_Accion_10',
 'url': 'https://www.canal10.com.ni/category/accion-10/page/',
 'pages': 4014,
 'titlepath': "//div[@class='item card-type-a child']//h2/a",
 'prefix': '',
 'linkpath': "//div[@class='item card-type-a child']//h2/a"}

In [None]:
# Create and configure the logger used by the download functions
Log_Format = "%(levelname)s %(asctime)s - %(message)s"

logging.basicConfig(filename = "logfile.log",
                    filemode = "w",
                    format = Log_Format, 
                    level = logging.INFO,
                    force = True)
logger = logging.getLogger()

# download htmls of all outlets at the same time
with concurrent.futures.ThreadPoolExecutor(max_workers=len(pages)) as executor:
    futures = {executor.submit(download_main_pages, outlet): outlet['name']
               for outlet in pages}
    # Inspect every future: an exception raised inside a worker is otherwise
    # never surfaced and the outlet would be missing without any trace.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            logger.error(f"{futures[future]} failed: {error!r}")

In [None]:
# scrape four outlets at the same time
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(scrape_articles, outlet): outlet['name']
               for outlet in pages}
    # Inspect every future: an exception raised inside a worker is otherwise
    # never surfaced and the outlet would be missing without any trace.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"{futures[future]} failed: {error!r}")

### Combine individual outlet categories

In [None]:
def combine_outlet_files(data_dir="data"):
    """Merge the per-category link/title files of each outlet into one file.

    Every file in `data_dir` named '<outlet>[_<category>]-links-titles.pkl'
    is expected to hold [links, titles]. Files are grouped by the part of the
    name before the first underscore (e.g. 'Canal13_politica' and
    'Canal13_economia' both belong to 'Canal13') and each group is written to
    '<outlet>-links-titles.pkl' as one flat [links, titles] pair.

    A link that was already collected for the outlet is skipped, so running
    the function again (when the combined file is itself among the inputs)
    does not duplicate entries. A link without a title at the same position
    gets the title None.

    Returns the sorted list of outlet names that were written.
    """
    suffix = "-links-titles.pkl"

    # Only link/title files are inputs; other files in the folder (such as
    # the '..._full.pkl' results) are ignored. Sorting makes the order stable.
    sources = sorted(path for path in glob.glob(os.path.join(data_dir, "*"))
                     if path.endswith(suffix))

    # group the files by outlet
    files_by_outlet = {}
    for path in sources:
        section = os.path.basename(path)[:-len(suffix)]
        outlet_name = section.split("_")[0]
        files_by_outlet.setdefault(outlet_name, []).append(path)

    # for each outlet, combine all links and article names in a single file
    for outlet_name, paths in files_by_outlet.items():
        links = []
        art_name = []
        seen = set()
        for path in paths:
            # NOTE: only unpickle files produced by this notebook.
            with open(path, "rb") as f:
                articles = pickle.load(f)
            file_links, file_titles = articles[0], articles[1]
            for position, link in enumerate(file_links):
                if link in seen:
                    continue
                seen.add(link)
                links.append(link)
                art_name.append(file_titles[position]
                                if position < len(file_titles) else None)

        # save that file
        with open(os.path.join(data_dir, f"{outlet_name}{suffix}"), "wb") as f:
            pickle.dump([links, art_name], f)

    return sorted(files_by_outlet)


combined_outlets = combine_outlet_files()

## Download and Scrape individual Articles

In [None]:
# Create and configure the logger used by the download functions
Log_Format = "%(levelname)s %(asctime)s - %(message)s"

logging.basicConfig(filename = "logfile.log",
                    filemode = "w",
                    format = Log_Format, 
                    level = logging.INFO,
                    force = True)
logger = logging.getLogger()

# download the articles of all outlets at the same time
# download_articles needs the 'xpath' key and the links file name, which only
# the entries of outlets_instructions provide (not those of pages).
with concurrent.futures.ThreadPoolExecutor(max_workers=len(outlets_instructions)) as executor:
    futures = {executor.submit(download_articles, outlet): outlet['name']
               for outlet in outlets_instructions}
    # Inspect every future: an exception raised inside a worker is otherwise
    # never surfaced and the outlet would be missing without any trace.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            logger.error(f"{futures[future]} failed: {error!r}")

In [None]:
# scrape four outlets at the same time
# scrape_text_date needs the 'xpath'/'date' keys and the links file name, which
# only the entries of outlets_instructions provide (not those of pages).
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(scrape_text_date, outlet): outlet['name']
               for outlet in outlets_instructions}
    # Inspect every future: an exception raised inside a worker is otherwise
    # never surfaced and the outlet would be missing without any trace.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f"{futures[future]} failed: {error!r}")

## Save to csv

In [None]:
def build_dataset(data_dir="data"):
    """Collect all '*_full.pkl' outlet files of `data_dir` in one DataFrame.

    Each file is expected to hold [links, titles, texts, dates]. The result
    has the columns 'page', 'date', 'title', 'text' and 'url' and a fresh
    0..n-1 index; 'page' is the outlet name, i.e. the part of the file name
    before the first '-'. Without any matching file an empty frame with these
    columns is returned.

    Raises ValueError (from pandas) if the four lists of a file differ in length.
    """
    columns = ['page', 'date', 'title', 'text', 'url']
    frames = []

    # for each of the full outlet files
    for file in sorted(glob.glob(os.path.join(data_dir, "*"))):
        if not file.endswith("_full.pkl"):
            continue
        # NOTE: only unpickle files produced by this notebook.
        with open(file, "rb") as f:
            articles = pickle.load(f)
        name = os.path.basename(file).split("-")[0]
        frames.append(pd.DataFrame({"page": name, "title": articles[1],
                                    "text": articles[2], "date": articles[3],
                                    "url": articles[0]}))

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing the frame file by file.
    return pd.concat(frames, ignore_index=True)[columns]


# putting individual lists of lists into single dataframe and saving it to csv
df = build_dataset()
df.to_csv("dataset.csv", index = False)