# Data Scraping

### Package Imports

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from random import randint
from time import sleep

In [108]:
pages = {
    'TV1': {
        'name':'Canal2',
        'url':'https://canal2tv.com/category/nacionales/page/',
        'pages':132,
        'titlepath': [l.text for l in tree.xpath("//div[@class='post-container']//a[@class='post-title']/h2")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//div[@class='post-container']//a[@class='post-title']")]
    },
    'TV2': {
        'name':'Canal4',
        'url':'https://www.canal4.com.ni/nicaragua/page/',
        'pages':1565,
        'titlepath': [l.text for l in tree.xpath("//div[@class='tg-col-control']//h3/a")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//div[@class='tg-col-control']//h3/a")]
    },
    'TV3': {
        'name':'Canal6',
        'url':'https://canal6.com.ni/category/nacionales/page/',
        'pages':307,
        'titlepath': [l.text for l in tree.xpath("//figure[@class='figure']//a")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//figure[@class='figure']//a")]
    },
    'TV4': {
        'name':'Canal10',
        'url':'https://www.canal10.com.ni/category/nacionales/page/',
        'pages':1234,
        'titlepath': [l.text for l in tree.xpath("//div[@class='item card-type-a child']//h2/a")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//div[@class='item card-type-a child']//h2/a")]
    },
    'TV5.1': {
        'name':'Canal13',
        'url':'https://www.vivanicaragua.com.ni/category/politica/page/',
        'pages':445,
        'titlepath': [l.text for l in tree.xpath("//a[@class='card-title']//h3")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//a[@class='card-title']")]
    },
    'TV5.2': {
        'name':'Canal13',
        'url':'https://www.vivanicaragua.com.ni/category/economia/page/',
        'pages':363,
        'titlepath': [l.text for l in tree.xpath("//a[@class='card-title']//h3")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//a[@class='card-title']")]
    },
    'TV5.3': {
        'name':'Canal13',
        'url':'https://www.vivanicaragua.com.ni/category/sociales/page/',
        'pages':2997,
        'titlepath': [l.text for l in tree.xpath("//a[@class='card-title']//h3")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//a[@class='card-title']")]
    },
    'TV6': {
        'name':'Canal14',
        'url':'https://www.vostv.com.ni/nacionales/?page=',
        'pages':669,
        'titlepath': [l.text for l in tree.xpath("//section[@class='secondary-news']//h3")],
        'linkpath': ['https://www.vostv.com.ni' + l.attrib['href']
                     for l in tree.xpath("//section[@class='secondary-news']//div[@class='figure-cap']/a[1]")]
    },
    'Radio1': {
        'name':'Radio la Primerisima',
        'url':'https://radiolaprimerisima.com/noticias-generales/page/',
        'pages':797,
        'titlepath': [l.text for l in tree.xpath("//div[@class='post-title']//a/span[1]")],
        'linkpath': [l.attrib['href'] for l in tree.xpath("//div[@class='post-title']//a")]
    },
    'Radio2': {
        'name':'La Nueva Radio Ya',
        'url':'https://nuevaya.com.ni/nacionales/page/',
        'pages':1430,
        'titlepath': [l.text for l in tree.xpath("//div[@class='vc_column tdi_52 wpb_column vc_column_container tdc-column td-pb-span9']//h3[@class='entry-title td-module-title']//a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//div[@class='vc_column tdi_52 wpb_column vc_column_container tdc-column td-pb-span9']//h3[@class='entry-title td-module-title']//a")]
    },
    'Radio3': {
        'name':'Radio 800',
        'url':'https://radio800ni.com/category/nacionales/page/',
        'pages':81,
        'titlepath': [l.text for l in tree.xpath("//h2[@class='post-title']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h2[@class='post-title']/a")]
    },
    'Radio4': {
        'name':'Radio Nicaragua',
        'url':'https://radionicaragua.com.ni/category/nacionales/page/',
        'pages':2161,
        'titlepath': [l.text for l in tree.xpath("//figcaption/a/h2")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//figcaption/a")]
    },
    'Radio5.1': {
        'name':'Radio Corporacion',
        'url':'https://radio-corporacion.com/blog/archivos/category/nacional/page/',
        'pages':584,
        'titlepath': [l.text for l in tree.xpath("//h3[@class='mh-loop-title']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h3[@class='mh-loop-title']/a")]
    },
    'Radio5.2': {
        'name':'Radio Corporacion',
        'url':'https://radio-corporacion.com/blog/archivos/category/politica/page/',
        'pages':264,
        'titlepath': [l.text for l in tree.xpath("//h3[@class='mh-loop-title']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h3[@class='mh-loop-title']/a")]
    },
    'Radio5.3': {
        'name':'Radio Corporacion',
        'url':'https://radio-corporacion.com/blog/archivos/category/eco/page/',
        'pages':116,
        'titlepath': [l.text for l in tree.xpath("//h3[@class='mh-loop-title']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h3[@class='mh-loop-title']/a")]
    },
    'Newspaper1.1': {
        'name':'Confidencial',
        'url':'https://www.confidencial.com.ni/politica/page/',
        'pages':355,
        'titlepath': [l.text for l in tree.xpath("//h2[@class='archive-titles']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h2[@class='archive-titles']/a")]
    },
    'Newspaper1.2': {
        'name':'Confidencial',
        'url':'https://www.confidencial.com.ni/economia/page/',
        'pages':168,
        'titlepath': [l.text for l in tree.xpath("//h2[@class='archive-titles']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h2[@class='archive-titles']/a")]
    },
    'Newspaper1.3': {
        'name':'Confidencial',
        'url':'https://www.confidencial.com.ni/nacion/page/',
        'pages':637,
        'titlepath': [l.text for l in tree.xpath("//h2[@class='archive-titles']/a")],
        'linkpath': [l.attrib['href']for l in tree.xpath("//h2[@class='archive-titles']/a")]
    },
    'Newspaper2.1': {
        'name':'100% Noticias',
        'url':'https://100noticias.com.ni/nacionales/?page=',
        'pages':747,
        'titlepath': [l.text for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a//h5") +
                                       tree.xpath("//div[@class='col-6 col-md-4']/a//h5"))],
        'linkpath': ["https://100noticias.com.ni" + l.attrib['href']
                     for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a") + 
                               tree.xpath("//div[@class='col-6 col-md-4']/a"))]
    },
    'Newspaper2.2': {
        'name':'100% Noticias',
        'url':'https://100noticias.com.ni/economia/?page=',
        'pages':73,
        'titlepath': [l.text for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a//h5") +
                                       tree.xpath("//div[@class='col-6 col-md-4']/a//h5"))],
        'linkpath': ["https://100noticias.com.ni" + l.attrib['href']
                     for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a") + 
                               tree.xpath("//div[@class='col-6 col-md-4']/a"))]
    },
    'Newspaper2.3': {
        'name':'100% Noticias',
        'url':'https://100noticias.com.ni/politica/?page=',
        'pages':114,
        'titlepath': [l.text for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a//h5") +
                                       tree.xpath("//div[@class='col-6 col-md-4']/a//h5"))],
        'linkpath': ["https://100noticias.com.ni" + l.attrib['href']
                     for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a") + 
                               tree.xpath("//div[@class='col-6 col-md-4']/a"))]
    }
}

In [None]:
def linkscraper(page):
    

In [102]:
baseurl = 'https://100noticias.com.ni/nacionales/?page='
linklist = []
titlelist = []

for i in range(1, 3):
    url = baseurl+str(i)
    source = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
    tree = html.fromstring(source)
    links = ["https://100noticias.com.ni" + l.attrib['href']for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a") + tree.xpath("//div[@class='col-6 col-md-4']/a"))]
    titles = [l.text for l in (tree.xpath("//div[@class='col-md-6 m-bottom-10']//a//h5") + tree.xpath("//div[@class='col-6 col-md-4']/a//h5"))]
    [linklist.append(x) for x in links]
    [titlelist.append(x) for x in titles]
    sleep(randint(3, 6))

In [10]:
baseurl = 'https://www.vostv.com.ni/nacionales/?page=2'
source = requests.get(baseurl, headers={"User-Agent": "Mozilla/5.0"}).text
tree = html.fromstring(source)
titles = [l.text for l in tree.xpath("//section[@class='secondary-news']//h3")]
links = [l.attrib['href'] for l in tree.xpath("//a[@class='card-title']")]

In [145]:
soup = BeautifulSoup(source, "html.parser")
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html lang="es">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<title>Canal 6 Nicaragua - Noticias Nacionales</title>
<meta content="Infórmese aquí con las noticias nacionales en nicaragua que relatan con veracidad la realidad. Encuentre fotos, videos crónicas y reportajes." name="description">
<link href="https://canal6.com.ni/category/nacionales/page/2/" rel="canonical">
<link href="https://canal6.com.ni/category/nacionales/" rel="prev"/>
<link href="https://canal6.com.ni/category/nacionales/page/3/" rel="next"/>
<meta content="es_ES" property="og:locale"/>
<meta content="article" property="og:type"/>
<meta content="Canal 6 Nicaragua - Noticias Nacionales" property="og:title"/>
<meta content="Infórmese aquí 