Obtain Daily Caller politics article URLs

In [6]:
#Import necessary libraries
import time
import requests
import urllib
import lxml.html
from bs4 import BeautifulSoup
import newspaper
import trafilatura
import re

In [7]:
def obtain_page_urls(page="1"):
    """ 
    Collect the article urls listed on one page of the Daily Caller politics
    section.

    Inputs:
    - page (str or int): page number of the politics section to fetch
    Return:
    - articles_set (set): unique article urls found on the page, each built as
      "dailycaller.com" + the href (no scheme is prepended)
    - year (int): year taken from the url of the last article on the page
      whose url contains a /YYYY/ path segment
    Raises:
    - requests.HTTPError: the page request returned an error status
    - IndexError: no usable article link was found on the page
    - ValueError: none of the collected urls contains a four-digit year
    """
    url = "https://dailycaller.com/section/politics/page/{}/".format(page)
    # A timeout keeps one stalled connection from hanging a long crawl.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    root = lxml.html.fromstring(resp.text)

    article_list = []
    for article in root.cssselect("article.relative"):
        anchors = article.cssselect("a")
        link = anchors[0].get("href") if anchors else None
        if not link:
            # Article element without a usable link; nothing to record.
            continue
        # Some articles in the Daily Caller politics section are hosted on
        # another site, checkyourfact, and we drop these (http or https,
        # with or without "www.").
        if re.match(r"(?:https?:)?//(?:www\.)?checkyourfact", link):
            continue
        article_list.append("dailycaller.com" + link)

    if not article_list:
        raise IndexError("No article links found on {}".format(url))

    # Walk backwards so the year comes from the last article that has one.
    year = None
    for full_link in reversed(article_list):
        match = re.search(r"/(\d{4})/", full_link)
        if match:
            year = match.group(1)
            break
    if year is None:
        raise ValueError("No year found in article urls on {}".format(url))

    print("Year:",year)
    articles_set = set(article_list)

    return articles_set, int(year)
    

# Spot-check: fetch the urls listed on page 85 of the politics section.
obtain_page_urls("85")

Obtaining results for: https://dailycaller.com/section/politics/page/85/
Year: 2023


({'dailycaller.com/2023/02/27/14-million-lose-medicaid-coverage-public-health-emergency/',
  'dailycaller.com/2023/02/27/2024-gop-hopefuls-attend-donor-retreat-trump-not-invited/',
  'dailycaller.com/2023/02/27/biden-announces-new-director-office-public-engagement-stephen-benjamin/',
  'dailycaller.com/2023/02/27/donald-trump-joe-biden-china-imports/',
  'dailycaller.com/2023/02/27/fords-electric-car-poisoning-amazon/',
  'dailycaller.com/2023/02/27/foreign-affairs-committee-tiktok-ban-michael-mccaul-joe-biden/',
  'dailycaller.com/2023/02/27/jeb-bush-ron-desantis-donald-trump-2024-race/',
  'dailycaller.com/2023/02/27/jim-banks-cisneros-army-covid-vaccine/',
  'dailycaller.com/2023/02/27/joe-biden-fcc-nominee-gigi-sohn-establish-truth-reconciliation-commission-2020-blm-riots/',
  'dailycaller.com/2023/02/27/john-fetterman-office-health-update-clinical-depression/',
  'dailycaller.com/2023/02/27/judy-chu-all-america-chinese-youth-federation-ccp-ufwd-forums-for-peaceful-reunification-of

In [9]:
def obtain_politics_url(page = 1,year=2023, delay=.5):
    """ 
    Walk the politics section page by page, starting at `page`, and collect
    article urls until a page's reported year is older than `year`.

    Inputs:
    - page (int): first page to fetch
    - year (int): oldest year to keep crawling for; the walk stops after the
      first page whose reported year is below this value (the links of that
      page are still included)
    - delay (float): seconds to sleep after each page request
    Return:
    - links_set (set): unique urls gathered from every fetched page
    - page (int): number of the next page that has not been fetched yet
    """
    links_set = set()
    # Always fetch the starting page, then let the year reported by each page
    # decide whether to continue. Seeding the loop with a fixed year would
    # skip the crawl entirely for any `year` later than that seed.
    while True:
        print("Obtaining results for page", page)
        page_links, article_year = obtain_page_urls(str(page))
        links_set.update(page_links)
        page += 1
        time.sleep(delay)
        if article_year < year:
            break

    print(len(links_set))

    return links_set, page


Obtaining results for page 118
Obtaining results for: https://dailycaller.com/section/politics/page/118/
Year: 2022
Obtaining results for page 119
Obtaining results for: https://dailycaller.com/section/politics/page/119/


KeyboardInterrupt: 

In [11]:
def crawl_politcs_url(min_year = 2016, max_year = 2023):
    """
    Collect Daily Caller politics-section urls year by year, newest first.

    Starting from page 1 and `max_year`, it calls obtain_politics_url for each
    year down to `min_year`, resuming each year at the page where the previous
    year stopped.

    Inputs:
    - min_year (int): oldest year to crawl (inclusive)
    - max_year (int): newest year to crawl (inclusive)
    Return:
    - politics_set (set): union of the urls returned for every year
    """
    politics_set = set()
    page = 1
    for year in range(max_year, min_year - 1, -1):
        print("Obtainings links for", year)
        # obtain_politics_url returns the next page it has NOT fetched yet, so
        # it is passed along unchanged; incrementing it again would skip one
        # listing page at every year boundary.
        year_set, page = obtain_politics_url(page,year)
        politics_set.update(year_set)

    return politics_set

# Run the crawl from 2023 back through 2022 (network-bound: one request per listing page).
crawl_politcs_url(2022)



Obtainings links for 2023
Obtaining results for page 1
Obtaining results for: https://dailycaller.com/section/politics/page/1/
Year: 2023
Obtaining results for page 2
Obtaining results for: https://dailycaller.com/section/politics/page/2/
Year: 2023
Obtaining results for page 3
Obtaining results for: https://dailycaller.com/section/politics/page/3/
Year: 2023
Obtaining results for page 4
Obtaining results for: https://dailycaller.com/section/politics/page/4/
Year: 2023
Obtaining results for page 5
Obtaining results for: https://dailycaller.com/section/politics/page/5/
Year: 2023
Obtaining results for page 6
Obtaining results for: https://dailycaller.com/section/politics/page/6/
Year: 2023
Obtaining results for page 7
Obtaining results for: https://dailycaller.com/section/politics/page/7/
Year: 2023
Obtaining results for page 8
Obtaining results for: https://dailycaller.com/section/politics/page/8/
Year: 2023
Obtaining results for page 9
Obtaining results for: https://dailycaller.com/se

{'dailycaller.com/2022/05/17/hispanic-gop-house-campaign-outreach/',
 'dailycaller.com/2022/05/27/patriots-live-chat-carrie-severino-news-discussion-roe-wade-abortion/',
 'dailycaller.com/2022/04/07/obama-biden-instagram-accounts/',
 'dailycaller.com/2022/06/08/topless-women-wnba-protest-woman-hating-fascists-scotus-roe-v-wade-abortion/',
 'dailycaller.com/2022/01/14/biden-press-conference-one-year-white-house-psaki/',
 'dailycaller.com/2022/08/01/its-embarrassingly-hypocritical-dem-rep-bashes-partys-primary-meddling/',
 'dailycaller.com/2022/10/13/jan-6-committee-votes-subpoena-donald-trump/',
 'dailycaller.com/2022/06/09/marco-rubios-office-responds-david-hogg-false-claim/',
 'dailycaller.com/2023/02/15/americans-businesses-politics-poll-esg/',
 'dailycaller.com/2022/08/09/michael-bennet-fishing-license-campaign-ad-colorado/',
 'dailycaller.com/2023/07/07/val-hoyle-oregon-nrcc-releases-ad-slamming-freshman-democrat-shady-ties-weed-industry/',
 'dailycaller.com/2022/01/11/joe-manchin-