# Web Scraping The Times UK (without API)

## Putting search results from multiple search pages from the same site in one dataframe

In [1]:
# Libraries

from requests_html import HTMLSession
import pandas as pd

pd.set_option("display.max_columns",101)



In [2]:
# Collecting links to all articles based on keyword and search page number from the Spectator

def news_articles_from_given_page (keyword, page_number):
    '''Collects the links of the search results from a given search page for a given term'''
    ht_url = 'https://www.thetimes.co.uk/search?p=' + str(page_number) + '&q=' + keyword + '&source=search-page'
    print(ht_url)
    session = HTMLSession()
    r = session.get(ht_url)
    articles_url =[] # empty list to collect the links into
    for i in r.html.find('h2'):
        for link in i.links:
            link = 'https://www.thetimes.co.uk' + link
            articles_url.append(link)
    return articles_url

In [3]:
# Test

news_articles_from_given_page('Trump',1)

https://www.thetimes.co.uk/search?p=1&q=Trump&source=search-page


['https://www.thetimes.co.uk/article/trump-ramps-up-drama-as-he-picks-his-supreme-judge-s3vlpztbm',
 'https://www.thetimes.co.uk/article/opponents-ousted-and-mocked-as-trump-hits-back-x60fbsl9n',
 'https://www.thetimes.co.uk/article/we-had-bigger-crowds-than-obama-insists-trump-team-hz7mcmgvw',
 'https://www.thetimes.co.uk/article/bmw-shares-go-into-reverse-after-trump-tax-threat-9jzgcfz82',
 'https://www.thetimes.co.uk/article/trump-legacy-is-pile-of-unwanted-history-j76p5ctdp',
 'https://www.thetimes.co.uk/article/far-right-puts-boot-into-trump-boycott-f2gvvd3bz',
 'https://www.thetimes.co.uk/article/trump-force-one-is-not-an-option-say-secret-service-kfnx376xk',
 'https://www.thetimes.co.uk/article/trump-gears-up-for-new-war-on-terror-led-by-maverick-general-68zhfxxxs',
 'https://www.thetimes.co.uk/article/trump-gg8kl7glf',
 'https://www.thetimes.co.uk/article/donald-trump-turns-fire-on-anthony-fauci-in-return-to-campaign-trail-whf3978nc']

In [4]:
# Putting news articles from multiple search pages in a list using the previously defined function

article_links=[]

for page_number in range(1, 60):
    new_links = news_articles_from_given_page('Trump', page_number)
    article_links = article_links + new_links

https://www.thetimes.co.uk/search?p=1&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=2&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=3&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=4&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=5&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=6&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=7&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=8&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=9&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=10&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=11&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=12&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=13&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=14&q=Trump&source=search-page
https://www.thetimes.co.uk/search?p=15&q=Trump&source=search-page
https://www.thetime

In [5]:
# Function that scrapes the first search result (one article among the many search results) from a single search page -> article title, author, date, text... (put in a dictionary, so we can later comply a json file.)
# so it opens the first find -> article_links[0]

def scrape_the_times_uk(url):
    session = HTMLSession()
    page=session.get(url)
    html = page.html

    title = html.find('h1')[0].text
    author_location = html.find('span')[13].text
    date = html.find('time')[0].text
   
    text = '' 
    
    for item in html.find('p'):
        text = text + ' ' + item.text
    
    d = {'title' : title,
         'author & location' : author_location,
         'date' : date,
         'text' : text}
    
    return d

In [6]:
article_links[0]

'https://www.thetimes.co.uk/article/trump-ramps-up-drama-as-he-picks-his-supreme-judge-s3vlpztbm'

In [7]:
scrape_the_times_uk(article_links[0])

{'title': 'Trump ramps up drama as he picks Supreme Court judge',
 'author & location': 'Ben Hoyle, Los Angeles',
 'date': 'Friday June 11 2021',
 'text': ' President Trump kicked off a fierce battle for control of the Supreme Court last night when he nominated a staunch conservative for the vacancy that Barack Obama had been prevented by Republicans from filling. With characteristic showmanship the former Apprentice host unveiled Neil Gorsuch as his chosen nominee on prime time television, having first ramped up the suspense by summoning both leading candidates for the lifetime appointment to the capital. Moments after he revealed Judge Gorsuch, 49, as his pick, Mr Trump stretched his arms out before a White House audience that included senior political figures, members of the president’s family and the widow of the late Justice Antonin Scalia, whose death created the vacancy. “So, was that a surprise? Was it?” Mr Trump demanded. © Times Newspapers Limited 2021. Registered in England 

In [15]:
# Loop to scrape all (a list of X) articles from multiple pages of the actual search results

articles =[]

for url in article_links:
    if 'thetimes' in url:
        print(url)
        details = scrape_the_times_uk(url)
        details['url'] = url
        articles.append(details)

https://www.thetimes.co.uk/article/trump-ramps-up-drama-as-he-picks-his-supreme-judge-s3vlpztbm
https://www.thetimes.co.uk/article/opponents-ousted-and-mocked-as-trump-hits-back-x60fbsl9n
https://www.thetimes.co.uk/article/we-had-bigger-crowds-than-obama-insists-trump-team-hz7mcmgvw
https://www.thetimes.co.uk/article/bmw-shares-go-into-reverse-after-trump-tax-threat-9jzgcfz82
https://www.thetimes.co.uk/article/trump-legacy-is-pile-of-unwanted-history-j76p5ctdp
https://www.thetimes.co.uk/article/far-right-puts-boot-into-trump-boycott-f2gvvd3bz
https://www.thetimes.co.uk/article/trump-force-one-is-not-an-option-say-secret-service-kfnx376xk
https://www.thetimes.co.uk/article/trump-gears-up-for-new-war-on-terror-led-by-maverick-general-68zhfxxxs
https://www.thetimes.co.uk/article/trump-gg8kl7glf
https://www.thetimes.co.uk/article/donald-trump-turns-fire-on-anthony-fauci-in-return-to-campaign-trail-whf3978nc
https://www.thetimes.co.uk/article/netanyahu-decries-biggest-fraud-in-history-of-de

https://www.thetimes.co.uk/article/donald-trump-puts-mar-a-lago-mansion-on-sale-for-49m-vx667dvdb
https://www.thetimes.co.uk/article/capitol-on-high-alert-for-qanon-second-coming-of-trump-conspiracy-fxr8fp9gz
https://www.thetimes.co.uk/article/david-cameron-rules-out-donald-trump-style-return-to-politics-xrvssd3sm
https://www.thetimes.co.uk/article/trumps-failure-to-declare-his-2024-candidacy-outright-suggests-a-man-in-two-minds-jxkhb3xpd
https://www.thetimes.co.uk/article/unite-behind-me-to-win-again-in-2024-donald-trump-tells-party-vh3n2s0wf
https://www.thetimes.co.uk/article/the-times-view-on-the-republicans-future-trumps-shadow-gkcgtnzg3
https://www.thetimes.co.uk/article/bad-news-for-donald-trump-jr-the-right-is-fast-tiring-of-trumps-msm9phdcz
https://www.thetimes.co.uk/article/qanon-prophecy-of-donald-trumps-return-puts-congress-on-alert-q72tcst6k
https://www.thetimes.co.uk/article/jose-mourinho-has-become-the-donald-trump-of-football-bwvtj99cn
https://www.thetimes.co.uk/article/

https://www.thetimes.co.uk/article/donald-trump-to-pardon-steve-bannon-in-final-acts-of-clemency-vlhbvk0vx
https://www.thetimes.co.uk/article/after-daddy-what-next-for-ivanka-and-the-trump-kids-hqz56gktf
https://www.thetimes.co.uk/article/iran-sanctions-trump-as-it-steps-up-war-games-in-the-gulf-5dj6stmgk
https://www.thetimes.co.uk/article/donald-trump-vows-weve-only-just-begun-before-leaving-washington-3mcqlkqzm
https://www.thetimes.co.uk/article/national-guardsmens-links-to-pro-trump-militias-dl968hmnx
https://www.thetimes.co.uk/article/tories-accused-of-trump-style-tactics-for-comparing-snp-to-nazis-l5s3kn6wk
https://www.thetimes.co.uk/article/burning-down-kenosha-trumps-fractured-legacy-wlt3pzqwz
https://www.thetimes.co.uk/article/what-did-donald-trump-achieve-his-presidency-in-numbers-5jsl3bkzk
https://www.thetimes.co.uk/article/trump-prepares-100-presidential-pardons-for-his-last-day-in-office-lckzfh2r5
https://www.thetimes.co.uk/article/trumps-twitter-ban-heralds-end-times-claim

https://www.thetimes.co.uk/article/president-trumps-assault-on-american-values-will-fail-96nj5mqcb
https://www.thetimes.co.uk/article/analysis-was-trump-the-architect-of-republicans-poor-performance-in-georgia-jlgvnrm9h
https://www.thetimes.co.uk/article/donald-trump-has-disgraced-tradition-and-that-will-be-his-lasting-legacy-vcxxjc6m2
https://www.thetimes.co.uk/article/rioters-attacked-the-hill-after-loyalists-deserted-president-h5zjlz26n
https://www.thetimes.co.uk/article/the-times-view-on-trumps-incitement-of-the-assault-on-the-capitol-day-of-infamy-l3gwr8bkh
https://www.thetimes.co.uk/article/smashing-glass-and-gunfire-as-mob-shames-us-democracy-7prr83dx9
https://www.thetimes.co.uk/article/end-this-seige-biden-urges-trump-gnb8mnqpj
https://www.thetimes.co.uk/article/christian-lindberg-2017-review-a-horrified-response-to-donald-trumps-arrival-as-president-x0mq8l7hl
https://www.thetimes.co.uk/article/donald-trump-piles-pressure-on-mike-pence-to-reverse-his-election-defeat-to-joe-bide

IndexError: list index out of range

In [16]:
df = pd.DataFrame(articles)
df.sample(5)

Unnamed: 0,title,author & location,date,text,url
142,What will Melania Trump do now? A smile replac...,The Sunday Times,Friday June 11 2021,A sour mood surely settled over Mar-a-Lago in...,https://www.thetimes.co.uk/article/what-will-m...
157,National guardsmen’s links to pro-Trump militias,"Jacqui Goddard, Miami",Friday June 11 2021,Two members of the US Army National Guard wer...,https://www.thetimes.co.uk/article/national-gu...
68,"Donald Trump lost $700m in office, report says","Charlie Mitchell, Ottawa",Friday June 11 2021,The presidency cost Donald Trump an estimated...,https://www.thetimes.co.uk/article/donald-trum...
21,Trump Organisation is under criminal investiga...,"Alistair Dawber, Washington",Friday June 11 2021,New York state’s attorney-general said last n...,https://www.thetimes.co.uk/article/investigati...
209,Gillian Bowditch: Nicola Sturgeon’s endgame wi...,The Sunday Times,Friday June 11 2021,Not long after Alex Salmond became first mini...,https://www.thetimes.co.uk/article/gillian-bow...


In [17]:
def has_Trump(string):
    if 'Trump'in string:
        return True
    else:
        return False

In [18]:
df['Trump'] = df['text'].apply(has_Trump)

In [19]:
df['Trump'].sum()

219

In [20]:
len(df)

250

In [21]:
df.to_csv(r'/Users/lilivandermeersch/CAB/Class Session 2 - 29:04:2021/output/The_Times_UK_Trump.csv', index = False)