In [2]:
import requests
from functools import reduce

from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd

# Scrape Bellingcat's Articles

This notebook aims to scrape all of Bellingcat's articles and output them to a file.

The steps include:
1. Looping over each month of each year and collecting URLS using BeautifulSoup
2. Using the same URL list with `newspaper3k` to obtain the article data
3. Save article data to a CSV file

## `newspaper3k` Example

In [3]:
url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
article = Article(url)

In [4]:
article.download()

In [5]:
article.parse()
article.authors

[]

In [6]:
article.publish_date

datetime.datetime(2013, 12, 30, 0, 0)

In [15]:
article.text

'By Leigh Ann Caldwell\n\nWASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014.\n\nSome 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibition in Illinois and transgender protections in California.\n\nAlthough many new laws are controversial, they made it through legislatures, public referendum or city councils and represent the shifting composition of American beliefs.\n\nFederal: Health care, of course, and vending machines\n\nThe biggest and most politically charged change comes at the federal level with the imposition of a new fee for those adults without health insurance.\n\nFor 2014, the penalty is either $95 per adult or 1% of family income, whichever results in a larger fine.\n\nThe Obamacare, of Affordable Care Act, mandate also requires that insurers cover immunizations and some preventive care.\n\nAdditionally, mil

In [7]:
article.summary

''

In [8]:
article.additional_data

{}

In [9]:
article.title

'New Year, new laws: Obamacare, pot, guns and drones'

## Get Month's Articles' URLs

In [11]:
BASE_URL = "https://www.bellingcat.com"
BELLINGCAT_START_YEAR = 2014  # earliest article on site

In [12]:
def list_months_article(year: int, month: int):
    url = f"{BASE_URL}/news/{year}/0{month}/"
    res = requests.get(url)
    articles = BeautifulSoup(res.content, "html.parser")
    news_item_tags = articles.find_all("div", {"class": "news_item__image"})

    create_object = lambda tag: {
        "year": year,
        "month": month,
        "url": tag.findChild("a")["href"],
    }

    return [create_object(t) for t in news_item_tags]


list_months_article(2023, 3)

[{'year': 2023,
  'month': 3,
  'url': 'https://www.bellingcat.com/news/2023/03/29/how-online-investigators-proved-video-of-ukrainian-soldiers-harassing-woman-was-staged/'},
 {'year': 2023,
  'month': 3,
  'url': 'https://www.bellingcat.com/news/2023/03/21/tiger-sheikhs-uae-royals-wildlife-shoots/'},
 {'year': 2023,
  'month': 3,
  'url': 'https://www.bellingcat.com/news/2023/03/03/ryodan-anime-teens-kremlin-russia-ukraine/'}]

In [13]:
def flatten_list(x, y):
    return x + y


def list_all_articles():
    nested_links = [list_years_articles(y) for y in range(BELLINGCAT_START_YEAR, 2024)]
    return reduce(flatten_list, nested_links)


# TODO refactor
def list_years_articles(year: int):
    nested_links = [list_months_article(year, i) for i in range(1, 13)]
    return reduce(flatten_list, nested_links)


list_years_articles(2023)

[{'year': 2023,
  'month': 1,
  'url': 'https://www.bellingcat.com/news/2023/01/27/anatomy-of-a-shelling-how-russian-rocket-artillery-struck-mykolaiv/'},
 {'year': 2023,
  'month': 2,
  'url': 'https://www.bellingcat.com/news/2023/02/24/russias-assault-on-daily-life-in-ukraine/'},
 {'year': 2023,
  'month': 2,
  'url': 'https://www.bellingcat.com/news/2023/02/21/borderless-vigilantism-the-nativist-us-militias-entering-mexico/'},
 {'year': 2023,
  'month': 2,
  'url': 'https://www.bellingcat.com/news/uk-and-europe/2023/02/20/ukraine-war-anniversary-one-year/'},
 {'year': 2023,
  'month': 2,
  'url': 'https://www.bellingcat.com/news/2023/02/13/how-wagner-gave-three-90s-russian-crime-bosses-a-new-lease-of-death/'},
 {'year': 2023,
  'month': 2,
  'url': 'https://www.bellingcat.com/news/2023/02/03/wanted-by-interpol-relaxing-in-dubai-geolocating-isabel-dos-santos-life-of-luxury/'},
 {'year': 2023,
  'month': 3,
  'url': 'https://www.bellingcat.com/news/2023/03/29/how-online-investigators-p

In [14]:
all_articles = list_all_articles() # 29s to run

In [15]:
df = pd.DataFrame(all_articles)
df['path'] = df.url.apply(lambda x: x.split(BASE_URL, 1)[1])
df

Unnamed: 0,year,month,url,path
0,2014,7,https://www.bellingcat.com/news/uk-and-europe/...,/news/uk-and-europe/2014/07/31/did-coulsons-ne...
1,2014,7,https://www.bellingcat.com/news/uk-and-europe/...,/news/uk-and-europe/2014/07/30/the-context-of-...
2,2014,7,https://www.bellingcat.com/news/uk-and-europe/...,/news/uk-and-europe/2014/07/30/other-dark-arts...
3,2014,7,https://www.bellingcat.com/news/uk-and-europe/...,/news/uk-and-europe/2014/07/28/how-rebekah-bro...
4,2014,7,https://www.bellingcat.com/news/uk-and-europe/...,/news/uk-and-europe/2014/07/28/the-buk-that-co...
...,...,...,...,...
586,2023,8,https://www.bellingcat.com/news/2023/08/02/jen...,/news/2023/08/02/jenin-open-source-insights-on...
587,2023,9,https://www.bellingcat.com/news/2023/09/28/aze...,/news/2023/09/28/azerbaijan-consolidates-contr...
588,2023,9,https://www.bellingcat.com/news/2023/09/21/cha...,/news/2023/09/21/chaos-and-crisis-as-azerbaija...
589,2023,9,https://www.bellingcat.com/news/2023/09/13/us-...,/news/2023/09/13/us-neo-nazi-says-he-fought-in...


In [16]:
first_5 = df.url[:5]
first_5

0    https://www.bellingcat.com/news/uk-and-europe/...
1    https://www.bellingcat.com/news/uk-and-europe/...
2    https://www.bellingcat.com/news/uk-and-europe/...
3    https://www.bellingcat.com/news/uk-and-europe/...
4    https://www.bellingcat.com/news/uk-and-europe/...
Name: url, dtype: object

In [80]:
# df.to_csv("all-bellingcat-articles.csv", index=False)

## Parse Article

In [52]:
def get_article_text(url: str):
    article = Article(url)
    article.download()
    article.parse()

    return {
        "text": (
            article.text.split(article.title)[1]
            if article.title in article.text
            else article.text
        ),  # removes title from text if there
        "publish_date": article.publish_date,
        "title": article.title,
    }


t = get_article_text(
    "https://www.bellingcat.com/news/uk-and-europe/2014/07/31/did-coulsons-news-of-the-world-incite-others-to-commit-crimes-and-cause-unsafe-convictions/"
)
t

{'text': '\n\nMore on the Fake Sheikh, the Police, and News of the World by occasional blogger @jpublik.\n\nAndy Coulson‘s News of the World sent a man to jail after luring him to sell them drugs he was terrified of carrying by promising him a job. He was sentenced to four years in prison before his conviction was quashed – after he’d already served his time.\n\nIn a case which has hardly received any publicity, according to high court documents, Albanian Besnik Qema was asked to supply News of the World cocaine and a passport on a promise of job as security for a wealthy Arab family.\n\nThe High Court documents detail how in January 2005, Mazher Mahmood had asked Florim Gashi, a contact of his who he had used in previous “set-up” stings to find someone who could be implicated in a story he or the News of the World wanted to run about false passports, drugs and guns. Gashi then adopted the identity of a female called Aurora and through an internet chat room used by expatriate Albanians

In [53]:
articles_text = df.url.map(get_article_text)

In [60]:
extract_dict_key = lambda s, key: s.apply(lambda x: x[key])

df["articles_text"] = extract_dict_key(articles_text, "text")
df["publish_date"] = extract_dict_key(articles_text, "publish_date")
df["title"] = extract_dict_key(articles_text, "title")

df.to_csv("all-bellingcat-articles.csv", index=False)
