# 2020 Black Lives Matter Uprisings Media Analysis

This project uses social natural language processing to ask: How does mainstream media coverage of the 2020 Black Lives Matter uprisings, which followed the deaths of Nina Pop, Breonna Taylor, George Floyd, Tony McDade, Ahmaud Arbery, and others, perpetuate or interrupt anti-Blackness and carceral logics? At a critical moment for the movement, mainstream media has a large stake in how the uprisings are framed for the general population, and it has historically helped perpetuate and platform anti-Black rhetoric and carceral logics. Exposing anti-Blackness and its ties to carceral logics not only offers a way to deconstruct systems that do not serve Black communities in particular, but also opens space for a conversation about imagining new ones.

## Grabbing Data
The dataset is a corpus of articles scraped from major mainstream U.S. news sources, including the Associated Press, CNN, the San Francisco Chronicle, the Washington Post, and USA Today, collected by searching each source for “George Floyd protests”.

In [1]:
import requests
from newsapi import NewsApiClient
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd

In [2]:
# Parallel accumulators filled by the scraping functions in this notebook:
# position i in each list refers to the same article (body text, URL, title).
news_contents, list_links, list_titles = [], [], []

def parsing(articles_list, num, link_begin):
    """Scrape the first ``num`` linked articles into the shared global lists.

    For each anchor tag, the article page is downloaded and parsed with
    html5lib. The text of every ``<p>`` inside the first
    ``<div class="Article">`` is joined with single spaces, and one entry is
    appended to each of ``list_links``, ``list_titles`` and ``news_contents``.

    Parameters
    ----------
    articles_list : sequence of bs4 tags
        Anchor elements that carry an ``href`` attribute and contain an
        ``<h1>`` headline.
    num : int
        How many leading entries of ``articles_list`` to process.
    link_begin : str
        Prefix prepended verbatim to each ``href`` to build the article URL.

    Notes
    -----
    An anchor without an ``<h1>``, or a page without an ``Article`` container,
    is skipped entirely. The three lists are only appended to once the whole
    article has been extracted, so they always stay the same length.
    """
    for n in range(num):
        anchor = articles_list[n]

        # Getting the title; without a headline the entry is not usable.
        headline = anchor.find('h1')
        if headline is None:
            continue

        # Getting the link of the article
        link = link_begin + anchor['href']

        # Reading the content (it is divided in paragraphs)
        response = requests.get(link)
        soup_article = BeautifulSoup(response.content, 'html5lib')
        body = soup_article.find_all('div', class_='Article')
        if not body:
            # Same policy as get_content: no article container, no entry.
            continue

        # Unifying the paragraphs. Joining once, outside any loop, yields an
        # empty string for a page without <p> tags instead of silently
        # re-using the previous article's text.
        final_article = " ".join(p.get_text() for p in body[0].find_all('p'))

        list_links.append(link)
        list_titles.append(headline.get_text())
        news_contents.append(final_article)

def make_soup(url, html_marker, class_name):
    """Fetch ``url`` and collect every ``html_marker`` tag with class ``class_name``.

    The page is parsed with html5lib. Returns a ``(matches, count)`` pair in
    which ``count`` is simply ``len(matches)``.
    """
    page = BeautifulSoup(requests.get(url).content, 'html5lib')
    matches = page.find_all(html_marker, class_=class_name)
    return matches, len(matches)

In [3]:
# Collect every <a class="Component-headline-0-2-106"> element on the AP News
# GeorgeFloyd page, together with how many were found.
# NOTE(review): the class name looks auto-generated and may change when the
# site is rebuilt -- confirm it still matches before re-running.
apnews_art, apnews_len = make_soup('https://apnews.com/GeorgeFloyd', 'a', 'Component-headline-0-2-106')

In [4]:
# Download each AP article collected by make_soup and append its link, title
# and body text to list_links / list_titles / news_contents.
parsing(apnews_art, apnews_len, "https://apnews.com/")

In [5]:
# Sanity check after the AP scrape: the three parallel lists should report
# the same length.
for label, values in (
    ("len of news_contents:", news_contents),
    ("len of list_links:", list_links),
    ("len of news_titles:", list_titles),
):
    print(label, len(values))

len of news_contents: 50
len of list_links: 50
len of news_titles: 50


In [6]:
def cnn_soup(url, start=1, stop=1000, step=100):
    """Page through a CNN search API URL and record every hit in the global lists.

    ``url`` must contain one ``{}`` placeholder, which is filled with the
    result offset for each request. For every item under the response's
    ``result`` key, its ``body``, ``url`` and ``headline`` fields are appended
    to ``news_contents``, ``list_links`` and ``list_titles`` respectively.

    Parameters
    ----------
    url : str
        Search endpoint template with a ``{}`` placeholder for the offset.
    start, stop, step : int, optional
        Offsets requested are ``range(start, stop, step)``. The defaults
        reproduce the original hard-coded ``range(1, 1000, 100)``.

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status.
    KeyError
        If a response or a result item lacks one of the expected keys.
    """
    # NOTE(review): the default start of 1 is kept for backward compatibility;
    # if the API's offset is zero-based this skips the first hit -- confirm.
    with requests.Session() as req:
        for offset in range(start, stop, step):
            response = req.get(url.format(offset))
            # Fail loudly on an HTTP error instead of on a confusing parse error.
            response.raise_for_status()
            results = response.json()['result']
            if not results:
                # Assumes offset pagination: an empty page means the hits are
                # exhausted, so later offsets would only return more empty pages.
                break
            for a in results:
                # Read all three fields before appending anything so a missing
                # key cannot leave the parallel lists with different lengths.
                body, link, headline = a["body"], a["url"], a["headline"]
                news_contents.append(body)
                list_links.append(link)
                list_titles.append(headline)
                
# Query CNN's search endpoint for "george floyd protest", newest first, with
# size=100 hits per request; the trailing `{}` is the placeholder cnn_soup
# fills with the result offset.
cnn_soup('https://search.api.cnn.io/content?q=george%20floyd%20protest&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}')

In [7]:
# Sanity check after the CNN scrape: the three parallel lists should report
# the same length.
for label, values in (
    ("len of news_contents:", news_contents),
    ("len of list_links:", list_links),
    ("len of news_titles:", list_titles),
):
    print(label, len(values))

len of news_contents: 390
len of list_links: 390
len of news_titles: 390


In [8]:
# The NewsAPI credential is read from the environment so it is never committed
# together with the notebook. A key that was previously hard-coded on this line
# should be treated as leaked and revoked.
import os

_newsapi_key = os.environ.get('NEWSAPI_KEY')
if not _newsapi_key:
    raise RuntimeError(
        "Set the NEWSAPI_KEY environment variable to your NewsAPI key "
        "before running this cell."
    )
newsapi = NewsApiClient(api_key=_newsapi_key)

In [9]:
# Ask NewsAPI for English-language sfchronicle.com articles matching
# "george floyd protests" between 2020-05-29 and 2020-06-28, sorted by
# relevancy. Only page 1 with page_size=100 is requested, so at most 100
# articles come back.
sfchron_articles = newsapi.get_everything(q="george floyd protests",
                                      domains='sfchronicle.com',
                                      from_param='2020-05-29',
                                      to='2020-06-28',
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=100,
                                      page=1)

In [11]:
# Pull the URL and headline out of every San Francisco Chronicle hit; both
# lists follow the order of sfchron_articles['articles'].
sfchron_links = [hit['url'] for hit in sfchron_articles['articles']]
sfchron_titles = [hit['title'] for hit in sfchron_articles['articles']]

In [13]:
def get_content(linkslst, titles, html_marker, marker_class):
    """Download each linked article and record it in the shared global lists.

    Every page is parsed with html5lib. When the page contains at least one
    ``html_marker`` element with class ``marker_class``, the text of all
    ``<p>`` tags inside the first such element is joined with single spaces
    and one entry is appended to each of ``list_titles``, ``list_links`` and
    ``news_contents``. Pages without a matching element are skipped.

    Parameters
    ----------
    linkslst : sequence of str
        Article URLs to fetch.
    titles : sequence of str
        Headlines, paired position-by-position with ``linkslst``.
    html_marker : str
        Tag name of the element that wraps the article body.
    marker_class : str
        CSS class of that wrapping element.
    """
    for link, title in zip(linkslst, titles):
        response = requests.get(link)
        soup_article = BeautifulSoup(response.content, 'html5lib')
        body = soup_article.find_all(html_marker, class_=marker_class)
        if not body:
            continue

        # Join once, outside any loop: a container without <p> tags yields an
        # empty string rather than re-using the previous article's text (or
        # raising NameError on the very first article).
        final_article = " ".join(p.get_text() for p in body[0].find_all('p'))

        list_titles.append(title)
        list_links.append(link)
        news_contents.append(final_article)

In [14]:
# Scrape the San Francisco Chronicle articles; get_content looks for the body
# text inside the first <section class="body"> of each page.
get_content(sfchron_links, sfchron_titles, 'section', 'body')


In [15]:
# Sanity check after the San Francisco Chronicle scrape: the three parallel
# lists should report the same length.
for label, values in (
    ("len of news_contents:", news_contents),
    ("len of list_links:", list_links),
    ("len of news_titles:", list_titles),
):
    print(label, len(values))

len of news_contents: 404
len of list_links: 404
len of news_titles: 404


In [23]:
# Ask NewsAPI for English-language washingtonpost.com articles matching
# "george floyd protests" between 2020-05-29 and 2020-06-28, sorted by
# relevancy. Only page 1 with page_size=100 is requested, so at most 100
# articles come back.
wapo_articles = newsapi.get_everything(q="george floyd protests",
                                      domains='washingtonpost.com',
                                      from_param='2020-05-29',
                                      to='2020-06-28',
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=100,
                                      page=1)

In [25]:
# Pull the URL and headline out of every Washington Post hit; both lists
# follow the order of wapo_articles['articles'].
wapo_links = [hit['url'] for hit in wapo_articles['articles']]
wapo_titles = [hit['title'] for hit in wapo_articles['articles']]

In [32]:
# Scrape the Washington Post articles; get_content looks for the body text
# inside the first <div class="article-body"> of each page.
get_content(wapo_links, wapo_titles, 'div', 'article-body')

In [33]:
# Sanity check after the Washington Post scrape: the three parallel lists
# should report the same length.
for label, values in (
    ("len of news_contents:", news_contents),
    ("len of list_links:", list_links),
    ("len of news_titles:", list_titles),
):
    print(label, len(values))

len of news_contents: 501
len of list_links: 501
len of news_titles: 501


In [46]:
# Ask NewsAPI for English-language usatoday.com articles matching
# "george floyd protests" between 2020-05-29 and 2020-06-28, sorted by
# relevancy. Only page 1 with page_size=100 is requested, so at most 100
# articles come back.
usa_articles = newsapi.get_everything(q="george floyd protests",
                                      domains='usatoday.com',
                                      from_param='2020-05-29',
                                      to='2020-06-28',
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=100,
                                      page=1)

In [47]:
# Pull the URL and headline out of every USA Today hit; both lists follow the
# order of usa_articles['articles'].
usa_links = [hit['url'] for hit in usa_articles['articles']]
usa_titles = [hit['title'] for hit in usa_articles['articles']]

In [50]:
# Scrape the USA Today articles; get_content looks for the body text inside
# the first <div class="gnt_ar_b"> of each page.
get_content(usa_links, usa_titles, 'div', 'gnt_ar_b')

In [57]:
# Final sanity check after the USA Today scrape: the three parallel lists
# should report the same length.
for label, values in (
    ("len of news_contents:", news_contents),
    ("len of list_links:", list_links),
    ("len of list_titles:", list_titles),
):
    print(label, len(values))

len of news_contents: 590
len of list_links: 590
len of list_titles: 590


In [58]:
# Peek at the most recently appended title. Indexing from the end (rather than
# a hard-coded 589) keeps this cell working when a re-run of the scrape
# returns a different number of articles.
list_titles[-1]

"Beyoncé drops 'Black Parade' on Juneteenth, proceeds to benefit Black-owned businesses"

In [60]:
# Assemble one row per article (title, link, body text) from the three
# parallel lists and save the corpus to articles.csv without the index column.
df = pd.DataFrame(
    data=[*zip(list_titles, list_links, news_contents)],
    columns=['title', 'link', 'article_body'],
)
df.to_csv(path_or_buf='articles.csv', index=False)