In [1]:
# importing dependencies for MediaCloud API
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import mediacloud.api
from dateutil import parser

# loading config from .env file
# (later cells read MC_API_KEY with os.getenv, so this must run before them)
load_dotenv()

True

In [4]:
# reading API key from .env
mc_api_key = os.getenv("MC_API_KEY")

# os.getenv returns None when the variable is missing; stop here with a clear
# message instead of handing None to the API client
if not mc_api_key:
    raise RuntimeError("MC_API_KEY is not set; define it in the .env file or the environment.")

# instantiating MediaCloud API
mc = mediacloud.api.MediaCloud(mc_api_key)

# last expression of the cell: shows the installed client version
mediacloud.__version__

'3.12.3'

In [5]:
# verifying API connection by printing some high-level stats
# (uses the `mc` client created in the previous cell; the returned dict is the cell output)
mc.stats()

{'active_crawled_feeds': 163967,
 'active_crawled_media': 58815,
 'daily_downloads': 1182747,
 'daily_stories': 722806,
 'mediacloud_stats_id': 722,
 'stats_date': '2021-09-26',
 'total_downloads': 0,
 'total_sentences': 0,
 'total_stories': 1980746752}

In [None]:
# making dict of sources and source id's
# keys are display names (regional papers carry a state suffix), values are the
# numeric ids interpolated into `media_id:<id>` queries.
# Entries that are commented out have no id filled in and are therefore skipped.
sources = {
    "Washington Post": 2,
    "New York Times": 1,
    "USA Today": 4,
    "LA Times": 6,
    "Bloomberg": 40944,
    "Christian Science Monitor": 3,
#     "New York Daily News": ,
#     "Star Ledger (NJ)": ,
    "The Philadelphia Inquirer (PA)": 18,
    "The Pittsburgh Post-Gazette (PA)": 45,
    "PennLive/Patriot-News (PA)": 662863,
#     "The Day (CT)": ,
    "Hartford Courant (CT)": 59,
    "Baltimore Sun (MD)": 34,
#     "The Capital Gazette (MD)": ,
    "The Virginian-Pilot (VA)": 55,
    "The Richmond Times-Dispatch (VA)": 53,
    "Boston Globe (MA)": 15,
    "Portland Press Herald (ME)": 366984,
#     "Houston Chronicle (TX)": ,
    "Austin American Statesman (TX)": 62,
    "Atlanta Journal Constitution (GA)": 22916
#     "Raleigh News and Observer (NC)": ,
#     "The Cleveland Plain Dealer (OH)": ,
}

In [None]:
# searching for 9/11 in each source
# (needs the `mc` client and the `sources` dict from the cells above)
for source, media_id in sources.items():
    # one query for stories mentioning "9/11", one for every story of the outlet
    test_query = f'"9/11" and media_id:{media_id}'
    total_query = f'media_id:{media_id}'
    num_stories = mc.storyCount(test_query)['count']
    total_stories = mc.storyCount(total_query)['count']
    print(f"{source}:")
    print(f"{num_stories} stories about 9/11, {total_stories} total")
    # an outlet with zero stories would raise ZeroDivisionError; skip the
    # percentage line in that case and just emit the blank separator
    if total_stories:
        print(f"{(num_stories/total_stories)*100}% stories are about 9/11\n")
    else:
        print("")

In [2]:
# building a function to search a string among the sources given
def search_sources(query, date_range=None, api_key=None, verbose=False, sources=None):
    """Count MediaCloud stories matching a phrase, per news source.

    For every source two story counts are requested from the MediaCloud API:
    one for ``"<query>" and media_id:<id>`` and one for ``media_id:<id>``.
    The share of matching stories is reported as a percentage ("attention").
    A line describing the query is always printed.

    Parameters
    ----------
    query : object
        Search phrase. It is converted with ``str()`` and wrapped in double
        quotes inside the API query.
    date_range : sequence of two date strings, optional
        ``[start, end]``; each entry is parsed with ``dateutil.parser.parse``
        and the pair is turned into a filter via ``mc.dates_as_query_clause``.
        When omitted, the counts are not restricted by date.
    api_key : str, optional
        MediaCloud API key. When falsy, the ``MC_API_KEY`` environment
        variable is used instead.
    verbose : bool, default False
        When true, print the counts (and the attention, if defined) for each
        source while querying.
    sources : dict, optional
        Mapping of display name -> MediaCloud media id. Defaults to the
        built-in list of outlets below.

    Returns
    -------
    pandas.DataFrame
        One row per source, indexed by media id (index name
        ``"MediaCloud ID"``), with columns ``"Name"``, ``"Relevant Stories"``,
        ``"Total Stories"`` and ``"Attention (%)"``. Attention is NaN when a
        source has zero total stories.

    Raises
    ------
    AssertionError
        If ``date_range`` is given but does not contain exactly two entries.
    """
    # ensuring the query is a string
    query = str(query)

    # defining API key, instantiating MediaCloud API
    # if no API key is passed, function assumes API key is defined in .env file as MC_API_KEY
    if not api_key:
        api_key = os.getenv("MC_API_KEY")
    mc = mediacloud.api.MediaCloud(api_key)

    # extra positional arguments for storyCount: empty, or the date filter clause.
    # Unpacking this tuple keeps a single call site for both cases.
    count_filters = ()
    if date_range:
        assert len(date_range) == 2, "Please provide both a start and end date for the date interval."
        start_date = parser.parse(date_range[0])
        end_date = parser.parse(date_range[1])
        count_filters = (mc.dates_as_query_clause(start_date, end_date),)
        print(f"Query: {query}, between {start_date.strftime('%m/%d/%Y')} and {end_date.strftime('%m/%d/%Y')}\n")
    else:
        print(f"Query: {query}\n")

    # defining sources and ID's in MediaCloud API. Commented sources cannot be found in MediaCloud
    if sources is None:
        sources = {
            "Washington Post": 2,
            "New York Times": 1,
            "USA Today": 4,
            "LA Times": 6,
            "Bloomberg": 40944,
            "Christian Science Monitor": 3,
            # "New York Daily News": ,
            # "Star Ledger (NJ)": ,
            "The Philadelphia Inquirer (PA)": 18,
            "The Pittsburgh Post-Gazette (PA)": 45,
            "PennLive/Patriot-News (PA)": 662863,
            # "The Day (CT)": ,
            "Hartford Courant (CT)": 59,
            "Baltimore Sun (MD)": 34,
            # "The Capital Gazette (MD)": ,
            "The Virginian-Pilot (VA)": 55,
            "The Richmond Times-Dispatch (VA)": 53,
            "Boston Globe (MA)": 15,
            "Portland Press Herald (ME)": 366984,
            # "Houston Chronicle (TX)": ,
            "Austin American Statesman (TX)": 62,
            "Atlanta Journal Constitution (GA)": 22916,
            # "Raleigh News and Observer (NC)": ,
            # "The Cleveland Plain Dealer (OH)": ,
        }

    columns = ["Name", "Relevant Stories", "Total Stories", "Attention (%)"]

    # rows are collected keyed by media id and the frame is built once at the
    # end, instead of enlarging a DataFrame one .loc assignment at a time
    rows_by_id = {}

    # going through each source and querying relevant and total stories
    for source_name, media_id in sources.items():
        if verbose:
            print(f"{source_name}:")

        # defining queries for topic and total stories
        api_query = f'"{query}" and media_id:{media_id}'
        total_query = f'media_id:{media_id}'

        relevant_stories = mc.storyCount(api_query, *count_filters)['count']
        total_stories = mc.storyCount(total_query, *count_filters)['count']

        # attention is undefined for a source without any stories
        if total_stories:
            attention = (relevant_stories / total_stories) * 100
        else:
            attention = np.nan
        rows_by_id[media_id] = [source_name, relevant_stories, total_stories, attention]

        # printing story count and attention
        if verbose:
            print(f"{relevant_stories} stories about {query}, {total_stories} total")
            if not np.isnan(attention):
                print(f"{attention}% of stories are about {query}\n")
            else:
                print("")

    story_counts = pd.DataFrame(
        list(rows_by_id.values()),
        columns=columns,
        index=pd.Index(list(rows_by_id.keys()), name="MediaCloud ID"),
    )
    return story_counts

In [25]:
# calling function
# counts stories containing the phrase "9 11" between 8/1/2021 and 9/24/2021 for every source
df = search_sources("9 11", date_range=["8/1/2021", "9/24/2021"])
# sort_values returns a new frame, which is displayed as the cell output;
# `df` itself keeps the order produced by search_sources
df.sort_values(by="Attention (%)", ascending=False)

Query: 9 11, between 08/01/2021 and 09/24/2021



Unnamed: 0_level_0,Name,Relevant Stories,Total Stories,Attention (%)
MediaCloud ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,Christian Science Monitor,29,473,6.131078
4,USA Today,183,6311,2.899699
6,LA Times,90,3659,2.459688
1,New York Times,213,8772,2.428181
366984,Portland Press Herald (ME),113,4943,2.286061
15,Boston Globe (MA),46,2404,1.913478
40944,Bloomberg,4,231,1.731602
2,Washington Post,123,8536,1.440956
55,The Virginian-Pilot (VA),9,649,1.386749
34,Baltimore Sun (MD),40,3169,1.262228


In [6]:
# testing outputs/formats of dates
# (uses the module-level `mc` client; `test_range` is reused as the date filter
# by the storyList / wordCount cells that follow)
test_date1 = parser.parse("8/1/2021")
test_date2 = parser.parse("9/23/2021")
test_range = mc.dates_as_query_clause(test_date1, test_date2)

In [43]:
# pulling a random sample of stories that contain the search string from a
# single source (media_id 2), limited to the dates in `test_range`
# NOTE(review): the original note says 20 stories; no `rows` argument is passed,
# so that is presumably the client's default page size — TODO confirm
query = '"9 11" and media_id:2'
stories = mc.storyList(query, test_range, sort=mc.SORT_RANDOM)

# one row per returned story
pd.DataFrame(stories)

Unnamed: 0,ap_syndicated,collect_date,feeds,guid,language,media_id,media_name,media_url,processed_stories_id,publish_date,stories_id,story_tags,title,url,word_count,metadata
0,False,2021-09-04 05:34:31.637144,,https://www.washingtonpost.com/history/2021/09...,en,2,Washington Post,http://washingtonpost.com,2436697792,2021-09-03 07:00:21,2032761840,"[{'stories_id': 2032761840, 'tag': 'nyt_labell...","After 9/11, Kenneth Feinberg was asked to do t...",https://www.washingtonpost.com/history/2021/09...,,"{'date_guess_method': None, 'extractor_version..."
1,False,2021-09-09 14:41:22.570298,,https://www.washingtonpost.com/local/when-the-...,en,2,Washington Post,http://washingtonpost.com,2441925125,2021-09-09 14:23:29,2038000406,"[{'stories_id': 2038000406, 'tag': 'nyt_labell...",When the counselor needs help: ‘I didn’t get a...,https://www.washingtonpost.com/local/when-the-...,,"{'date_guess_method': None, 'extractor_version..."
2,False,2021-09-10 11:38:51.386772,,https://www.washingtonpost.com/religion/americ...,en,2,Washington Post,http://washingtonpost.com,2442848935,2021-09-10 11:23:53,2038942716,"[{'stories_id': 2038942716, 'tag': 'geonames_6...",America and US Muslims have come a long way si...,https://www.washingtonpost.com/religion/americ...,,"{'date_guess_method': None, 'extractor_version..."
3,False,2021-09-09 12:36:44.641478,,https://www.washingtonpost.com/opinions/2021/0...,en,2,Washington Post,http://washingtonpost.com,2441806862,2021-09-09 11:25:06,2037879379,"[{'stories_id': 2037879379, 'tag': 'nyt_labell...",Christopher Wray: Hard-earned lessons from 9/1...,https://www.washingtonpost.com/opinions/2021/0...,,"{'date_guess_method': None, 'extractor_version..."
4,False,2021-09-10 18:37:10.505337,,https://www.washingtonpost.com/opinions/2021/0...,en,2,Washington Post,http://washingtonpost.com,2443160952,2021-09-10 06:16:33,2039257365,"[{'stories_id': 2039257365, 'tag': 'nyt_labell...",The 9/11 anniversary should make us think abou...,https://www.washingtonpost.com/opinions/2021/0...,,"{'date_guess_method': None, 'extractor_version..."
5,False,2021-09-10 11:07:05.674276,,https://www.washingtonpost.com/opinions/2021/0...,en,2,Washington Post,http://washingtonpost.com,2442816069,2021-09-10 10:03:48,2038908719,"[{'stories_id': 2038908719, 'tag': 'nyt_labell...","20 years later, Americans kid themselves if th...",https://www.washingtonpost.com/opinions/2021/0...,,"{'date_guess_method': None, 'extractor_version..."
6,False,2021-09-10 07:10:48.671899,,https://www.washingtonpost.com/world/2021/09/1...,en,2,Washington Post,http://washingtonpost.com,2442586061,2021-09-10 07:00:00,2038672890,"[{'stories_id': 2038672890, 'tag': 'nyt_labell...","Xenophobia, vulnerability and disillusionment:...",https://www.washingtonpost.com/world/2021/09/1...,,"{'date_guess_method': None, 'extractor_version..."
7,False,2021-09-10 08:09:43.460915,,https://www.washingtonpost.com/sports/2021/09/...,en,2,Washington Post,http://washingtonpost.com,2442641866,2021-09-10 08:06:51.027886,2038730801,"[{'stories_id': 2038730801, 'tag': 'geonames_6...",How a Maryland punter from New York launched h...,https://www.washingtonpost.com/sports/2021/09/...,,"{'date_guess_method': None, 'extractor_version..."
8,False,2021-09-08 12:08:04.717429,,https://www.washingtonpost.com/religion/anti-s...,en,2,Washington Post,http://washingtonpost.com,2440691152,2021-09-08 11:33:40,2036746430,"[{'stories_id': 2036746430, 'tag': 'nyt_labell...",Anti-Sikh bigotry didn't start with 9/11. That...,https://www.washingtonpost.com/religion/anti-s...,,"{'date_guess_method': None, 'extractor_version..."
9,False,2021-09-09 09:15:15.137388,,https://www.washingtonpost.com/lifestyle/2021/...,en,2,Washington Post,http://washingtonpost.com,2441617704,2021-09-09 09:00:16,2037687936,"[{'stories_id': 2037687936, 'tag': 'nyt_labell...","How to talk to kids about 9/11, and why you sh...",https://www.washingtonpost.com/lifestyle/2021/...,,"{'date_guess_method': None, 'extractor_version..."


In [47]:
# word counts for stories from one source (media_id 2) that contain the search
# string, limited to the dates in `test_range`
query = '"9 11" and media_id:2'
word_count = mc.wordCount(query, test_range)
# pd.DataFrame(word_count)

# term/document matrix: first ask how many stories match, then request that
# many rows so the matrix is built over the full result set
story_count = mc.storyCount(query, test_range)["count"]
doc_term_matrix = mc.storyWordMatrix(query, test_range, rows=story_count)

# displayed below as a nested dict keyed by story id
doc_term_matrix["word_matrix"]

{'2012877805': {'0': 1,
  '1': 1,
  '10': 2,
  '11': 1,
  '12': 1,
  '13': 2,
  '14': 1,
  '15': 1,
  '16': 1,
  '17': 1,
  '18': 1,
  '19': 1,
  '2': 1,
  '20': 1,
  '3': 1,
  '4': 1,
  '5': 1,
  '6': 1,
  '7': 1,
  '8': 2,
  '9': 2},
 '2013969117': {'13': 1,
  '21': 1,
  '22': 1,
  '23': 1,
  '24': 1,
  '25': 1,
  '26': 1,
  '27': 1,
  '28': 1,
  '29': 1,
  '30': 1,
  '31': 1,
  '32': 1,
  '33': 1,
  '34': 1,
  '35': 1,
  '36': 1,
  '37': 1,
  '38': 1,
  '39': 1,
  '8': 1},
 '2014571755': {'13': 1,
  '17': 1,
  '3': 1,
  '40': 1,
  '41': 1,
  '42': 1,
  '43': 1,
  '44': 1,
  '45': 1,
  '46': 1,
  '5': 1,
  '6': 1,
  '8': 1,
  '9': 1},
 '2017661464': {'13': 1,
  '24': 1,
  '47': 1,
  '48': 1,
  '49': 1,
  '50': 1,
  '51': 1,
  '52': 1,
  '53': 1,
  '54': 1,
  '55': 2,
  '56': 1,
  '57': 1,
  '58': 1,
  '59': 1,
  '60': 1,
  '61': 1,
  '62': 1,
  '63': 1},
 '2019758487': {'1': 1,
  '100': 1,
  '101': 1,
  '102': 1,
  '103': 1,
  '104': 1,
  '105': 1,
  '106': 1,
  '107': 1,
  '108': 1,