In [12]:
# importing dependencies for MediaCloud API
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import mediacloud.api
from dateutil import parser

# Load environment variables from a local .env file; the MediaCloud API key
# (MC_API_KEY) is read from the environment further down, so it never has to
# be written into the notebook itself.
load_dotenv()

True

In [None]:
# Read the API key from the environment (populated from .env by load_dotenv()).
mc_api_key = os.getenv("MC_API_KEY")

# os.getenv returns None when the variable is unset; fail here with a clear
# message instead of building a client that only errors on its first request.
if not mc_api_key:
    raise RuntimeError(
        "MC_API_KEY is not set. Add it to your .env file or export it in the environment."
    )

# Instantiate the MediaCloud API client and show the installed client version.
mc = mediacloud.api.MediaCloud(mc_api_key)
mediacloud.__version__

In [None]:
# Sanity check: request the API's high-level stats to verify that the key and
# the connection work before running any real queries.
mc.stats()

In [None]:
# Outlet name -> MediaCloud media ID. Insertion order is the order in which the
# outlets are queried and printed by the loop that iterates over this dict.
sources = {
    # outlets listed without a state tag
    "Washington Post": 2,
    "New York Times": 1,
    "USA Today": 4,
    "LA Times": 6,
    "Bloomberg": 40944,
    "Christian Science Monitor": 3,
    # outlets tagged with their state
    "The Philadelphia Inquirer (PA)": 18,
    "The Pittsburgh Post-Gazette (PA)": 45,
    "PennLive/Patriot-News (PA)": 662863,
    "Hartford Courant (CT)": 59,
    "Baltimore Sun (MD)": 34,
    "The Virginian-Pilot (VA)": 55,
    "The Richmond Times-Dispatch (VA)": 53,
    "Boston Globe (MA)": 15,
    "Portland Press Herald (ME)": 366984,
    "Austin American Statesman (TX)": 62,
    "Atlanta Journal Constitution (GA)": 22916,
}
# Outlets with no media ID filled in yet (left out of the dict):
#   New York Daily News, Star Ledger (NJ), The Day (CT), The Capital Gazette (MD),
#   Houston Chronicle (TX), Raleigh News and Observer (NC),
#   The Cleveland Plain Dealer (OH)

In [None]:
# Quick exploration: for every source, count the stories matching "9/11" and the
# total number of stories, then report the share of matching stories.
# NOTE(review): Solr boolean operators are conventionally upper-case ("AND");
# the lowercase form is kept so the counts stay comparable with earlier runs — confirm.
for source, media_id in sources.items():
    test_query = f'"9/11" and media_id:{media_id}'
    total_query = f'media_id:{media_id}'
    num_stories = mc.storyCount(test_query)['count']
    total_stories = mc.storyCount(total_query)['count']
    print(f"{source}:")
    print(f"{num_stories} stories about 9/11, {total_stories} total")
    if total_stories:
        print(f"{(num_stories/total_stories)*100}% stories are about 9/11\n")
    else:
        # A source with zero stories would raise ZeroDivisionError; skip the
        # percentage and just emit the blank separator line, mirroring what
        # search_sources does in verbose mode.
        print("")

In [37]:
# Default outlets (name -> MediaCloud media ID) queried by search_sources.
# Insertion order is the row order of the returned table.
_DEFAULT_SOURCES = {
    "Washington Post": 2,
    "New York Times": 1,
    "USA Today": 4,
    "LA Times": 6,
    "Bloomberg": 40944,
    "Christian Science Monitor": 3,
    "The Philadelphia Inquirer (PA)": 18,
    "The Pittsburgh Post-Gazette (PA)": 45,
    "PennLive/Patriot-News (PA)": 662863,
    "Hartford Courant (CT)": 59,
    "Baltimore Sun (MD)": 34,
    "The Virginian-Pilot (VA)": 55,
    "The Richmond Times-Dispatch (VA)": 53,
    "Boston Globe (MA)": 15,
    "Portland Press Herald (ME)": 366984,
    "Austin American Statesman (TX)": 62,
    "Atlanta Journal Constitution (GA)": 22916,
    # Could not be found in MediaCloud, so left out:
    #   New York Daily News, Star Ledger (NJ), The Day (CT),
    #   The Capital Gazette (MD), Houston Chronicle (TX),
    #   Raleigh News and Observer (NC), The Cleveland Plain Dealer (OH)
}


# building a function to search a string among the sources given
def search_sources(query, date_range=None, api_key=None, verbose=False, sources=None):
    """Count stories matching ``query`` for each source and compute its share of coverage.

    Parameters
    ----------
    query : object
        Search phrase; converted with ``str()`` and wrapped in double quotes
        inside the API query.
    date_range : sequence of two date strings, optional
        ``[start, end]``; each entry is parsed with ``dateutil.parser.parse``.
        When omitted (or empty), no date filter is applied.
    api_key : str, optional
        MediaCloud API key. Falls back to the ``MC_API_KEY`` environment variable.
    verbose : bool, default False
        Print the per-source counts while querying.
    sources : dict, optional
        Mapping of outlet name -> MediaCloud media ID. Defaults to the built-in
        outlet list. IDs should be unique, since they become the table index.

    Returns
    -------
    pandas.DataFrame
        One row per source, indexed by "MediaCloud ID", with columns "Name",
        "Relevant Stories", "Total Stories" and "Attention (%)". Attention is
        NaN for a source whose total story count is zero.

    Raises
    ------
    AssertionError
        If ``date_range`` is given but does not contain exactly two entries.
    """
    # ensuring the query is a string
    query = str(query)

    # if no API key is passed, assume it is defined in the .env file as MC_API_KEY
    if not api_key:
        api_key = os.getenv("MC_API_KEY")
    mc = mediacloud.api.MediaCloud(api_key)

    if sources is None:
        sources = _DEFAULT_SOURCES

    # Extra positional argument(s) for storyCount: the date filter clause when a
    # range was given, nothing otherwise. This keeps a single call site below
    # while issuing exactly the same API calls as separate branches would.
    filter_args = ()
    if date_range:
        assert len(date_range) == 2, "Please provide both a start and end date for the date interval."
        start_date = parser.parse(date_range[0])
        end_date = parser.parse(date_range[1])
        filter_args = (mc.dates_as_query_clause(start_date, end_date),)
        print(f"Query: {query}, between {start_date.strftime('%m/%d/%Y')} and {end_date.strftime('%m/%d/%Y')}\n")
    else:
        print(f"Query: {query}\n")

    # Collect one record per source and build the DataFrame once at the end;
    # this avoids growing the frame row by row and lets pandas infer numeric dtypes.
    media_ids = []
    records = []
    for source_name, media_id in sources.items():
        if verbose:
            print(f"{source_name}:")

        # defining queries for topic and total stories
        # NOTE(review): Solr boolean operators are conventionally upper-case
        # ("AND"); the lowercase form is kept so results match earlier runs — confirm.
        api_query = f'"{query}" and media_id:{media_id}'
        total_query = f'media_id:{media_id}'

        relevant_stories = mc.storyCount(api_query, *filter_args)['count']
        total_stories = mc.storyCount(total_query, *filter_args)['count']

        # a source with no stories has no meaningful share; report NaN instead of dividing by zero
        attention = (relevant_stories / total_stories) * 100 if total_stories else np.nan

        media_ids.append(media_id)
        records.append({
            "Name": source_name,
            "Relevant Stories": relevant_stories,
            "Total Stories": total_stories,
            "Attention (%)": attention,
        })

        # printing story count and attention
        if verbose:
            print(f"{relevant_stories} stories about {query}, {total_stories} total")
            if not np.isnan(attention):
                print(f"{attention}% of stories are about {query}\n")
            else:
                print("")

    story_counts = pd.DataFrame(
        records,
        columns=["Name", "Relevant Stories", "Total Stories", "Attention (%)"],
        index=pd.Index(media_ids, name="MediaCloud ID"),
    )
    return story_counts

In [38]:
# Example run: per-source coverage of "9/11" between 8/1/2021 and 9/24/2021.
# The bare `df` on the last line renders the resulting table as the cell output.
df = search_sources("9/11", date_range=["8/1/2021", "9/24/2021"])
df

Query: 9/11, between 08/01/2021 and 09/24/2021



Unnamed: 0_level_0,Name,Relevant Stories,Total Stories,Attention (%)
MediaCloud ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,Washington Post,123,8443,1.456828
1,New York Times,212,8679,2.442678
4,USA Today,179,6240,2.86859
6,LA Times,90,3624,2.483444
40944,Bloomberg,4,231,1.731602
3,Christian Science Monitor,29,464,6.25
18,The Philadelphia Inquirer (PA),0,1,0.0
45,The Pittsburgh Post-Gazette (PA),32,2645,1.20983
662863,PennLive/Patriot-News (PA),0,0,
59,Hartford Courant (CT),0,316,0.0


In [None]:
# Scratch check: see how dateutil parses a slash-separated date string
# (the same parser search_sources applies to its date_range entries).
parser.parse("10/1/1998")

In [None]:
# Scratch check: inspect the date filter clause MediaCloud builds from two
# parsed dates (the same helper search_sources uses for its date_range).
test_date1 = parser.parse("8/1/2021")
test_date2 = parser.parse("9/23/2021")
mc.dates_as_query_clause(test_date1, test_date2)

In [7]:
# Scratch check (raises on purpose): confirms that float division by zero raises
# ZeroDivisionError, the exception search_sources catches when a source has no stories.
1./0

ZeroDivisionError: float division by zero