In [2]:
# importing dependencies for MediaCloud API
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import mediacloud.api
from dateutil import parser
import time

# loading config from .env file
load_dotenv()

True

In [3]:
# reading API key from .env
mc_api_key = os.getenv("MC_API_KEY")

# instantiating MediaCloud API
mc = mediacloud.api.MediaCloud(mc_api_key)
mediacloud.__version__

'3.12.4'

In [None]:
# verifying API connection by printing some high-level stats
mc.stats()

In [7]:
# building a function to search a string among the sources given
def search_sources(query, date_range=None, api_key=None, verbose=False):
    # ensuring the query is a string
    query = str(query)
    
    # defining API key, instantiating MediaCloud API
    # if no API key is passed, function assumes API key is defined in .env file as MC_API_KEY
    if not api_key:
        api_key = os.getenv("MC_API_KEY")
    mc = mediacloud.api.MediaCloud(api_key)
    
    # formatting date range for API query
    if date_range:
        from datetime import datetime
        
        assert len(date_range) == 2, "Please provide both a start and end date for the date interval."
        start_date = parser.parse(date_range[0])
        end_date = parser.parse(date_range[1])
        api_date_range = mc.dates_as_query_clause(start_date, end_date)
        print(f"Query: {query}, between {start_date.strftime('%m/%d/%Y')} and {end_date.strftime('%m/%d/%Y')}\n")
    else:
        print(f"Query: {query}\n")
    
    # initializing dataframe to store the query data
    story_counts = pd.DataFrame(columns=["Name", "Relevant Stories", "Total Stories", "Attention (%)"])
    story_counts.index.name = "MediaCloud ID"
    
    # going through each source and querying relevant and total stories
    for source_name in sources:
        if verbose:
            print(f"{source_name}:")
        
        # defining queries for topic and total stories
        api_query = f'{query} and media_id:{sources[source_name]}'
        total_query = f'media_id:{sources[source_name]}'
        
        # including date range if passed into function
        if date_range:
            relevant_stories = mc.storyCount(api_query, api_date_range)['count']
            total_stories = mc.storyCount(total_query, api_date_range)['count']

        else:
            relevant_stories = mc.storyCount(api_query)['count']
            total_stories = mc.storyCount(total_query)['count']
        
        # appending data to dataframe
        try:
            attention = (relevant_stories / total_stories) * 100
        except ZeroDivisionError:
            attention = np.nan
        story_counts.loc[sources[source_name]] = [source_name, relevant_stories, total_stories, attention]
        
        # printing story count and attention
        if verbose:
            print(f"{relevant_stories} stories about {query}, {total_stories} total")
            if not np.isnan(attention):
                print(f"{attention}% of stories are about {query}\n")
            else:
                print("")
    
    return story_counts

In [4]:
# defining sources and ID's in MediaCloud API. Commented sources cannot be found in MediaCloud
sources = {
    "Washington Post": 2,
    "New York Times": 1,
    "USA Today": 4,
    "LA Times": 6,
    "Bloomberg": 40944,
    "Christian Science Monitor": 3,
    "New York Daily News": 8,
    "Star Ledger (NJ)": 16,
    "The Philadelphia Inquirer (PA)": 18,
    "The Pittsburgh Post-Gazette (PA)": 45,
    "PennLive/Patriot-News (PA)": 662863,
    "The Day (CT)": 76581,
    "Hartford Courant (CT)": 59,
    "Baltimore Sun (MD)": 34,
    "The Capital Gazette (MD)": 70310,
    "The Virginian-Pilot (VA)": 55,
    "The Richmond Times-Dispatch (VA)": 53,
    "Boston Globe (MA)": 15,
    "Portland Press Herald (ME)": 366984,
    "Houston Chronicle (TX)": 10,
    "Austin American Statesman (TX)": 62,
    "Atlanta Journal Constitution (GA)": 22916,
    "Raleigh News and Observer (NC)": 58,
    "The Cleveland Plain Dealer (OH)": 662541
}

In [5]:
# defining search strings
sep11_search = '"9/11" OR "September 11th"'

In [14]:
# calling function
start_time = time.time()
df = search_sources(sep11_search, date_range=["8/1/2021", "9/24/2021"])
end_time = time.time()
print(f"Time: {end_time - start_time} s")
df.sort_values(by="Attention (%)", ascending=False)

Query: "9/11" OR "September 11th", between 08/01/2021 and 09/24/2021

Time: 43.101094007492065 s


Unnamed: 0_level_0,Name,Relevant Stories,Total Stories,Attention (%)
MediaCloud ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
76581,The Day (CT),5,2245,0.222717
4,USA Today,8,6313,0.126723
15,Boston Globe (MA),3,2407,0.124636
6,LA Times,4,3661,0.10926
10,Houston Chronicle (TX),1,1682,0.059453
34,Baltimore Sun (MD),1,3169,0.031556
8,New York Daily News,1,4073,0.024552
2,Washington Post,2,8544,0.023408
45,The Pittsburgh Post-Gazette (PA),0,2660,0.0
1,New York Times,0,8774,0.0


In [9]:
# testing outputs/formats of dates
test_date1 = parser.parse("8/1/2021")
test_date2 = parser.parse("9/23/2021")
test_range = mc.dates_as_query_clause(test_date1, test_date2)

In [None]:
# sampling 20 stories from a source containing the search string
query = f'"9 11" and media_id:2'
stories = mc.storyList(query, test_range, sort=mc.SORT_RANDOM)
pd.DataFrame(stories)

In [None]:
# retrieving word counts from a source containing the search string
query = f'"9 11" and media_id:2'
word_count = mc.wordCount(query, test_range)
# pd.DataFrame(word_count)

# building term/document matrix
story_count = mc.storyCount(query, test_range)["count"]
doc_term_matrix = mc.storyWordMatrix(query, test_range, rows=story_count)
doc_term_matrix["word_matrix"]

In [13]:
query = '"9/11" OR "September 11th"'
date_range = test_range
api_query = f'{query} and media_id:2'
print(f"Query: {api_query}")
relevant_stories = mc.storyCount(api_query, date_range)['count']
print(f"Relevant: {relevant_stories}\n")

Query: "9/11" OR "September 11th" and media_id:2
Relevant: 2



In [72]:
?mc.storyCount

In [91]:
query1 = '"September 11th"'
query2 = '"9/11"'
api_query1 = f'{query1} and media_id:2'
api_query2 = f'{query2} and media_id:2'
print(api_query1)
print(api_query2)
relevant_stories1 = mc.storyCount(api_query1, date_range)['count']
relevant_stories2 = mc.storyCount(api_query2, date_range)['count']
relevant_stories_or = mc.storyCount(f"{api_query1} OR {api_query2}", date_range)['count']

print(f"Relevant1: {relevant_stories1}")
print(f"Relevant2: {relevant_stories2}")
print(f"Or: {relevant_stories_or}")

"September 11th" and media_id:2
"9/11" and media_id:2
Relevant1: 2
Relevant2: 123
Or: 2
