In [15]:
import concurrent.futures
import os
from datetime import datetime
# import unicodedata

import mediacloud.api
import pandas as pd
import requests
import requests_cache
from newspaper import Article
from retrying import retry
from tqdm import tqdm
from waybacknews.searchapi import SearchApiClient

In [16]:
# ID of the Media Cloud collection whose sources are searched below.
collection_id = "38379429"

# Credentials must never be committed with the notebook: read the Media Cloud
# API token from the environment and fail early with a clear message if it is
# missing. (Any token that was previously committed should be revoked.)
mediacloud_api_token = os.environ.get("MEDIACLOUD_API_TOKEN")
if not mediacloud_api_token:
    raise RuntimeError(
        "Set the MEDIACLOUD_API_TOKEN environment variable before running this notebook."
    )

# Client for the Media Cloud directory (used to list a collection's sources).
directory_api = mediacloud.api.DirectoryApi(auth_token=mediacloud_api_token)
# Client for the Wayback Machine news search, scoped to the "mediacloud" collection.
api = SearchApiClient("mediacloud")

In [17]:
# Function to split the domain list into chunks
def split_into_chunks(domains, chunk_size):
    """Lazily yield consecutive slices of ``domains``, each at most ``chunk_size`` long.

    The final slice is shorter when ``len(domains)`` is not a multiple of
    ``chunk_size``. An empty sequence yields nothing.
    """
    yield from (
        domains[begin:begin + chunk_size]
        for begin in range(0, len(domains), chunk_size)
    )


In [18]:
# Walk the collection's source list page by page, collecting each homepage URL.
limit = 100  # seems to have a 100 limit
offset = 0
all_domains = []
more_pages = True

while more_pages:
    # Request the page of sources that starts at the current offset
    sources_response = directory_api.source_list(
        collection_id=collection_id,
        limit=limit,
        offset=offset,
    )
    sources = sources_response.get('results', [])
    for source in sources:
        all_domains.append(source['homepage'])

    # Advance to the next page
    offset += limit

    # A page shorter than `limit` is treated as the last one
    more_pages = len(sources) == limit

# Reduce each homepage URL to a bare domain: drop the scheme / "www." text and
# any trailing slash. Missing or empty homepages are skipped.
cleaned_domains = []
for domain in all_domains:
    if not domain:
        continue
    for url_prefix in ('https://www.', 'http://www.', 'https://', 'http://'):
        domain = domain.replace(url_prefix, '')
    cleaned_domains.append(domain.rstrip('/'))

print(f"Number of sources: {len(cleaned_domains)}")

Number of sources: 8913


In [21]:
# Break the cleaned domain list into batches of 1000; one search query is
# issued per batch.
domain_chunks = list(split_into_chunks(cleaned_domains, 1000))

# Enable requests cache (filesystem-backed, entries expire after one hour)
requests_cache.install_cache('article_cache', backend='filesystem', expire_after=3600)


# Query parameters
query_term = '("police shooting" OR "shot by police" OR "police shot" OR "officer-involved shooting" OR "police-involved shooting" OR "police officer shooting" OR "police shot" OR "officer shot")'
# Alternative query terms that were tried:
# query_term = '("police shooting" OR "shot by police" OR "police shot" OR "officer-involved shooting" OR "police-involved shooting" OR "police officer shooting" OR "police shot" OR "officer shot" OR "deputy shot" OR "sheriff shot" OR "cop shot")'
# query_term = 'police AND shot'
start = datetime(2023, 9, 1) #11/6 - 11/15
end = datetime(2023, 10, 1)
language = "en"

# One DataFrame per domain batch that returned at least one article
results_list = []

for chunk in domain_chunks:
    domains_str = f"domain:({' OR '.join(chunk)})"
    query = f"{query_term} AND language:{language} AND {domains_str}"

    # Perform the search with the current chunk; results arrive page by page
    articles = []
    for list_of_articles in api.all_articles(query, start, end):
        articles.extend(list_of_articles)
        print(f"all_articles endpoint: {len(articles)} articles")

    if articles:
        results_list.append(pd.DataFrame(articles))


# Combine the per-batch frames, newest publication date first.
# pd.concat raises ValueError on an empty list, so the no-results case is
# handled explicitly and yields an empty frame instead of crashing.
if results_list:
    combined_results = (
        pd.concat(results_list, ignore_index=True)
        .sort_values(by='publication_date', ascending=False)
    )
else:
    combined_results = pd.DataFrame()
    print("No articles matched the query.")
print(combined_results.head())


all_articles endpoint: 622 articles
all_articles endpoint: 261 articles
all_articles endpoint: 67 articles
all_articles endpoint: 15 articles
all_articles endpoint: 34 articles
all_articles endpoint: 25 articles
all_articles endpoint: 14 articles
all_articles endpoint: 14 articles
all_articles endpoint: 2 articles
                                                 title publication_date  \
256  Antioch: Man wanted for homicide is shot by po...       2023-10-01   
112  Antioch: Man wanted for homicide is shot by po...       2023-10-01   
328  1 killed in police shooting at Pa. traffic sto...       2023-10-01   
417  St. Louis police shoot man who threatened rela...       2023-10-01   
173  Demonstrators call for justice after deadly po...       2023-10-01   

             capture_time language            domain  \
256  2023-10-03T11:47:16Z       en   mercurynews.com   
112  2023-10-03T13:10:17Z       en  eastbaytimes.com   
328  2023-10-13T01:29:15Z       en      pennlive.com   
417  2023

In [22]:
# Show the (rows, columns) dimensions of the combined search results.
combined_results.shape

(1054, 9)

In [23]:
def get_snippet_from_newspaper3k(url):
    """Return the article text extracted from ``url``.

    Builds an ``Article`` for the URL, downloads it, parses it and returns the
    resulting ``text`` attribute. Any exception raised by the download or
    parse step propagates to the caller.
    """
    page = Article(url)
    page.download()
    page.parse()
    body_text = page.text
    return body_text

def get_snippet_from_wayback_machine(url, timeout=30):
    """Fetch ``url`` and return the ``snippet`` field of its JSON response.

    Args:
        url: Address to request; the response body is expected to be JSON.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would stall the caller.

    Returns:
        The value stored under ``'snippet'`` in the JSON payload, or ``''``
        when that key is absent.

    Raises:
        requests.HTTPError: If the response has an error status
            (via ``raise_for_status``).
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json().get('snippet', '')

def fetch_snippet(article_url, wayback_url):
    """Race the two snippet sources and return the first usable result.

    ``get_snippet_from_newspaper3k(article_url)`` and
    ``get_snippet_from_wayback_machine(wayback_url)`` are started in parallel.
    The first one to finish with non-empty text wins. A source that raises is
    reported on stdout and skipped; a source that returns empty text is
    skipped so the other one still gets a chance.

    Args:
        article_url: URL passed to the newspaper3k extractor.
        wayback_url: URL passed to the Wayback Machine snippet fetcher.

    Returns:
        A ``(text, method)`` tuple where ``method`` is ``'newspaper3k'`` or
        ``'wayback'``. If neither source produced usable text the result is
        ``(None, None)``, so callers can always unpack two values.
    """
    # The executor is deliberately not used as a context manager: leaving a
    # `with` block waits for *all* submitted tasks, which would make every call
    # as slow as the slower source even after a winner is known.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=2)
    try:
        future_to_method = {
            executor.submit(get_snippet_from_newspaper3k, article_url): 'newspaper3k',
            executor.submit(get_snippet_from_wayback_machine, wayback_url): 'wayback',
        }
        for future in concurrent.futures.as_completed(future_to_method):
            method = future_to_method[future]
            try:
                data = future.result()
            except Exception as exc:
                print(f"{method} method failed with exception: {exc}")
                continue
            if data:
                return data, method
        return None, None
    finally:
        # Do not block on the losing task; it finishes (or fails) on its own
        # worker thread in the background and its result is discarded.
        executor.shutdown(wait=False)


def sanitize_snippet(snippet):
    """Flatten ``snippet`` onto a single line.

    Every newline becomes a space, carriage returns are removed, and leading
    and trailing whitespace is stripped. Unicode normalization is not applied.
    """
    # Code point 10 is '\n' (mapped to a space); 13 is '\r' (deleted).
    line_break_table = {10: ' ', 13: None}
    return snippet.translate(line_break_table).strip()

In [24]:
# Fill in a 'snippet' column by fetching text for every article row.
for index, article in tqdm(combined_results.iterrows(), total=combined_results.shape[0]):
    # 'url' holds the article's own address; 'article_url' is the address
    # handed to the Wayback snippet fetcher.
    article_url = article['url']
    wayback_url = article['article_url']

    # fetch_snippet can come back with nothing when both sources fail.
    # Normalize that to a pair so the unpack below never raises TypeError
    # and one bad article cannot abort the whole run.
    snippet, method_used = fetch_snippet(article_url, wayback_url) or (None, None)
    if not snippet:
        continue

    combined_results.loc[index, 'snippet'] = sanitize_snippet(snippet)
    print(f"Snippet fetched using {method_used} method for URL: {article_url}")

  0%|          | 1/1054 [00:09<2:38:28,  9.03s/it]

Snippet fetched using newspaper3k method for URL: https://www.mercurynews.com/2023/10/01/antioch-man-wanted-for-homicide-earlier-this-year-shot-by-police-during-pursuit/


  0%|          | 2/1054 [00:25<3:59:23, 13.65s/it]

Snippet fetched using newspaper3k method for URL: https://www.eastbaytimes.com/2023/10/01/antioch-man-wanted-for-homicide-earlier-this-year-shot-by-police-during-pursuit/


  0%|          | 3/1054 [00:26<2:16:54,  7.82s/it]

Snippet fetched using wayback method for URL: https://www.pennlive.com/news/2023/10/1-killed-in-police-shooting-at-pa-traffic-stop-reports.html


  0%|          | 4/1054 [00:27<1:27:53,  5.02s/it]

Snippet fetched using wayback method for URL: https://www.stltoday.com/news/local/crime-courts/st-louis-police-shoot-man-who-threatened-relatives-with-gun-shot-at-officers-police-say/article_15c7fc48-5fa7-11ee-96ab-77443b13065e.html


  0%|          | 5/1054 [00:28<1:04:57,  3.72s/it]

Snippet fetched using wayback method for URL: https://ktla.com/news/local-news/demonstrators-call-for-justice-after-deadly-police-shooting-in-south-l-a/


  1%|          | 6/1054 [00:29<44:22,  2.54s/it]  

Snippet fetched using newspaper3k method for URL: https://www.newsday.com/long-island/whitney-pond-park-manhasset-daniels-law-mszq6hcp


  1%|          | 7/1054 [00:41<1:41:24,  5.81s/it]

Snippet fetched using newspaper3k method for URL: https://www.campustimes.org/2023/10/01/penis/


  1%|          | 8/1054 [00:59<2:45:01,  9.47s/it]

Snippet fetched using newspaper3k method for URL: https://www.kron4.com/news/bay-area/homicide-suspect-shot-by-antioch-police-taken-to-hospital/


  1%|          | 9/1054 [00:59<1:57:05,  6.72s/it]

Snippet fetched using wayback method for URL: https://www.wbbjtv.com/2023/10/01/tbi-investigating-officer-involved-shooting-in-hamilton-co/


  1%|          | 10/1054 [01:00<1:24:20,  4.85s/it]

Snippet fetched using wayback method for URL: https://www.clickondetroit.com/news/local/2023/10/01/detroit-police-officer-struck-by-vehicle-shoots-injures-driver/


  1%|          | 11/1054 [01:01<1:02:04,  3.57s/it]

Snippet fetched using wayback method for URL: https://mynewsla.com/business/2023/10/01/another-laco-das-office-employee-sues-for-retaliation/
newspaper3k method failed with exception: Article `download()` failed with 404 Client Error: Not Found for url: https://foxwilmington.com/headlines/at-least-2-dead-2-injured-in-houston-area-family-dispute-report/ on URL https://foxwilmington.com/headlines/at-least-2-dead-2-injured-in-houston-area-family-dispute-report/


  1%|          | 12/1054 [01:07<1:16:48,  4.42s/it]

Snippet fetched using wayback method for URL: https://foxwilmington.com/headlines/at-least-2-dead-2-injured-in-houston-area-family-dispute-report/


  1%|          | 13/1054 [01:12<1:21:22,  4.69s/it]

Snippet fetched using newspaper3k method for URL: https://www.campustimes.org/2023/10/01/man-shot-by-rpd-officer-during-skirmish/


  1%|▏         | 14/1054 [01:15<1:10:13,  4.05s/it]

Snippet fetched using newspaper3k method for URL: https://ktvz.com/news/national-world/cnn-world/2023/09/30/one-year-on-from-stadium-disaster-that-killed-135-these-families-are-still-seeking-answers/


  2%|▏         | 16/1054 [01:42<2:15:46,  7.85s/it]

Snippet fetched using newspaper3k method for URL: https://republicmonews.com/2023/09/30/foot-pursuit-suspect-struck-by-officers-gunfire-in-northeast-houston-according-to-hpd/


  2%|▏         | 17/1054 [01:42<1:36:59,  5.61s/it]

Snippet fetched using wayback method for URL: https://ktar.com/story/5542142/police-fatally-shoot-suicidal-man-brandishing-knife-in-mesa/


  2%|▏         | 18/1054 [01:43<1:11:22,  4.13s/it]

Snippet fetched using wayback method for URL: https://localnews8.com/news/national-world/cnn-world/2023/09/30/one-year-on-from-stadium-disaster-that-killed-135-these-families-are-still-seeking-answers/


  2%|▏         | 19/1054 [01:44<53:08,  3.08s/it]  

Snippet fetched using wayback method for URL: https://abc17news.com/news/national-world/cnn-world/2023/09/30/one-year-on-from-stadium-disaster-that-killed-135-these-families-are-still-seeking-answers/
Snippet fetched using wayback method for URL: https://ktvz.com/news/national-world/cnn-world/2023/09/30/one-year-on-from-stadium-disaster-that-killed-135-these-families-are-still-seeking-answers/


  2%|▏         | 21/1054 [01:44<31:42,  1.84s/it]

Snippet fetched using wayback method for URL: https://kion546.com/news/national-world/cnn-national/2023/09/29/two-former-chicago-police-officers-acquitted-of-shooting-unarmed-man/


  2%|▏         | 22/1054 [01:45<26:43,  1.55s/it]

Snippet fetched using wayback method for URL: https://ktvz.com/news/national-world/cnn-national/2023/09/29/two-former-chicago-police-officers-acquitted-of-shooting-unarmed-man/


  2%|▏         | 23/1054 [01:48<33:51,  1.97s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/electrical-mishap-sparks-small-grass-fire-knocks-out-power-along-college-drive/


  2%|▏         | 24/1054 [01:51<39:01,  2.27s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/ebrso-looking-for-missing-woman-with-dementia-161514/


  2%|▏         | 25/1054 [02:08<1:46:49,  6.23s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/little-brother-beating-big-brother-to-the-field-for-lsu-football/


  2%|▏         | 26/1054 [02:40<3:52:28, 13.57s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/louisiana-s-struggle-with-influx-of-salt-water-prompts-a-request-for-biden-to-declare-an-emergency/


  3%|▎         | 27/1054 [03:10<5:17:17, 18.54s/it]

Snippet fetched using newspaper3k method for URL: https://cbs4indy.com/news/man-dead-following-officer-involved-shooting-in-bedford/


  3%|▎         | 28/1054 [03:12<3:53:09, 13.64s/it]

Snippet fetched using wayback method for URL: https://cbs4indy.com/news/indycrime/2-impd-officers-indicted-after-shooting-man-who-was-sleeping-in-grandmas-driveway/


  3%|▎         | 29/1054 [03:13<2:47:30,  9.81s/it]

Snippet fetched using wayback method for URL: https://www.wbrz.com/news/north-korea-says-it-will-expel-the-us-soldier-who-crossed-into-the-country-in-july/


  3%|▎         | 30/1054 [03:13<2:01:12,  7.10s/it]

Snippet fetched using wayback method for URL: https://abc17news.com/news/national-world/cnn-national/2023/09/29/two-former-chicago-police-officers-acquitted-of-shooting-unarmed-man/


  3%|▎         | 31/1054 [03:14<1:28:16,  5.18s/it]

Snippet fetched using wayback method for URL: https://www.wbrz.com/news/michael-gambon-actor-who-played-prof-dumbledore-in-6-harry-potter-movies-dies-at-age-82/


  3%|▎         | 32/1054 [03:15<1:07:24,  3.96s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/team-2-traffic-i-10-wb-shut-down-before-whiskey-bay-due-to-vehicle-fire/


  3%|▎         | 33/1054 [03:16<50:06,  2.94s/it]  

Snippet fetched using wayback method for URL: https://indianapolisrecorder.com/two-officers-indicted-for-roles-in-shooting-of-man-sleeping-in-grandmothers-driveway/


  3%|▎         | 34/1054 [03:17<42:33,  2.50s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/state-will-allow-individual-parishes-to-opt-out-of-statewide-burn-ban/


  3%|▎         | 35/1054 [03:18<35:19,  2.08s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/what-would-a-government-shutdown-mean-for-me-snap-student-loans-and-travel-impacts-explained/


  3%|▎         | 36/1054 [03:19<28:01,  1.65s/it]

Snippet fetched using wayback method for URL: http://www.westsideconnect.com/news/local_news/deputy-shoots-carjacking-suspect/article_3b622d28-5f36-11ee-bedb-3f4a48282005.html


  4%|▎         | 37/1054 [03:21<28:19,  1.67s/it]

Snippet fetched using newspaper3k method for URL: https://www.wfaa.com/article/news/crime/dealership-customers-thursday-night-shooting-arlington/287-fb72bad4-8913-499f-a926-844121416e7d


  4%|▎         | 38/1054 [03:22<24:30,  1.45s/it]

Snippet fetched using newspaper3k method for URL: https://www.eastbaytimes.com/2023/09/29/martinez-police-release-body-worn-camera-footage-of-fatal-aug-18-shooting/


  4%|▎         | 39/1054 [03:24<28:31,  1.69s/it]

Snippet fetched using newspaper3k method for URL: https://www.eastbaytimes.com/2023/09/29/fairfield-police-officer-shot-man-brandishing-bb-gun-near-school/


  4%|▍         | 40/1054 [03:28<40:14,  2.38s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/pig-carcasses-scattered-along-mississippi-river-bridge-after-animals-fell-out-of-truck-hit-by-drivers/


  4%|▍         | 41/1054 [03:29<32:49,  1.94s/it]

Snippet fetched using newspaper3k method for URL: https://www.wfaa.com/article/news/local/dog-bites-fort-worth-cop-shot-kentwood-place/287-0d5e0323-c6b3-4232-a595-e0600186056b


  4%|▍         | 42/1054 [03:30<26:51,  1.59s/it]

Snippet fetched using newspaper3k method for URL: https://www.wfaa.com/article/news/world/mexican-migrants-killed-tecate-baja-california-mexico/269-82c920a9-0534-4aa8-9ae3-15d6db241fe1


  4%|▍         | 43/1054 [03:30<21:15,  1.26s/it]

Snippet fetched using wayback method for URL: https://whdh.com/news/authorities-investigating-officer-involved-shooting-in-lakeville/


  4%|▍         | 44/1054 [03:31<19:03,  1.13s/it]

Snippet fetched using wayback method for URL: https://www.whec.com/top-news/several-city-council-members-object-to-3-members-statement-on-shooting-involving-rpd-officer/


  4%|▍         | 45/1054 [03:32<21:25,  1.27s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/one-week-after-his-son-s-arrest-high-ranking-brpd-officer-placed-on-leave-as-feds-investigate-department/


  4%|▍         | 46/1054 [03:35<26:32,  1.58s/it]

Snippet fetched using wayback method for URL: https://www.wishtv.com/news/crime-watch-8/impd-ids-3-officers-in-shooting-that-killed-man-hurt-woman-outside-lucas-oil-stadium/


  4%|▍         | 47/1054 [03:37<29:08,  1.74s/it]

Snippet fetched using wayback method for URL: https://www.wishtv.com/news/local-news/man-dies-following-bedford-police-shooting/


  5%|▍         | 48/1054 [03:39<32:42,  1.95s/it]

Snippet fetched using wayback method for URL: https://www.wpri.com/news/local-news/se-mass/lakeville-police-investigating-officer-involved-shooting/
Snippet fetched using wayback method for URL: https://ktvz.com/news/national-world/cnn-national/2023/09/29/two-former-chicago-police-officers-acquitted-of-shooting-unarmed-man/


  5%|▍         | 50/1054 [03:49<54:44,  3.27s/it]

Snippet fetched using newspaper3k method for URL: https://www.wtae.com/article/pittsburgh-shooting-justified-da-zappala/45375990


  5%|▍         | 51/1054 [04:23<3:04:22, 11.03s/it]

Snippet fetched using newspaper3k method for URL: https://www.wbrz.com/news/a-new-tropical-storm-forms-east-of-philippe-in-the-central-atlantic/


  5%|▍         | 52/1054 [04:24<2:18:26,  8.29s/it]

Snippet fetched using wayback method for URL: https://www.wbrz.com/news/disheartened-and-dismayed-baton-rouge-union-of-police-blasts-brpd-for-placing-deputy-chief-on-leave/


  5%|▌         | 53/1054 [04:25<1:43:29,  6.20s/it]

Snippet fetched using wayback method for URL: https://localnews8.com/news/national-world/cnn-national/2023/09/29/two-former-chicago-police-officers-acquitted-of-shooting-unarmed-man/


  5%|▌         | 54/1054 [04:27<1:24:18,  5.06s/it]

Snippet fetched using wayback method for URL: https://www.wearegreenbay.com/news/local-news/officer-involved-in-kaukauna-shooting-identified-investigation-continues/


  5%|▌         | 55/1054 [04:28<1:05:47,  3.95s/it]

Snippet fetched using wayback method for URL: https://www.abc6.com/man-shot-during-confrontation-with-police-in-lakeville/


  5%|▌         | 56/1054 [04:29<51:43,  3.11s/it]  

Snippet fetched using wayback method for URL: https://www.abc6.com/one-injured-during-early-morning-officer-involved-shooting-in-lakeville/


  5%|▌         | 57/1054 [04:31<45:12,  2.72s/it]

Snippet fetched using wayback method for URL: https://www.startribune.com/ruling-likely-puts-pressure-on-minneapolis-to-settle-lawsuit-by-journalists-observers-say/600308686/


  6%|▌         | 58/1054 [04:33<44:20,  2.67s/it]

Snippet fetched using wayback method for URL: https://www.baltimoresun.com/maryland/baltimore-city/bs-md-ci-west-lafayette-avenue-shooting-20230929-uf2osclhubb6phhr4khdlu3pq4-story.html
newspaper3k method failed with exception: Article `download()` failed with 410 Client Error: Gone for url: https://www.sfgate.com/news/world/article/2-mexican-migrants-shot-dead-3-injured-in-dawn-18397735.php on URL https://www.sfgate.com/news/world/article/2-mexican-migrants-shot-dead-3-injured-in-dawn-18397735.php


  6%|▌         | 59/1054 [04:34<34:44,  2.09s/it]

Snippet fetched using wayback method for URL: https://www.sfgate.com/news/world/article/2-mexican-migrants-shot-dead-3-injured-in-dawn-18397735.php


  6%|▌         | 60/1054 [04:34<26:01,  1.57s/it]

Snippet fetched using wayback method for URL: https://www.seattlepi.com/news/world/article/2-mexican-migrants-shot-dead-3-injured-in-dawn-18397735.php


  6%|▌         | 61/1054 [04:36<26:25,  1.60s/it]

Snippet fetched using wayback method for URL: https://martinezgazette.com/update-on-martinez-officer-involved-shooting/


  6%|▌         | 62/1054 [04:38<27:04,  1.64s/it]

Snippet fetched using newspaper3k method for URL: https://www.local10.com/news/local/2023/09/29/officers-shoot-armed-man-in-northwest-miami-dade-police-say/


  6%|▌         | 63/1054 [04:38<22:14,  1.35s/it]

Snippet fetched using wayback method for URL: https://www.sandiegouniontribune.com/news/nation-world/story/2023-09-29/2-mexican-migrants-shot-dead-3-injured-in-dawn-attack-on-us-border-near-tecate-mexico


  6%|▌         | 64/1054 [04:39<16:56,  1.03s/it]

Snippet fetched using newspaper3k method for URL: https://nypost.com/2023/09/29/influencer-meatball-regrets-part-in-riots-family-ashamed/


  6%|▌         | 64/1054 [04:40<1:12:20,  4.38s/it]


KeyboardInterrupt: 

In [None]:
# Write the combined results to disk. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists first.
output_dir = './data_storage'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): `start` is a datetime, so the file name contains its full string
# form (date plus time, with a space and colons), which is not a valid file name
# on every platform. The name is left unchanged here so existing consumers keep
# working; consider a date-only name if that is acceptable.
output_path = f'{output_dir}/{start}.csv'
combined_results.to_csv(output_path, index=False)
print(f"Data retrieval complete. Results saved to '{output_path}'.")