In [1]:
import gdelt
import pandas as pd
import requests
import time
import newspaper

from gdeltdoc import GdeltDoc, Filters
from concurrent.futures import ThreadPoolExecutor, as_completed
from newspaper import Article
from datetime import datetime, timedelta

here


In [4]:
def fetch_gdelt_articles_by_week(start_date, end_date, keywords, domains, theme="ECON_STOCKMARKET"):
    """
    Fetch GDELT articles by week and combine into a single DataFrame.
    
    Parameters:
    - start_date: str, format "YYYY-MM-DD"
    - end_date: str, format "YYYY-MM-DD"
    - keywords: list of keywords to search
    - domains: list of domain names
    - theme: GDELT theme (default: "ECON_STOCKMARKET")
    
    Returns:
    - DataFrame with all articles
    """
    # Convert string dates to datetime objects
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    
    all_articles = []
    current_start = start
    week_num = 1
    
    while current_start < end:
        current_end = min(current_start + timedelta(days=7), end)
        
        # Format dates for GDELT
        start_str = current_start.strftime("%Y-%m-%d")
        end_str = current_end.strftime("%Y-%m-%d")
        
        try:
            f = Filters(
                start_date=start_str,
                end_date=end_str,
                keyword=keywords,
                domain=domains,
                country=["UK", "US"],
                theme=theme
            )
            
            gd = GdeltDoc()
            articles = gd.article_search(f)
            
            if articles is not None and len(articles) > 0:       
                all_articles.append(articles)
            time.sleep(5)
            
        except Exception as e:
            print(f"✗ Error fetching week {week_num}: {e}")
        
        current_start = current_end
        week_num += 1
    
    # Combine all DataFrames
    if all_articles:
        combined_df = pd.concat(all_articles, ignore_index=True)
        
        # record only english articles & select relevant columns
        combined_df = combined_df[combined_df["language"]=='English']
        combined_df = combined_df.loc[:, ['url', 'title', 'seendate', 'domain']]
        
        # Remove duplicates based on title and date (ignoring time)
        combined_df['seendate_date'] = pd.to_datetime(combined_df['seendate']).dt.date
        duplicates_before = combined_df.duplicated(subset=['title', 'seendate_date'], keep=False).sum()
        combined_df = combined_df.drop_duplicates(subset=['title', 'seendate_date'], keep='first')
        combined_df = combined_df.drop(columns=['seendate_date'])
        
        print(f"\n{'='*60}")
        print(f"✓ Total articles fetched: {len(combined_df) + (duplicates_before // 2 if duplicates_before > 0 else 0)}")
        print(f"✓ Duplicates removed: {duplicates_before // 2 if duplicates_before > 0 else 0}")
        print(f"✓ Unique articles: {len(combined_df)}")
        print(f"✓ Date range: {start_date} to {end_date}")
        print(f"✓ Number of weeks processed: {week_num - 1}")
        print(f"{'='*60}")
        return combined_df
    else:
        print("No articles found in the specified date range")
        return pd.DataFrame()

In [5]:
def fetch_gdelt_articles_multi_year(start_year, end_year, keywords, domains, theme="ECON_STOCKMARKET"):
    """
    Fetch GDELT articles for multiple years by week.
    
    Parameters:
    - start_year: int, starting year (e.g., 2017)
    - end_year: int, ending year (e.g., 2024)
    - keywords: list of keywords to search
    - domains: list of domain names
    - theme: GDELT theme (default: "ECON_STOCKMARKET")
    
    Returns:
    - DataFrame with all articles
    """
    all_years_articles = []
    
    for year in range(start_year, end_year + 1):
        start_date = f"{year}-01-01"
        end_date = f"{year}-12-31"
        
        print(f"\n{'#'*60}")
        print(f"# FETCHING ARTICLES FOR YEAR {year}")
        print(f"{'#'*60}\n")
        
        # Fetch articles for this year
        year_articles = fetch_gdelt_articles_by_week(
            start_date=start_date,
            end_date=end_date,
            keywords=keywords,
            domains=domains,
            theme=theme
        )
        
        if not year_articles.empty:
            year_articles['year'] = year
            all_years_articles.append(year_articles)
    
    # Combine all years
    if all_years_articles:
        combined_df = pd.concat(all_years_articles, ignore_index=True)
        
        print(f"\n{'='*60}")
        print(f"SUMMARY - ALL YEARS ({start_year}-{end_year})")
        print(f"{'='*60}")
        print(f"✓ Total articles fetched: {len(combined_df)}")
        
        # Show breakdown by year
        print(f"\nArticles by year:")
        year_counts = combined_df['year'].value_counts().sort_index()
        for year, count in year_counts.items():
            print(f"  {year}: {count} articles")
        
        # Save combined results
        combined_df.to_csv(f'gdelt_articles_{start_year}_{end_year}_all.csv', index=False)
        print(f"\n✓ Saved all articles to 'gdelt_articles_{start_year}_{end_year}_all.csv'")

        return combined_df
    else:
        print("No articles found in the specified date range")
        return pd.DataFrame()

In [22]:
def fetch_article_text2(url):
    """
    Fetch article text using newspaper4k library.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        return url, article.text
    except Exception as e:
        return url, None

def fetch_articles_parallel(df, max_workers=10, delay_between_batches=1):
    """
    Fetch article texts in parallel using ThreadPoolExecutor.
    
    Parameters:
    - df: DataFrame with 'url' column
    - max_workers: number of parallel threads (default: 10)
    - delay_between_batches: seconds to wait between batches
    
    Returns:
    - DataFrame with 'body' column added
    """
    df = df.copy()
    df['body'] = None
    
    urls = df['url'].tolist()
    total_urls = len(urls)
    
    print(f"Fetching {total_urls} articles using {max_workers} parallel workers...")
    
    # Dictionary to store results
    url_to_text = {}
    
    # Use ThreadPoolExecutor for parallel fetching
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(fetch_article_text2, url): url for url in urls}
        
        # Process completed tasks
        completed = 0
        for future in as_completed(future_to_url):
            url, text = future.result()
            url_to_text[url] = text
            completed += 1
            
            if completed % 1000 == 0:
                print(f"Progress: {completed}/{total_urls} articles fetched ({completed/total_urls*100:.1f}%)")
            
            # Small delay to be respectful to servers
            if completed % 50 == 0:
                time.sleep(delay_between_batches)
    
    # Map results back to DataFrame
    df['body'] = df['url'].map(url_to_text)
    
    # Summary
    successful = df['body'].notna().sum()
    failed = df['body'].isna().sum()

    print(f"\n{'='*60}")
    print(f"✓ Successfully fetched: {successful} articles ({successful/total_urls*100:.1f}%)")
    print(f"✗ Failed to fetch: {failed} articles ({failed/total_urls*100:.1f}%)")
    print(f"{'='*60}")
    
    return df

In [None]:
# Fetch all articles from 2017 to 2023
# - Run seperately for 2017-2020, 2021, 2022-2023, 2024 due to large data size

In [7]:
# Fetch all articles from 2017 to 2020
articles_2017_2020 = fetch_gdelt_articles_multi_year(
    start_year=2017,
    end_year=2020,
    keywords=["stock", "market"],
    domains=["nytimes.com", "cnn.com", "wsj.com", "bloomberg.com", "cnbc.com", "marketwatch.com"]
)

# Display summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Total articles: {len(articles_2017_2020)}")
print(f"Date range: {articles_2017_2020['seendate'].min()} to {articles_2017_2020['seendate'].max()}")
print(f"Unique domains: {articles_2017_2020['domain'].nunique()}")


############################################################
# FETCHING ARTICLES FOR YEAR 2017
############################################################


✓ Total articles fetched: 12879
✓ Duplicates removed: 329
✓ Unique articles: 12550
✓ Date range: 2017-01-01 to 2017-12-31
✓ Number of weeks processed: 52

############################################################
# FETCHING ARTICLES FOR YEAR 2018
############################################################


✓ Total articles fetched: 12890
✓ Duplicates removed: 163
✓ Unique articles: 12727
✓ Date range: 2018-01-01 to 2018-12-31
✓ Number of weeks processed: 52

############################################################
# FETCHING ARTICLES FOR YEAR 2019
############################################################


✓ Total articles fetched: 12964
✓ Duplicates removed: 144
✓ Unique articles: 12820
✓ Date range: 2019-01-01 to 2019-12-31
✓ Number of weeks processed: 52

############################################################

In [12]:
# Fetch for 2021
articles_2021 = fetch_gdelt_articles_by_week(
    start_date="2021-01-01",
    end_date="2021-12-31",
    keywords=["stock", "market"],
    domains=["nytimes.com", "cnn.com", "wsj.com", "bloomberg.com", "cnbc.com", "marketwatch.com"]
)

✗ Error fetching week 16: 

✓ Total articles fetched: 12721
✓ Duplicates removed: 232
✓ Unique articles: 12489
✓ Date range: 2021-01-01 to 2021-12-31
✓ Number of weeks processed: 52


In [17]:
articles_2021['year'] = 2021
articles_2021.to_csv('gdelt_articles_2021_all.csv', index=False)

In [None]:
# Fetch all articles from 2022 to 2023
articles_2022_2023 = fetch_gdelt_articles_multi_year(
    start_year=2022,
    end_year=2023,
    keywords=["stock", "market"],
    domains=["nytimes.com", "cnn.com", "wsj.com", "bloomberg.com", "cnbc.com", "marketwatch.com"]
)
# Display summary
print("\n" + "="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Total articles: {len(articles_2022_2023)}")
print(f"Date range: {articles_2022_2023['seendate'].min()} to {articles_2022_2023['seendate'].max()}")
print(f"Unique domains: {articles_2022_2023['domain'].nunique()}")


############################################################
# FETCHING ARTICLES FOR YEAR 2022
############################################################


✓ Total articles fetched: 12949
✓ Duplicates removed: 347
✓ Unique articles: 12602
✓ Date range: 2022-01-01 to 2022-12-31
✓ Number of weeks processed: 52

############################################################
# FETCHING ARTICLES FOR YEAR 2023
############################################################


✓ Total articles fetched: 12823
✓ Duplicates removed: 291
✓ Unique articles: 12532
✓ Date range: 2023-01-01 to 2023-12-31
✓ Number of weeks processed: 52

SUMMARY - ALL YEARS (2022-2023)
✓ Total articles fetched: 25134

Articles by year:
  2022: 12602 articles
  2023: 12532 articles

✓ Saved all articles to 'gdelt_articles_2022_2023_all.csv'

FINAL SUMMARY
Total articles: 25134
Date range: 20220101T003000Z to 20231230T161500Z
Unique domains: 12


In [None]:
# Extract article bodies in parallel by data range

In [None]:
articles_2017_2020_body = fetch_articles_parallel(articles_2017_2020.copy(), max_workers=10)
articles_2021_body = fetch_articles_parallel(articles_2021.copy(), max_workers=10)
articles_2022_2023_body = fetch_articles_parallel(articles_2022_2023.copy(), max_workers=10)

Fetching 50460 articles using 10 parallel workers...
Progress: 1000/50460 articles fetched (2.0%)
Progress: 2000/50460 articles fetched (4.0%)
Progress: 3000/50460 articles fetched (5.9%)
Progress: 4000/50460 articles fetched (7.9%)
Progress: 5000/50460 articles fetched (9.9%)
Progress: 6000/50460 articles fetched (11.9%)
Progress: 7000/50460 articles fetched (13.9%)
Progress: 8000/50460 articles fetched (15.9%)
Progress: 9000/50460 articles fetched (17.8%)
Progress: 10000/50460 articles fetched (19.8%)
Progress: 11000/50460 articles fetched (21.8%)
Progress: 12000/50460 articles fetched (23.8%)
Progress: 13000/50460 articles fetched (25.8%)
Progress: 14000/50460 articles fetched (27.7%)
Progress: 15000/50460 articles fetched (29.7%)
Progress: 16000/50460 articles fetched (31.7%)
Progress: 17000/50460 articles fetched (33.7%)
Progress: 18000/50460 articles fetched (35.7%)
Progress: 19000/50460 articles fetched (37.7%)
Progress: 20000/50460 articles fetched (39.6%)
Progress: 21000/50460

In [None]:
#------------24 Example------------#
# - run 24 as example first, stored as articles_2024.csv

In [None]:
articles_2024 = fetch_gdelt_articles_by_week(
    start_date="2024-01-01",
    end_date="2024-12-31",
    keywords=["stock", "market"],
    domains=["nytimes.com", "cnn.com", "wsj.com", "bloomberg.com", "cnbc.com", "marketwatch.com"]
)

In [75]:
articles_2024 = articles_2024[articles_2024["language"]=='English'].copy()

In [98]:
# Use the parallel function
articles_2024_2 = fetch_articles_parallel(articles_2024.copy(), max_workers=10)

Fetching 6513 articles using 10 parallel workers...
Progress: 10/6513 articles fetched (0.2%)
Progress: 20/6513 articles fetched (0.3%)
Progress: 30/6513 articles fetched (0.5%)
Progress: 40/6513 articles fetched (0.6%)
Progress: 50/6513 articles fetched (0.8%)
Progress: 60/6513 articles fetched (0.9%)
Progress: 70/6513 articles fetched (1.1%)
Progress: 80/6513 articles fetched (1.2%)
Progress: 90/6513 articles fetched (1.4%)
Progress: 100/6513 articles fetched (1.5%)
Progress: 110/6513 articles fetched (1.7%)
Progress: 120/6513 articles fetched (1.8%)
Progress: 130/6513 articles fetched (2.0%)
Progress: 140/6513 articles fetched (2.1%)
Progress: 150/6513 articles fetched (2.3%)
Progress: 160/6513 articles fetched (2.5%)
Progress: 170/6513 articles fetched (2.6%)
Progress: 180/6513 articles fetched (2.8%)
Progress: 190/6513 articles fetched (2.9%)
Progress: 200/6513 articles fetched (3.1%)
Progress: 210/6513 articles fetched (3.2%)
Progress: 220/6513 articles fetched (3.4%)
Progress: 2

In [110]:
articles_2024_2 = articles_2024_2.loc[:, ['url', 'title', 'seendate', 'domain', 'text2']]
articles_2024_2.head(5)

Unnamed: 0,url,title,seendate,domain,text2
0,https://www.marketwatch.com/story/jpmorgan-cha...,JPMorgan Chase stock flirts with all - time high,20240102T211500Z,marketwatch.com,
1,https://www.cnbc.com/2024/01/04/top-stocks-to-...,Top stocks to watch on Wall Street Thursday,20240104T161500Z,cnbc.com,Here are the biggest calls on Wall Street on T...
2,https://www.marketwatch.com/story/rocket-mortg...,Rocket Mortgage parent stock rocked by analyst...,20240103T194500Z,marketwatch.com,
3,https://www.marketwatch.com/story/sofis-stock-...,SoFi stock could follow record year with a sha...,20240103T160000Z,marketwatch.com,
4,https://www.wsj.com/buyside/personal-finance/p...,What Is Portfolio Diversification ?,20240101T210000Z,wsj.com,


In [124]:
articles_2024_2['pubdate'] = pd.to_datetime(articles_2024_2['seendate']).dt.date

# Find duplicates based on title and date
duplicates = articles_2024_2.duplicated(subset=['title', 'pubdate'], keep=False)

# Count duplicates
num_duplicates = duplicates.sum()
num_unique = (~duplicates).sum()
total = len(articles_2024_2)

print(f"{'='*60}")
print(f"DUPLICATE DETECTION RESULTS")
print(f"{'='*60}")
print(f"Total articles: {total}")
print(f"Duplicate articles: {num_duplicates} ({num_duplicates/total*100:.2f}%)")
print(f"Unique articles: {num_unique} ({num_unique/total*100:.2f}%)")
print(f"{'='*60}")

# Show some examples of duplicates
if num_duplicates > 0:
    print("\nExamples of duplicate articles:")
    duplicate_groups = articles_2024_2[duplicates].groupby(['title', 'pubdate'])

    for i, ((title, date), group) in enumerate(duplicate_groups):
        if i >= 5:  # Show first 5 duplicate groups
            break
        print(f"\n{i+1}. Title: {title[:80]}...")
        print(f"   Date: {date}")
        print(f"   Number of duplicates: {len(group)}")
        print(f"   URLs:")
        for url in group['url'].values:
            print(f"     - {url}")

# Remove duplicates (keep first occurrence)
articles_2024_2_unique = articles_2024_2.drop_duplicates(subset=['title', 'pubdate'], keep='first')

print(f"\n{'='*60}")
print(f"After removing duplicates: {len(articles_2024_2_unique)} unique articles")
print(f"Removed: {len(articles_2024_2) - len(articles_2024_2_unique)} duplicate articles")
print(f"{'='*60}")

# Drop the temporary column
articles_2024_2_unique = articles_2024_2_unique.drop(columns=['pubdate'])

DUPLICATE DETECTION RESULTS
Total articles: 6513
Duplicate articles: 773 (11.87%)
Unique articles: 5740 (88.13%)

Examples of duplicate articles:

1. Title:   Could it have been avoided ? Local cops detail breakdown in efforts to stop Tr...
   Date: 2024-08-10
   Number of duplicates: 2
   URLs:
     - https://www.cnn.com/2024/08/10/politics/snipers-detail-breakdowns-trump-assassination-attempt-invs/
     - https://edition.cnn.com/2024/08/10/politics/snipers-detail-breakdowns-trump-assassination-attempt-invs/

2. Title:   Enlist or die : Fear , looming famine and a deadly ultimatum swell the ranks o...
   Date: 2024-03-19
   Number of duplicates: 3
   URLs:
     - https://edition.cnn.com/2024/03/18/africa/sudan-hunger-forcible-recruitment-jazira-intl-cmd/
     - https://www.cnn.com/2024/03/18/africa/sudan-hunger-forcible-recruitment-jazira-intl-cmd/
     - https://us.cnn.com/2024/03/18/africa/sudan-hunger-forcible-recruitment-jazira-intl-cmd/

3. Title:   Ghostly  city : How Russia war

In [127]:
articles_2024_2_unique.to_csv('articles_2024.csv', index=False)

In [31]:
# Concatenate all years' data with bodies
articles_2024_body = pd.read_csv('articles_2024.csv')
articles_2024_body = articles_2024_body.rename(columns={'text2': 'body'})
articles_2024_body.insert(loc=articles_2024_body.columns.get_loc('domain') + 1, column='year', value=2024)

all_articles = pd.concat([
    articles_2017_2020_body,
    articles_2021_body,
    articles_2022_2023_body,
    articles_2024_body
], ignore_index=True)

all_articles_with_body = all_articles[all_articles['body'].notna()].copy()
all_articles_with_body.to_csv('gdelt_articles_2017_2024_with_body.csv', index=False)
print(f"\n✓ Saved {len(all_articles_with_body)} articles with body to 'gdelt_articles_2017_2024_with_body.csv'")


✓ Saved 19606 articles with body to 'gdelt_articles_2017_2024_with_body.csv'


In [38]:
all_articles.to_csv('gdelt_articles_2017_2024_all.csv', index=False)
print(f"\n✓ Saved {len(all_articles)} articles to 'gdelt_articles_2017_2024_all.csv'")


✓ Saved 94175 articles to 'gdelt_articles_2017_2024_all.csv'


In [32]:
len(all_articles_with_body)/len(all_articles)

0.2081868861162729

In [33]:
all_articles_with_body.head(5)

Unnamed: 0,url,title,seendate,domain,year,body
3,http://www.cnbc.com/2017/01/04/marc-lasry-stoc...,Marc Lasry : Stocks could climb 10 % this year,20170104T194500Z,cnbc.com,2017,Expectations for positive GDP growth and less ...
5,http://www.cnbc.com/2017/01/06/cramer-says-the...,Cramer says the secret engine behind the Trump...,20170107T001500Z,cnbc.com,2017,The radical rally in the stock market ever sin...
15,http://www.cnbc.com/2017/01/04/cramer-remix-th...,Cramer Remix : The latest Trump tweet and the ...,20170105T004500Z,cnbc.com,2017,Deregulation is a large part of President-elec...
16,http://www.cnbc.com/2017/01/03/vietnam-airline...,"Vietnam Airlines soars 40 percent , giving it ...",20170103T134500Z,cnbc.com,2017,Shares of Vietnam Airlines surged 40 percent o...
17,http://www.cnbc.com/2017/01/06/this-market-rem...,"This market reminds me of 1987 , strategist Ji...",20170106T214500Z,cnbc.com,2017,There is a renewal of optimism that will carry...


In [34]:
all_articles_with_body.domain.value_counts()

domain
cnbc.com                  15362
nytimes.com                1619
cnn.com                    1591
us.cnn.com                  356
edition.cnn.com             342
bloomberg.com               282
vendingmarketwatch.com       21
wnewsj.com                   20
cooking.nytimes.com          10
marketwatch.com               1
graphics.wsj.com              1
archive.nytimes.com           1
Name: count, dtype: int64

In [36]:
all_articles_with_body.year.value_counts()

year
2019    4013
2024    3888
2018    3392
2020    2910
2023    2397
2017    2294
2022     387
2021     325
Name: count, dtype: int64