We can use SerpAPi's google news search. I have 100 requests in my Free account. I can get at most 100 article per request. So, in total we can have at most 10000 training data. 

In [13]:
!pip install google-search-results 


Collecting google-search-results
  Using cached google_search_results-2.4.2-py3-none-any.whl
Installing collected packages: google-search-results
Successfully installed google-search-results-2.4.2


trying one api call

In [17]:
from serpapi.google_search import GoogleSearch

params = {
  "engine": "google",
  "q": "mergers and acquisitions",
  "google_domain": "google.com",
  "gl": "us",
  "hl": "en",
  "tbm": "nws",
  "num": "20",
  "start": "0",
  "tbs": "cdr:1,cd_min:02-03-2024,cd_max:02-04-2024",
  "api_key": "mykey"
}

search = GoogleSearch(params)
results = search.get_dict()

print(results)

{'search_metadata': {'id': '668fe5df47ccf18c40d7ee93', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/bc273cd9b431b314/668fe5df47ccf18c40d7ee93.json', 'created_at': '2024-07-11 14:02:07 UTC', 'processed_at': '2024-07-11 14:02:07 UTC', 'google_url': 'https://www.google.com/search?q=mergers+and+acquisitions&oq=mergers+and+acquisitions&hl=en&gl=us&num=20&tbm=nws&tbs=cdr:1,cd_min:02-03-2024,cd_max:02-04-2024&start=0&sourceid=chrome&ie=UTF-8', 'raw_html_file': 'https://serpapi.com/searches/bc273cd9b431b314/668fe5df47ccf18c40d7ee93.html', 'total_time_taken': 1.68}, 'search_parameters': {'engine': 'google', 'q': 'mergers and acquisitions', 'google_domain': 'google.com', 'hl': 'en', 'gl': 'us', 'start': 0, 'num': '20', 'device': 'desktop', 'tbm': 'nws', 'tbs': 'cdr:1,cd_min:02-03-2024,cd_max:02-04-2024'}, 'search_information': {'query_displayed': 'mergers and acquisitions', 'total_results': 3, 'time_taken_displayed': 0.21, 'news_results_state': 'Results for exact spelling'

On experimenting, I saw that you get approximately 100 articles if you request with one week range. Thus we will request 7 days 100 times. we have 52 weeks in a year, so we need to search news up to two years ago. 

In [19]:
import datetime
from serpapi.google_search import GoogleSearch
import csv

API_KEY = "mykey"
QUERY = "mergers and acquisitions"
CSV_FILENAME = 'mergers_acquisitions_articles.csv'

def get_date_range(end_date):
    start_date = end_date - datetime.timedelta(days=7)
    return start_date.strftime("%m-%d-%Y"), end_date.strftime("%m-%d-%Y")

def search_articles(start_date, end_date):
    params = {
        "engine": "google",
        "q": QUERY,
        "google_domain": "google.com",
        "gl": "us",
        "hl": "en",
        "tbm": "nws",
        "num": "100",
        "start": "0",
        "tbs": f"cdr:1,cd_min:{start_date},cd_max:{end_date}",
        "api_key": API_KEY
    }
    
    try:
        search = GoogleSearch(params)
        results = search.get_dict()
        
        articles = []
        if 'news_results' in results:
            for article in results['news_results']:
                articles.append({
                    'title': article['title'],
                    'link': article['link']
                })
        return articles
    except Exception as e:
        print(f"Error occurred while searching: {e}")
        return []

def save_to_csv(articles, mode='a'):
    with open(CSV_FILENAME, mode, newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'link'])
        if mode == 'w':
            writer.writeheader()
        for article in articles:
            writer.writerow(article)

# Get the current date and the date 2 years ago
end_date = datetime.datetime.now()
start_date = end_date - datetime.timedelta(days=365*2)

# Initialize CSV file with headers
save_to_csv([], mode='w')

# Iterate week by week
total_articles = 0
try:
    while start_date < end_date:
        week_start, week_end = get_date_range(start_date)
        print(f"Searching for week: {week_start} to {week_end}")
        articles = search_articles(week_start, week_end)
        print(f"Found {len(articles)} articles")
        save_to_csv(articles, mode='a')
        total_articles += len(articles)
        start_date += datetime.timedelta(days=7)
except KeyboardInterrupt:
    print("\nScript interrupted by user. Saving collected data...")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    print(f"Scraping complete. Found {total_articles} articles.")
    print(f"Data saved to {CSV_FILENAME}")

Searching for week: 07-05-2022 to 07-12-2022
Found 100 articles
Searching for week: 07-12-2022 to 07-19-2022
Found 100 articles
Searching for week: 07-19-2022 to 07-26-2022
Found 100 articles
Searching for week: 07-26-2022 to 08-02-2022
Found 100 articles
Searching for week: 08-02-2022 to 08-09-2022
Found 97 articles
Searching for week: 08-09-2022 to 08-16-2022
Found 100 articles
Searching for week: 08-16-2022 to 08-23-2022
Found 94 articles
Searching for week: 08-23-2022 to 08-30-2022
Found 100 articles
Searching for week: 08-30-2022 to 09-06-2022
Found 100 articles
Searching for week: 09-06-2022 to 09-13-2022
Found 100 articles
Searching for week: 09-13-2022 to 09-20-2022
Found 89 articles
Searching for week: 09-20-2022 to 09-27-2022
Found 100 articles
Searching for week: 09-27-2022 to 10-04-2022
Found 100 articles
Searching for week: 10-04-2022 to 10-11-2022
Found 100 articles
Searching for week: 10-11-2022 to 10-18-2022
Found 100 articles
Searching for week: 10-18-2022 to 10-25-202

Strangely, I got duplicate articles so I needed to remove them in following code.

In [50]:
from collections import OrderedDict

def remove_duplicates(input_filename, output_filename):
    unique_articles = OrderedDict()
    duplicate_count = 0

    with open(input_filename, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            link = row['link']
            if link not in unique_articles:
                unique_articles[link] = row
            else:
                duplicate_count += 1
                
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in unique_articles.values():
            writer.writerow(row)

    return len(unique_articles), duplicate_count

if __name__ == "__main__":
    input_filename = 'mergers_acquisitions_articles.csv'  
    output_filename = 'deduplicated_articles.csv' 

    unique_count, removed_count = remove_duplicates(input_filename, output_filename)

    print(f"Processed the input file: {input_filename}")
    print(f"Removed {removed_count} duplicate entries")
    print(f"Saved {unique_count} unique articles to: {output_filename}")

Processed the input file: mergers_acquisitions_articles.csv
Removed 1412 duplicate entries
Saved 7732 unique articles to: deduplicated_articles.csv


In [49]:
import pandas as pd
csv_file = 'deduplicated_articles.csv'
df = pd.read_csv(csv_file)
df= pd.DataFrame(df)
print(df.shape[0])

7730


We can use the extract_content() function from manual_web_scrap.ipynb to extract the content from the article URLs and we will have our unlabeled data.