In [1]:
import requests
from bs4 import BeautifulSoup
import helper_functions
import pandas as pd
import os
import datetime
import json

# Instantiate the summarisation pipeline from helper_functions. The instance is
# bound to the name `OpenAIPipeline` because later cells call it by that name.
OpenAIPipeline = helper_functions.OpenAIPipeline()

# Base name for this scrape: used as the output file prefix and as the
# Google Sheets worksheet name.
name = "bcg_search_posts"

# First and last search-result page to scrape (both inclusive).
start_page, end_page = 1, 100  # Example: Scrape pages from 1 to 100

In [2]:
def _extract_listing_fields(block):
    """Read the fields shown on a single search-result block.

    Args:
        block: A BeautifulSoup tag for one search result.

    Returns:
        A dict with the keys ``title``, ``link``, ``img_url``, ``type``,
        ``date`` and ``description``, or ``None`` when the block has no
        title link. Missing optional elements yield ``None`` (``"N/A"`` for
        the type).

    Raises:
        KeyError: If the title link has no ``href`` attribute.
    """
    heading_tag = block.find("h2", class_="title")
    title_tag = (
        heading_tag.find("a", class_="Link") if heading_tag is not None else None
    )
    if title_tag is None:
        return None

    # Walk div.result-picture > picture > img, stopping at the first level
    # that is missing.
    img_tag = block.find("div", class_="result-picture")
    for child_name in ("picture", "img"):
        if img_tag is None:
            break
        img_tag = img_tag.find(child_name)

    # Reset the subtitle-derived fields for every block so that a result
    # without a subtitle never inherits values from the previous one.
    article_type, date = "N/A", None
    subtitle_tag = block.find("p", class_="subtitle")
    if subtitle_tag is not None:
        subtitle_text = subtitle_tag.get_text(strip=True)
        if "|" in subtitle_text:
            # "<type> | <date>": split on the last "|" only, so an extra
            # separator cannot break the two-value unpacking.
            article_type, date = map(str.strip, subtitle_text.rsplit("|", 1))
        else:
            date = subtitle_text

    intro_tag = block.find("p", class_="intro")

    return {
        "title": title_tag.get_text(strip=True),
        "link": title_tag["href"],
        "img_url": img_tag.get("src") if img_tag is not None else None,
        "type": article_type,
        "date": date,
        "description": (
            intro_tag.get_text(strip=True) if intro_tag is not None else None
        ),
    }


def parse_article_blocks(article_blocks) -> list:
    """Turn search-result blocks into enriched article records.

    For each block the listing fields are extracted, the linked article is
    parsed with ``helper_functions.parse_article`` and merged into the record,
    and the record is passed to ``OpenAIPipeline.process_item``.

    Args:
        article_blocks: Iterable of BeautifulSoup tags, one per search result.

    Returns:
        A list of dicts, one per successfully processed article. This is
        always a list (possibly empty): a block without a title link, or one
        that raises while being processed, is skipped on its own and does not
        discard the rest of the page.
    """
    data = []
    for block in article_blocks:
        try:
            item = _extract_listing_fields(block)
            if item is None:
                # Skip only this block; returning here would throw away every
                # article already collected for the page.
                print("Skipping search result without a title link")
                continue

            print(
                "Title: {title}, Link: {link}, Image URL: {img_url}, "
                "Type: {type}, Date: {date}, Description: {description}".format(
                    **item
                )
            )

            # Merge the fields parsed from the article page into the record.
            parsed_article = helper_functions.parse_article(item["link"])
            item.update(parsed_article)

            # Hand the record to the OpenAI pipeline; the return value is not
            # used here.
            OpenAIPipeline.process_item(item)

            data.append(item)
        except Exception as e:
            # Best-effort scraping: report the failure and move on.
            print(f"Error: {e}")
            continue

    return data

In [3]:
# Build timestamped output paths so every run writes to its own pair of files.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join(os.getcwd(), "..", "data")

# Create the output directory up front; without it the first write in
# export_data fails with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# The JSON file is written one object per line (JSON Lines) by export_data.
json_output_file_path = os.path.join(output_dir, name + f"_{timestamp}.json")
csv_output_file_path = os.path.join(output_dir, name + f"_{timestamp}.csv")

# Open the Google Sheets connection once; export_data reuses it for every
# batch, and it is closed at the end of the notebook.
GoogleSheetsPipeline = helper_functions.GoogleSheetsPipeline(
    spreadsheet_name="Web Scraping Data", worksheet_name=name
)
GoogleSheetsPipeline.open_connection()


def export_data(data: list, name: str):
    """Append a batch of records to the JSON Lines file, the CSV file and Google Sheets.

    Args:
        data: List of dict records to export. An empty list is a no-op.
        name: Dataset name. Currently unused; kept so existing callers keep
            working.

    The CSV header is kept consistent across batches: rows are always written
    in the column order of the existing header, and when a batch introduces
    columns the file does not have yet, the file is rewritten with the widened
    header so earlier rows stay aligned.
    """
    if not data:
        # Nothing to write. An empty DataFrame would also produce a CSV
        # without a header, which breaks the next append.
        print("No data to export; skipping.")
        return

    # Append data to JSON Lines file (one JSON object per line)
    with open(json_output_file_path, "a", encoding="utf-8") as f:
        for item in data:
            json.dump(item, f)
            f.write("\n")

    df = pd.DataFrame(data)

    csv_is_new = (
        not os.path.exists(csv_output_file_path)
        or os.path.getsize(csv_output_file_path) == 0
    )
    if csv_is_new:
        # First batch: create the file together with its header.
        df.to_csv(csv_output_file_path, index=False)
    else:
        # Read only the header to learn the column order already on disk.
        existing_columns = list(pd.read_csv(csv_output_file_path, nrows=0).columns)
        new_columns = [col for col in df.columns if col not in existing_columns]

        if not new_columns:
            # Same schema: append rows in the order of the existing header so
            # every value lands under the right column.
            df.reindex(columns=existing_columns).to_csv(
                csv_output_file_path, mode="a", index=False, header=False
            )
        else:
            # Schema grew: rewrite the file with the widened header, then
            # append the new rows. Existing cells are read back as raw text so
            # the round trip does not reinterpret their values.
            combined_columns = existing_columns + new_columns
            existing_df = pd.read_csv(
                csv_output_file_path, dtype=str, keep_default_na=False
            )
            existing_df.reindex(columns=combined_columns).to_csv(
                csv_output_file_path, index=False
            )
            df.reindex(columns=combined_columns).to_csv(
                csv_output_file_path, mode="a", index=False, header=False
            )

    print(
        f"Data has been appended to {json_output_file_path} and {csv_output_file_path}"
    )

    # Upload the data to Google Sheets
    GoogleSheetsPipeline.process_data(data)

In [4]:
# Accumulate parsed articles across pages and flush them in batches, so a
# failure late in the run loses at most one batch of work.
all_data = []
batch_size = 5  # Define how many pages you want to accumulate before writing to file
request_timeout = 30  # Seconds to wait for a page; without it a stalled request hangs the run

# Loop through the pages (end_page is inclusive)
for page_num in range(start_page, end_page + 1):
    # Construct the URL with the page number
    url = f"https://www.bcg.com/search?p={page_num}"

    print(f"Fetching page {page_num}...")

    article_blocks = []
    try:
        # Fetch the page; HTTP error statuses are treated as failures rather
        # than being parsed as if they were a results page.
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page_num}: {e}")
    else:
        webpage_content = response.content

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(webpage_content, "html.parser")

        # Locate the container for the article blocks. Either level may be
        # absent (e.g. past the last results page or after a layout change).
        results_container = soup.find("div", class_="results-container")
        articles_container = (
            results_container.find("div", {"data-qa": "hits"})
            if results_container is not None
            else None
        )

        if articles_container is None:
            print(f"No results container found on page {page_num}; skipping.")
        else:
            # Find all the article blocks within the container
            article_blocks = articles_container.find_all(
                "section", class_="search-result"
            )

    # Parse the article blocks; tolerate a None result from the parser.
    data_temp = parse_article_blocks(article_blocks) or []

    # Append the data to the current batch
    all_data.extend(data_temp)

    # Note: this counts the articles in the current, not yet exported batch.
    print(f"Total articles: {len(all_data)}")

    # If the batch size is reached (or this is the last page), export the
    # batch and clear the list. Empty batches are not exported.
    if page_num % batch_size == 0 or page_num == end_page:
        if all_data:
            export_data(all_data, name)
        all_data.clear()

Fetching page 1...
Title: The Golden Opportunities Buried in E-Waste, Link: https://www.bcg.com/publications/2024/golden-opportunities-buried-in-ewaste-recycling, Image URL: https://web-assets.bcg.com/dims4/default/c148859/2147483647/strip/true/crop/1620x1620+630+0/resize/100x100!/format/webp/quality/90/?url=http%3A%2F%2Fboston-consulting-group-brightspot.s3.amazonaws.com%2Fba%2F31%2Ffa4925564f1da8fbf3602ffd62e2%2Fthe-golden-opportunities-buried-in-e-waste-rectangle.jpg, Type: Update, Date: August 20, 2024, Description: Advancing clean technologies can extract critical minerals such as copper, gold, and palladium from electronic waste.
Generated summary: The UK's Royal Mint is recycling gold from electronic waste, part of a trend utilizing clean technologies for local e-waste processing. This approach emphasizes sustainability, supports local economies, and addresses supply chain risks. Raising consumer awareness, regulatory support, and designing for
Title: How CEOs Can Find the Right

In [5]:
# Close the Google Sheets connection that was opened before the export step.
GoogleSheetsPipeline.close_connection()

GoogleSheetsPipeline finished processing and saved data to https://docs.google.com/spreadsheets/d/1qmPXhTIHutHWUto37O97aKG-vW2A1fCzD6WqbqJu9v4
