# CDP Data Scraping
This notebook scrapes each company's "CDP Climate Change Score" from its Google Finance quote page at "https://www.google.com/finance/quote/{company_ticker}:ETR".

- Only 1 request is sent per ticker
- The resulting CSV saves 1 row per ticker
- There are only ~35 rows because the ESG dataset contains hundreds of document entries per company, so the number of distinct tickers is small. This isn't a problem for the classifier, as those examples will still be useful for training.

In [6]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import pandas as pd

import nest_asyncio
nest_asyncio.apply()

async def get_company_cdp_score(session, company_ticker):
    """Fetch the CDP Climate Change Score shown on a ticker's Google Finance page.

    Args:
        session: An open aiohttp client session used to send the request.
        company_ticker: Ticker symbol; it is interpolated into the quote URL
            with the ``:ETR`` exchange suffix.

    Returns:
        The score text with surrounding whitespace stripped, or ``None`` when
        the response status is not 200, the "CDP Climate Change Score" label
        is not on the page, or the markup around the label does not have the
        expected span/div structure.
    """
    url = f"https://www.google.com/finance/quote/{company_ticker}:ETR"
    # Browser-like User-Agent; presumably needed so the page is served the
    # same way it is to a regular browser -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

    async with session.get(url, headers=headers) as response:
        if response.status != 200:
            return None
        html = await response.text()

    soup = BeautifulSoup(html, "html.parser")

    # Locate the label text node, then walk up to the <div> that wraps it.
    label = soup.find(string="CDP Climate Change Score")
    if label is None:
        return None

    # Each lookup may return None if the page layout differs from what is
    # expected; treat that as "no score" rather than raising AttributeError.
    label_span = label.find_parent("span")
    if label_span is None:
        return None
    container = label_span.find_parent("div")
    if container is None:
        return None

    # The grade is read from the first direct <div> child of that container.
    grade_element = container.find("div", recursive=False)
    if grade_element is None:
        return None
    return grade_element.text.strip()

async def scrape_scores(tickers):
    """Scrape the CDP score for every distinct ticker in ``tickers``.

    Args:
        tickers: Iterable of ticker symbols; it may contain repeats.

    Returns:
        A list with one entry per input ticker, in input order. Each entry is
        the value returned by ``get_company_cdp_score`` for the first
        occurrence of a ticker, and ``None`` for every repeated occurrence
        (so only one request is sent per ticker).
    """
    tickers = list(tickers)
    if not tickers:
        # Nothing to fetch; avoid opening a session at all.
        return []

    semaphore = asyncio.Semaphore(5)  # limit concurrent requests
    seen_tickers = set()

    # One session is shared by all requests so connections can be pooled and
    # reused instead of being set up and torn down for every ticker.
    async with aiohttp.ClientSession() as session:

        async def fetch_score(ticker):
            # Repeats are answered with None without touching the network.
            if ticker in seen_tickers:
                return None
            seen_tickers.add(ticker)

            async with semaphore:
                return await get_company_cdp_score(session, ticker)

        # gather() returns results in the order the awaitables were passed,
        # which keeps the output aligned with the input list.
        return await asyncio.gather(*(fetch_score(ticker) for ticker in tickers))

if __name__ == "__main__":
    # Load the ESG documents table; the file is pipe-delimited.
    esg_documents_df = pd.read_csv('./dax_esg_media_dataset/esg_documents_for_dax_companies.csv', sep='|')

    # 'symbol' is repeated on many rows, so reduce it to the distinct tickers
    # (first-seen order) before scraping. Missing symbols are skipped because
    # they cannot form a valid quote URL.
    tickers = esg_documents_df['symbol'].dropna().drop_duplicates().tolist()

    # scrape_scores returns one result per ticker, in the same order.
    scores = asyncio.run(scrape_scores(tickers))

    results_df = pd.DataFrame({'Ticker': tickers, 'CDP Score': scores})

    # Keep only tickers for which a score was actually found.
    results_df = results_df.dropna(subset=['CDP Score'])
    results_df.to_csv('./cdp_scores.csv', index=False)

    print("CDP scores saved to 'cdp_scores.csv'")


CDP scores saved to 'cdp_scores.csv'
