In [2]:
from scrapy.http import HtmlResponse
import requests

# Fetch the content
url = 'https://www.mckinsey.com/about-us/case-studies'
r = requests.get(url)

# Create a Scrapy response object
response = HtmlResponse(url=url, body=r.text, encoding='utf-8')

In [2]:
title = response.xpath('//title/text()').extract_first()
title
main_container = response.xpath('/html/body/div[1]/main/div[2]')
# find div with data-component = "mdc-c-content-block"
containers = main_container.xpath('//div[@class = "mck-o-container"]')
containers = containers.xpath('//div[contains(@class, "mdc-u-grid mdc-u-grid-gutter-lg")]')
containers.getall()

['<div class="mdc-u-grid mdc-u-grid-gutter-lg mdc-u-grid-col-sm-1 mdc-u-grid--align-start GenericItem_mck-c-generic-item__sGwKL"><div data-component="mdc-c-content-block" class="mdc-c-content-block___7p6Lu_93a27bc mdc-u-grid-gutter-xs GenericItem_mck-c-generic-item__content__gq1m0"><div data-component="mdc-c-content-block" class="mdc-c-content-block___7p6Lu_93a27bc"><h1 data-component="mdc-c-heading" class="mdc-c-heading___0fM1W_93a27bc mdc-u-ts-2 mdc-u-align-center"><div>Case Studies</div></h1></div></div></div>',
 '<div class="mdc-u-grid mdc-u-grid-gutter-lg mdc-u-grid-col-sm-1 mdc-u-grid--align-start mck-c-two-up-small__two-up-small-generic-item mck-c-two-up-small TwoUpSmall_mck-c-two-up-small--large__6LR8A GenericItem_mck-c-generic-item__sGwKL GenericItem_mck-c-generic-item--with-hover-effect__hmVy2 GenericItem_mck-c-generic-item--1by1-above-desktop__p_xzy"><div class="GenericItem_mck-c-generic-item__image__F_tIc"><a data-component="mdc-c-link" href="/industries/retail/how-we-help-

In [43]:
# Assuming containers contains all the relevant blocks:
# containers = containers.xpath('//div[contains(@class, "mdc-u-grid")]')

# Iterate over each container to extract the data
parsed_data_list = []

for container in containers:
    # Extract the title
    title = container.xpath('.//h5[@data-component="mdc-c-heading"]/a/span/text()').get()
    
    # Extract the hyperlink to the article
    article_url = container.xpath('.//h5[@data-component="mdc-c-heading"]/a/@href').get()
    
    # Extract the image URL
    image_src = container.xpath('.//picture/img/@src').get()
    image_url = response.urljoin(image_src) if image_src else None
    
    # Extract the description
    description = container.xpath('.//div[@data-component="mdc-c-description"]/div/text()').get()
    
    # Extract the publication date
    date = container.xpath('.//time/@datetime').get()
    
    # if there is no title and link, skip the row
    if not title and not article_url:
        continue
    
    # Combine the results
    parsed_data = {
        'title': title.strip() if title else 'No title found',
        'url': response.urljoin(article_url) if article_url else 'No URL found',
        'image_url': image_url if image_url else 'No image found',
        'description': description.strip() if description else 'No description found',
        'date': date if date else 'No date found'
    }
    

    
    parsed_data_list.append(parsed_data)

# Display parsed data
parsed_data_list

[{'title': 'Helping Starbucks design stores that are inclusive for all',
  'url': 'https://www.mckinsey.com/industries/retail/how-we-help-clients/helping-starbucks-design-stores-that-are-inclusive-for-all',
  'image_url': 'https://www.mckinsey.com/~/media/mckinsey/industries/retail/how%20we%20help%20clients/helping%20starbucks%20design%20stores%20that%20are%20inclusive%20for%20all/starbucks-case-study-thumb_1536x1536.jpg?cq=50&mw=767&car=42:25&cpy=Center',
  'description': 'The coffee company worked with McKinsey to create a design framework for more inclusive spaces for persons with disabilities.',
  'date': '2024-05-13T12:00:00Z'},
 {'title': 'Building a next-generation carbon platform to accelerate the path to net zero',
  'url': 'https://www.mckinsey.com/capabilities/mckinsey-digital/how-we-help-clients/building-a-next-generation-carbon-platform-to-accelerate-the-path-to-net-zero',
  'image_url': 'https://www.mckinsey.com/~/media/mckinsey/business%20functions/mckinsey%20digital/how

In [58]:
article_url = parsed_data_list[-1]['url']

# Fetch the content
r = requests.get(article_url)

# Create a Scrapy response object
response = HtmlResponse(url=article_url, body=r.text, encoding='utf-8')

In [59]:
# Locate the download link
download_link = response.xpath('//a[@data-layer-event-prefix="Download Link"]')
download_link.getall()

['<a data-component="mdc-c-link" href="/~/media/mckinsey/about%20us/social%20responsibility/2023%20esg%20report/mckinsey-and-company-2023-esg-report.pdf" class="DownloadsSidebar_mck-c-downloads-sidebar__download-link__fPqFQ mdc-c-link___lBbY1_93a27bc" target="_blank" data-layer-event-prefix="Download Link" data-layer-action="click" data-layer-report-type="" data-layer-file-name="mckinsey-and-company-2023-esg-report" data-layer-report-name="mckinsey-and-company-2023-esg-report&gt;"><span data-component="mdc-c-icon" class="mdc-c-icon___oi7ef_93a27bc mdc-c-icon--size-md___yi5fA_93a27bc mck-download-icon"></span><span class="mdc-c-link__label___Pfqtd_93a27bc"> (89 pages)</span></a>',
 '<a data-component="mdc-c-link" href="/~/media/mckinsey/about%20us/social%20responsibility/2023%20esg%20report/mckinsey-and-company-2023-esg-report-executive-summary.pdf" class="DownloadsSidebar_mck-c-downloads-sidebar__download-link__fPqFQ mdc-c-link___lBbY1_93a27bc" target="_blank" data-layer-event-prefix="

In [50]:
# Extract text content
article_parts = response.xpath(
    '//main[@data-layer-region="article-body"]//p//text() | '
    '//main[@data-layer-region="article-body"]//h3//text() | '
    '//main[@data-layer-region="article-body"]//h5//text() | '
    '//main[@data-layer-region="article-body"]//blockquote//p//text()'
).extract()

In [51]:
article_text = ' '.join(part.strip() for part in article_parts)

In [52]:
article_text

'Â On this page:  Opportunity | Solution | Impact The Opportunity Elevating standards of accessibility 2023 ESG Report: Accelerating sustainable and inclusive growth for all One in four adults in the U.S. have a disability. While retail spaces must comply with ADA codes, once elements like moveable furniture and merchandise are added, accessibility can become limited. Starbucks is hoping to set an elevated standard of accessibility by going beyond ADA requirements to create spaces that make everyone feel welcome. With 16,000 stores in the U.S., the coffee giant knew it had a big opportunity to catalyze change for this large and diverse population. Starbucks wanted to make accessible design the rule rather than the exception for its customers and employees. â\x80\x9cDesigning for people with disabilities is just good design for everybody,â\x80\x9d says Katie Young, Starbucks SVP of Store Operations. â\x80\x9cWe wanted to create a broader spectrum of choice and independence for everyone 

In [3]:
from scrapy.http import HtmlResponse
import requests

# Fetch the content
url = 'https://www.bcg.com/search'
r = requests.get(url)

# Create a Scrapy response object
response = HtmlResponse(url=url, body=r.text, encoding='utf-8')

In [None]:
response.text

In [1]:
# login page
url = 'https://www.bcg.com/search'

# create a session
s = requests.Session()

# get the page
r = s.get(url)
# handle the response
if r.status_code == 200:
    print(f'page {url} loaded successfully')
elif r.status_code == 404:
    print(f'page {url} not found')
else:
    print(f'page {url} failed to load with status code {r.status_code}')

NameError: name 'requests' is not defined