In [1]:
# emulate login with this website
# https://www.mckinsey.com/industries

import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin, urlencode
import ipywidgets as widgets
from IPython.display import display, HTML
import re
import base64
import pandas as pd
import datetime

# login page
url = 'https://www.mckinsey.com/about-us/case-studies'
uri = urlparse(url)
# get base url
base_url = '{uri.scheme}://{uri.netloc}'.format(uri=uri)
print(f'base url: {base_url}')

# create a session
s = requests.Session()

# get the page
r = s.get(url)
# handle the response
if r.status_code == 200:
    print(f'page {url} loaded successfully')
elif r.status_code == 404:
    print(f'page {url} not found')
else:
    print(f'page {url} failed to load with status code {r.status_code}')

# parse the page
soup = BeautifulSoup(r.content.decode('utf-8'), 'html.parser')

base url: https://www.mckinsey.com
page https://www.mckinsey.com/about-us/case-studies loaded successfully


In [2]:
soup

<!DOCTYPE html>
<html dir="ltr" lang="en"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><script type="text/javascript">window.NREUM||(NREUM={});NREUM.info = {"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"NRJS-209506966f4b1a90574","applicationID":"1062211690","transactionName":"MgRQNxEFCkoFUUZfXgtOfzUgSzdQEFdRWUMAIl0NFxYLVQhXQBl4CwVXGw==","queueTime":15,"applicationTime":4709,"agent":"","atts":""}</script><script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:[]},session_trace:{sampling_rate:0.0,mode:"FIXED_RATE",enabled:true,error_sampling_rate:0.0},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"1134196315",accountID:"2626151",trustKey:"475556",xpid:"VgAFV1dWCRABVFdQBQAGUF0B",licenseKey:"NRJS-209506966f4b1a90574",applicationID:"1062211690"};window.NREUM||(NRE

In [3]:
# extract #skipToMain > div:nth-child(2)
container = soup.select_one('#skipToMain > div:nth-child(2)')
# extract element with keyword mdc-u-grid
elements = container.select('.mdc-u-grid')
# filter those that doesn't have a data-component name mdc-c-heading and mdc-c-description and without href
elements = [e for e in elements if e.select_one('[data-component="mdc-c-heading"]') and e.select_one('[data-component="mdc-c-description"]') and e.select_one('[data-component="mdc-c-icon"]') and e.select_one('a')]

In [4]:
# for each element, extract title (mdc-c-heading), description (mdc-c-description), and link (a) and picture (mdc-c-picture), store them in a list of dictionary
data = []
for e in elements:
    data.append({
        'title': e.select_one('[data-component="mdc-c-heading"]').text,
        'description': e.select_one('[data-component="mdc-c-description"]').text,
        'link': urljoin(base_url, e.select_one('a')['href']),
        'picture': urljoin(base_url, e.select_one('[data-component="mdc-c-picture"] img')['src']),
        # add a timestamp
        'scrapped_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    })

In [5]:
data

[{'title': 'Helping Starbucks design stores that are inclusive for all',
  'description': "Our clients around the world are achieving enduring change in their capabilities and performance. Leading with technology, we partner with them to see new potential for growth, innovate to net zero, and build capabilities across their entire organization, creating impact that goes beyond financial and operational performance improvements. Here's what that looks like.",
  'link': 'https://www.mckinsey.com/industries/retail/how-we-help-clients/helping-starbucks-design-stores-that-are-inclusive-for-all',
  'picture': 'https://www.mckinsey.com/~/media/mckinsey/industries/retail/how%20we%20help%20clients/helping%20starbucks%20design%20stores%20that%20are%20inclusive%20for%20all/starbucks-case-study-thumb_1536x1536.jpg?cq=50&mw=767&car=42:25&cpy=Center',
  'scrapped_at': '2024-08-15 11:35:18'},
 {'title': 'Helping Starbucks design stores that are inclusive for all',
  'description': 'May 13, 2024 - The

In [6]:
# based on the list, create a folder named mckinsey, in that folder create a folder named case-studies, in that folder for each element, create a folder named title, in that folder save the dictionary as json file, and download the picture and save it as picture.jpg
from pathlib import Path
import os
import shutil

# create mckinsey folder
mckinsey_folder = Path('mckinsey')

if not mckinsey_folder.exists():
    mckinsey_folder.mkdir()
    
# create case-studies folder
case_studies_folder = mckinsey_folder / 'case-studies'

if not case_studies_folder.exists():
    case_studies_folder.mkdir()
    
# for each element
for d in data:
    # create title folder, skip invalid characters for folder name
    title_folder = case_studies_folder / re.sub(r'[<>:"/\\|?*]', ' ', d['title'])
    
    print(f'processing: {d["title"]}')
    
    if not title_folder.exists():
        title_folder.mkdir()
        
    # save the dictionary as json file
    with open(title_folder / 'data.json', 'w') as f:
        json.dump(d, f)
        
    # download the picture and save it as picture.jpg
    r = s.get(d['picture'])
    
    if r.status_code == 200:
        with open(title_folder / 'picture.jpg', 'wb') as f:
            f.write(r.content)

processing: Helping Starbucks design stores that are inclusive for all
processing: Helping Starbucks design stores that are inclusive for all
processing: Helping Starbucks design stores that are inclusive for all
processing: Building a next-generation carbon platform to accelerate the path to net zero
processing: How Lufthansa is using data to reduce costs and improve spend and carbon transparency
processing: How Lufthansa is using data to reduce costs and improve spend and carbon transparency
processing: Banking on innovation: How ING uses generative AI to put people first
processing: From farm to tablet: Building a new business to solve an old challenge
processing: Rewired in action
processing: Partnering on America’s toughest challenges
processing: Made in Africa: Catalyzing stronger, sustainable, and inclusive economies
processing: How a government agency is preparing workers to thrive in the skills-based economy
processing: How a global components manufacturer built an ambitious c

In [7]:
data

[{'title': 'Helping Starbucks design stores that are inclusive for all',
  'description': "Our clients around the world are achieving enduring change in their capabilities and performance. Leading with technology, we partner with them to see new potential for growth, innovate to net zero, and build capabilities across their entire organization, creating impact that goes beyond financial and operational performance improvements. Here's what that looks like.",
  'link': 'https://www.mckinsey.com/industries/retail/how-we-help-clients/helping-starbucks-design-stores-that-are-inclusive-for-all',
  'picture': 'https://www.mckinsey.com/~/media/mckinsey/industries/retail/how%20we%20help%20clients/helping%20starbucks%20design%20stores%20that%20are%20inclusive%20for%20all/starbucks-case-study-thumb_1536x1536.jpg?cq=50&mw=767&car=42:25&cpy=Center',
  'scrapped_at': '2024-08-15 11:35:18'},
 {'title': 'Helping Starbucks design stores that are inclusive for all',
  'description': 'May 13, 2024 - The