In [24]:
!python --version

Python 3.10.13


In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


import spacy
nlp = spacy.load("en_core_web_sm")

import time
import datetime

In [60]:

def scrape_sitemap(url, timeout=30):
    """
    Extract all the page links from a company's sitemap.xml.

    Every <loc> entry of the sitemap is collected. An entry that ends in
    'xml' is treated as a nested sitemap: it is fetched and its own <loc>
    entries are collected in its place. Only one level of nesting is
    followed. A sitemap that answers with an HTTP error status contributes
    no links.

    input: url     - url for a company sitemap.xml
           timeout - seconds to wait for each HTTP response (default 30)
    output : pandas Series with all the (de-duplicated, non-empty) links,
             or None if no links found
    """

    def fetch_locs(sitemap_url):
        # Stripped text of every <loc> tag in one sitemap ([] on HTTP error).
        response = requests.get(sitemap_url, timeout=timeout)
        if not response.ok:
            return []
        # Sitemaps are XML documents, so parse every one of them (top level
        # and nested alike) with the XML parser rather than the HTML one.
        soup = BeautifulSoup(response.text, 'lxml-xml')
        return [loc.text.strip() for loc in soup.find_all('loc')]

    page_links = []
    for link in fetch_locs(url):
        if link.endswith('xml'):
            # Nested sitemap: replace it by the links it lists.
            page_links.extend(fetch_locs(link))
        else:
            page_links.append(link)

    # An empty <loc></loc> yields '' which is not a usable link.
    page_links = [link for link in page_links if link]

    if page_links:
        return pd.Series(page_links).drop_duplicates()

    return None


def scrape_website(alllinks, timeout=30):
    """
    Extract all the text data from the webpages of a company.

    Each link is downloaded; <header>, <nav> and <footer> elements are
    removed and the remaining text of the page is kept. Pages that answer
    with an HTTP error status, or that cannot be requested at all, are
    skipped so that one bad link does not discard the pages already scraped.

    input: alllinks - pandas Series (or any other iterable) with the links
                      of the company's website
           timeout  - seconds to wait for each HTTP response (default 30)
    output : pandas dataframe with the columns key (webpage link), text
             (the text of the webpage) and timestamp (the date on which
             the data was scraped).
    """
    text_dict = {}

    # list() accepts a pandas Series as well as a plain list of links.
    for link in list(alllinks):
        try:
            r = requests.get(link, timeout=timeout)
        except requests.RequestException as err:
            print(f"skipping {link}: {err}")
            continue

        if r.ok:
            print(link)
            bs = BeautifulSoup(r.text, 'lxml')
            # Drop the page chrome so only the main content text remains.
            for tag in bs.find_all(['header', 'nav', 'footer']):
                tag.decompose()

            text_dict[link] = bs.get_text(separator=' ', strip=True)

    df = pd.DataFrame({
        'key': list(text_dict.keys()),
        'text': list(text_dict.values()),
    })
    df['timestamp'] = pd.to_datetime(datetime.date.today())
    return df

# run the scraping engine, and save the extracted data to csv
def main(sitemap_csv="sitemap.csv", output_csv="apple.csv", max_pages=10):
    """
    Scrape every sitemap listed in a csv file and store the page texts.

    The csv file must have a 'sitemap' column holding one sitemap url per
    row. For each sitemap the first max_pages links are scraped; the
    results of all sitemaps are combined and written to a single csv file,
    so a later sitemap does not overwrite the data of an earlier one.
    Sitemaps for which no links are found are reported and skipped.

    input: sitemap_csv - path of the csv file listing the sitemaps
           output_csv  - path of the csv file the scraped data is written to
           max_pages   - number of webpages scraped per sitemap (demo limit)
    output : None
    """
    print("Starting scraping :\n")
    sitemap = pd.read_csv(sitemap_csv)

    frames = []
    for item in sitemap['sitemap']:
        print(f"working on {item}")
        links = scrape_sitemap(item)
        # scrape_sitemap returns None when it finds no links at all.
        if links is None:
            print(f"no webpages found for {item}, skipping \n")
            continue
        print(f"{links.shape[0]} webpages found for scraping. For demo scraping only the first {max_pages} webpage \n")

        frames.append(scrape_website(links[:max_pages]))

    if not frames:
        print("\nNothing was scraped. No data stored")
        return

    pd.concat(frames, ignore_index=True).to_csv(output_csv, index=False)
    print(f"\nFinished scraping. Data stored in {output_csv}")
        
        
# Entry point: start the scraping run when this code is executed directly
# (as a script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()
              
       
        
        
    

Starting scraping :

working on https://www.apple.com/sitemap.xml
623 webpages found for scraping. For demo scraping only the first 10 webpage 

https://www.apple.com/
https://www.apple.com/accessibility/
https://www.apple.com/accessibility/cognitive/
https://www.apple.com/accessibility/hearing/
https://www.apple.com/accessibility/mobility/
https://www.apple.com/accessibility/speech/
https://www.apple.com/accessibility/vision/
https://www.apple.com/airplay/
https://www.apple.com/airpods-2nd-generation/
https://www.apple.com/airpods-2nd-generation/compare/

Finished scraping. Data stored in apple.csv


In [54]:
# Load the csv file named 'apple.csv' (the file name main() writes to) and
# display the resulting dataframe as the cell output.
df = pd.read_csv('apple.csv')
df

Unnamed: 0,key,text,timestamp
0,https://www.apple.com/,Apple Apple Save on Mac or iPad with education...,2023-09-25
1,https://www.apple.com/accessibility/,Accessibility - Apple Apple Accessibility Make...,2023-09-25
2,https://www.apple.com/accessibility/cognitive/,Accessibility - Cognitive - Apple Cognitive If...,2023-09-25
3,https://www.apple.com/accessibility/hearing/,Accessibility - Hearing - Apple Hearing Explor...,2023-09-25
4,https://www.apple.com/accessibility/mobility/,Accessibility - Mobility - Apple Mobility Disc...,2023-09-25
5,https://www.apple.com/accessibility/speech/,Accessibility - Speech - Apple Speech If you h...,2023-09-25
6,https://www.apple.com/accessibility/vision/,Accessibility - Vision - Apple Vision If you’r...,2023-09-25
7,https://www.apple.com/airplay/,AirPlay - Apple AirPlay Watch. Listen. Share. ...,2023-09-25
8,https://www.apple.com/airpods-2nd-generation/,AirPods (2nd generation) - Apple AirPods Wirel...,2023-09-25
9,https://www.apple.com/airpods-2nd-generation/c...,AirPods - Compare Models - Apple Compare AirPo...,2023-09-25


## Example text that was extracted

In [62]:
# Show the link and the extracted text of the first row, one per line.
print(df['key'].iloc[0], df['text'].iloc[0], sep='\n')

https://www.apple.com/
Apple Apple Save on Mac or iPad with education pricing and get a gift card up to $150. Gift card offer ends October 2. 1 Shop now iPhone 15 Pro Titanium. So strong. So light. So Pro. Learn more Buy A heroic animation that shows light reflecting across iPhone 15 Pro to emphasize its all-new titanium design. iPhone 15 New camera. New design. Newphoria. Learn more Buy Apple Watch Series 9 Smarter. Brighter. Mightier. Learn more Buy Apple Watch Ultra 2 Next level adventure. Learn more Buy Carbon Neutral A first for Apple Watch. And Apple. Learn more Watch the film Shop iPhone with us Save with trade-in, connect to your carrier, and transfer your data, all right here at Apple. Shop iPhone Apple Trade In Get $200-$650 in credit when you trade in iPhone 11 or higher. 2 See what your device is worth AirPods Pro Adaptive Audio. Now playing. Learn more Buy Apple Card Get up to 3% Daily Cash back with every purchase. Learn more Apply now Apply now Apple TV+


In [63]:
# Show the link and the extracted text of the second row, one per line.
print(df['key'].iloc[1], df['text'].iloc[1], sep='\n')

https://www.apple.com/accessibility/
Accessibility - Apple Apple Accessibility Make yours. The best technology works for everyone. That’s why our products and services are inclusive by design, with built-in accessibility features to help you connect, create, and do what you love — in the ways that work best for you. Vision Bigger, bolder, and clearer for you . Magnifier + Point and Speak Point to hear your way around. Reheat REHEAT Read more about Magnifier + Point and Speak Magnifier + Point and Speak Magnifier works like a digital magnifying glass, using the camera on your iPhone or iPad to increase the size of anything you point it at — from a prescription bottle to a candlelit menu. For those who are blind or have low vision and want more information about their physical surroundings, Detection Mode in Magnifier combines input from the camera, LiDAR Scanner, and on-device machine learning to offer intelligent tools like People Detection, Door Detection, Image Descriptions, Text Det