## Webscraping ethics

- The absence of explicit scraping prohibitions in `robots.txt`, Terms of Service, or the Privacy Policy does not automatically grant permission for data extraction. So I directly reached out to La Leche League GB for approval before scraping.
By following these guidelines, web scraping activities were conducted in an ethical and legally compliant manner.

In [57]:
# import libraries

from bs4 import BeautifulSoup
import requests
import random
import pandas as pd


I used responsible request rates to prevent server strain, by scraping each information webpage separately.

In [None]:
# URL to scrape
url = "https://laleche.org.uk/category/breastfeeding-information/page/10/"

# Custom headers to simulate a real browser request (found user-agent string in the browser dev tools)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'
    }

# Send a GET request to the website
response = requests.get(url, headers = headers)

# Parse the content of the page
soup = BeautifulSoup(response.text, 'html.parser')

In [59]:
print(response.status_code)

200


In [None]:
# Function to extract the content of an individual post
def fetch_post_content(post_url):
          responsepost = requests.get(post_url,headers=headers)
          soup = BeautifulSoup(responsepost.text, 'html.parser')
           # Extract the main content of the post
          content_tag = soup.find('div', {'class': 'entry-content'})  
          if content_tag:
               return content_tag.get_text(strip=True)
          else:
               return None


In [None]:
import time

# Locate all posts on the page
    
posts = soup.find_all('article')  
    
# Prepare lists to store post titles, URLs, and content
post_titles = []
post_urls = []
post_contents = []

    # Step 5: Extract the post titles and URLs
for post in posts:
    title_tag = post.find('h2')
    if title_tag:
        post_title = title_tag.get_text(strip=True)
        post_titles.append(post_title)
        
        link_tag = title_tag.find('a')
        if link_tag and 'href' in link_tag.attrs:
            post_url = link_tag['href']
            post_urls.append(post_url)

            # Step 6: Fetch the post content
            content = fetch_post_content(post_url)
            post_contents.append(content if content else "No content available")
        
    # Add a delay between requests to avoid overloading the server (polite scraping)
    time.sleep(1) 

In [None]:
# Create a DataFrame
df = pd.DataFrame({
        'Post Title': post_titles,
        'URL': post_urls,
        'Content': post_contents
    })
print(df.head)

<bound method NDFrame.head of                           Post Title  \
0           My Baby Won’t Breastfeed   
1       Jaundice in Healthy Newborns   
2    Is My Baby Getting Enough Milk?   
3                   Inverted nipples   
4             If You Leave Your Baby   
5          Dummies and Breastfeeding   
6  Caesarean Birth and Breastfeeding   
7            Beginning Breastfeeding   
8  Antenatal Expression of Colostrum   
9            Adjusting to Motherhood   

                                                 URL  \
0    https://laleche.org.uk/my-baby-wont-breastfeed/   
1                   https://laleche.org.uk/jaundice/   
2  https://laleche.org.uk/is-my-baby-getting-enou...   
3           https://laleche.org.uk/inverted-nipples/   
4     https://laleche.org.uk/if-you-leave-your-baby/   
5  https://laleche.org.uk/dummies-and-breastfeeding/   
6  https://laleche.org.uk/caesarean-birth-and-bre...   
7    https://laleche.org.uk/beginning-breastfeeding/   
8  https://laleche.org.uk

In [None]:
# Save to a CSV file 
df.to_csv('breastfeeding_info_links10.csv', index=False)
print("Done!")

Done!
