In [3]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import json
import time

In [4]:
website = "https://flo.health/menstrual-cycle"

In [5]:
def fetch_html_content(url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    service = Service('/usr/local/bin/chromedriver') 
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(10)
    html_content = driver.page_source
    driver.quit()
    return html_content

In [6]:
def get_article_links(website):
    html_content = fetch_html_content(website)
    soup = BeautifulSoup(html_content, 'html.parser')
    links = []
    
    for a in soup.find_all('h3', class_='flo-categories__article-title'):
        parent_link = a.find_parent('a', href=True) 
        if parent_link:
            href = parent_link['href']
            if href and href.startswith('/menstrual-cycle/'):
                links.append(f"https://flo.health{href}")
    return links[:5]

In [7]:
article_links = get_article_links(website)
print(article_links)

['https://flo.health/menstrual-cycle/health/ovulation/nipple-sensitivity-after-ovulation', 'https://flo.health/menstrual-cycle/health/follicular-phase', 'https://flo.health/menstrual-cycle/health/luteal-phase', 'https://flo.health/menstrual-cycle/health/ovulation/what-is-ovulation', 'https://flo.health/menstrual-cycle/sex/sexual-health/sex-on-period']


In [13]:
def article_contents(article_url):
    
    html_content = fetch_html_content(article_url)
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract the title
    title_tag = soup.find('h1')  # Adjust the tag based on actual HTML
    title = title_tag.get_text(strip=True) if title_tag else 'No title'
    
    # Extract the publication date
    date_container = soup.find('div', class_='flo-article-banner-bottom__info-panel-date--item')  
    date_tag = date_container.find('span')  # Adjust the tag based on actual HTML
    date = date_tag.get_text(strip=True) if date_tag else 'No date'

    content = []

    # Define classes to search for content
    classes_to_search = [
        'flo-content__main', 
        'flo-article-text', 
        'flo-article-text__inner'
    ]

    # Extract content from the specified classes
    for class_name in classes_to_search:
        content_tags = soup.find_all(class_=class_name)
        for tag in content_tags:
            for element in tag.find_all(['p', 'h1', 'h2', 'h3']):
                content.append(element.get_text(strip=True))
        
        # Extract content under subheadings and their paragraphs
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            content.append(heading.get_text(strip=True))
            sibling = heading.find_next_sibling()
            while sibling and sibling.name == 'p':
                content.append(sibling.get_text(strip=True))
                sibling = sibling.find_next_sibling()
        
    content = " ".join(content) if content else 'No content'
        
        # Print the extracted information for debugging
    print(f"Title: {title}")
    print(f"URL: {article_url}")
    print(f"Date: {date}")
    print(f"Content Preview: {content}...")  # Print a preview of the content

    return {
        'title': title,
        'url': article_url,
        'date': date,
        'content': content
    }

In [14]:
# Test the function with one article URL
article_link = "https://flo.health/menstrual-cycle/health/ovulation/nipple-sensitivity-after-ovulation"
article_data = article_contents(article_link)

Title: Sensitive nipples after ovulation: What does it mean?
URL: https://flo.health/menstrual-cycle/health/ovulation/nipple-sensitivity-after-ovulation
Date: 24 July 2024
Content Preview: Every piece of content at Flo Health adheres to the highest editorial standards for language, style, and medical accuracy. To learn what we do to deliver the best health and lifestyle insights to you, check outour content review principles. Breast pain and nipple sensitivity are common symptoms at different points in your cycle. Here’s what can cause it and when to speak to your doctor. Changes in the way yourboobs and nipples lookand feel throughout the month can be confusing and uncomfortable. However, it’s not uncommon to experiencebreast pain or nipple sensitivityat different points in your cycle. You may be able to attribute this tohormone changes. Knowing what can be considered typical and when pain or sensitivity might be something to worry about can be difficult. So, here’s everything you nee

In [17]:
# Example usage
articles = []
for link in article_links:
    content = article_contents(link)
    articles.append(content)
    print(content)

Title: Sensitive nipples after ovulation: What does it mean?
URL: https://flo.health/menstrual-cycle/health/ovulation/nipple-sensitivity-after-ovulation
Date: 24 July 2024
Content Preview: Changes in the way yourboobs and nipples lookand feel throughout the month can be confusing and uncomfortable. However, it’s not uncommon to experiencebreast pain or nipple sensitivityat different points in your cycle. You may be able to attribute this tohormone changes. Knowing what can be considered typical and when pain or sensitivity might be something to worry about can be difficult. So, here’s everything you need to know about breast pain andnipple sensitivity, particularly afterovulation....
{'title': 'Sensitive nipples after ovulation: What does it mean?', 'url': 'https://flo.health/menstrual-cycle/health/ovulation/nipple-sensitivity-after-ovulation', 'publication_date': '24 July 2024', 'content': 'Changes in the way yourboobs and nipples lookand feel throughout the month can be confusing and