In [None]:
from bs4 import BeautifulSoup
import requests
from time import sleep
import pandas as pd

# Crawl configuration: where the blog index lives and how many index pages to walk.
main_url = 'https://www.nami.org/Blogs/NAMI-Blog'
PAGES_TO_GET = 20  # 100 articles per page

# 1-based page numbers, one entry per listing page that will be requested.
pages = [page_number for page_number in range(1, PAGES_TO_GET + 1)]

# Listing-page URLs in crawl order: <main_url>?page=1, <main_url>?page=2, ...
urls = [f"{main_url}?page={page_number}" for page_number in pages]

In [None]:
#scraping article titles and links 
def get_article_links(url, timeout=30):
    """Collect the article titles and absolute links shown on one listing page.

    Args:
        url: URL of a blog listing page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A tuple ``(article_titles, article_links)`` of two equally long lists.
        Both lists are empty when the response status is not 200, so callers
        can always unpack the result.

    Raises:
        Any exception raised by ``requests.get`` (connection failure, timeout)
        propagates to the caller.
    """
    from urllib.parse import urljoin

    article_titles = []
    article_links = []

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Return empty lists rather than None: callers tuple-unpack the result.
        return article_titles, article_links

    soup = BeautifulSoup(response.text, 'html.parser')
    for article in soup.find_all(class_='col-md-4 col-lg-3'):
        title_tag = article.find('p')
        link_tag = article.find('a')
        href = link_tag.get('href') if link_tag is not None else None
        # find() yields None for a missing element; skip incomplete cards so a
        # single odd card cannot abort the page or desynchronise the two lists.
        if title_tag is None or not href:
            continue
        article_titles.append(title_tag.text)
        # urljoin copes with both site-relative and already-absolute hrefs.
        article_links.append(urljoin('https://www.nami.org', href))

    return article_titles, article_links

    
#scraping article content
def scrape_article_content(article_url, timeout=30):
    """Fetch one article page and extract its byline, date parts and body text.

    Args:
        article_url: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A 5-tuple ``(author, year, month, day, content)``. Every element is
        ``None`` when the response status is not 200. When the page loads but
        a particular element is absent from the markup, only that element is
        ``None`` and the remaining fields are still returned.

    Raises:
        Any exception raised by ``requests.get`` (connection failure, timeout)
        propagates to the caller.
    """
    response = requests.get(article_url, timeout=timeout)
    if response.status_code != 200:
        return None, None, None, None, None

    article_soup = BeautifulSoup(response.text, 'html.parser')

    author = _element_text(article_soup, 'p', 'authorname')
    if author is not None:
        # Collapse the double space that can appear inside the byline.
        author = author.replace("  ", " ")
    year = _element_text(article_soup, 'span', 'year')
    month = _element_text(article_soup, 'span', 'month')
    day = _element_text(article_soup, 'span', 'day')
    content = _element_text(article_soup, 'div', 'content-container')

    return author, year, month, day, content


def _element_text(soup, tag_name, css_class):
    """Return the stripped text of the first matching element, or None if absent."""
    node = soup.find(tag_name, class_=css_class)
    # find() returns None when nothing matches; guard before calling get_text().
    if node is None:
        return None
    return node.get_text().strip()

In [None]:
import logging

# Scrape driver.
# Phase 1 walks the listing pages and gathers every article title/link.
# Phase 2 visits each article, keeps the ones that yield data, and periodically
# checkpoints everything gathered so far to a CSV file.

logging.basicConfig(filename='scraping_log.txt', level=logging.DEBUG)

OUTPUT_CSV = "article_contents_3iteration_2round.csv"
REQUEST_DELAY_SECONDS = 3   # pause after every request
CHECKPOINT_EVERY = 10       # write the CSV every N articles (and after the last one)

all_article_titles = []
all_article_links = []
all_article_authors = []
all_article_years = []
all_article_contents = []
# Title/link of each article that was actually scraped. These grow in lockstep
# with the author/year/content lists, so every CSV row describes one article
# even when earlier articles were skipped.
scraped_article_titles = []
scraped_article_links = []

# Phase 1: listing pages.
for url in urls:
    try:
        page_result = get_article_links(url)
    except Exception as e:
        logging.error(f"Error fetching listing page {url}: {e}")
        page_result = None

    if page_result is None:
        # A failed page must not abort the crawl of the remaining pages.
        logging.warning(f"Skipping listing page {url}: no articles returned")
    else:
        page_article_titles, page_article_links = page_result
        all_article_titles.extend(page_article_titles)
        all_article_links.extend(page_article_links)
    sleep(REQUEST_DELAY_SECONDS)

# Phase 2: individual articles.
article_queue = list(zip(all_article_titles, all_article_links))
last_index = len(article_queue) - 1

for i, (article_title, article_link) in enumerate(article_queue):
    try:
        article_info = scrape_article_content(article_link)
        # The scraper signals failure with a tuple of Nones, so a length check
        # alone cannot tell success from failure; require at least one value.
        if (
            article_info is not None
            and len(article_info) >= 5
            and any(field is not None for field in article_info)
        ):
            scraped_article_titles.append(article_title)
            scraped_article_links.append(article_link)
            all_article_authors.append(article_info[0])
            all_article_years.append(article_info[1])
            all_article_contents.append(article_info[4])
        else:
            logging.warning(f"Skipping article {i}: Invalid data")
    except Exception as e:
        logging.error(f"Error at article {i}: {e}")

    # The checkpoint lives outside the scrape try-block so that a failure on
    # the final article cannot prevent the last save.
    if i % CHECKPOINT_EVERY == 0 or i == last_index:
        try:
            data_df = pd.DataFrame({
                'title': scraped_article_titles,
                'link': scraped_article_links,
                'author': all_article_authors,
                'year': all_article_years,
                'content': all_article_contents
            })
            data_df.to_csv(OUTPUT_CSV, index=False)
            logging.info(f"Saved articles up to index {i}")
        except Exception as e:
            logging.error(f"Error saving checkpoint at article {i}: {e}")

    sleep(REQUEST_DELAY_SECONDS)