In [1]:
import logging
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to extract content from a blog URL
def extract_blog_content(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract author name
            author_div = soup.find('div', class_='surfaceProfile-author-description')
            if author_div:
                author_name_tag = author_div.find('a')
                author_name = author_name_tag.get_text(strip=True) if author_name_tag else ''
            else:
                author_name = ''
            
            # Extract related topics
            related_topics_div = soup.find('div', class_='col-xs-12 col-sm-8 col-sm-offset-1 col-md-12 col-md-offset-0 col-lg-offset-0 default-style')
            if related_topics_div:
                related_topics_tags = related_topics_div.find_all('a')
                related_topics = ', '.join(tag.get_text(strip=True) for tag in related_topics_tags)
            else:
                related_topics = ''
            
            # Extract page content
            def extract_markdown_content(div):
                markdown_content = []
                for element in div:
                    if element.name == 'h1':
                        markdown_content.append(f"# {element.get_text(strip=True)}")
                    elif element.name == 'h2':
                        markdown_content.append(f"## {element.get_text(strip=True)}")
                    elif element.name == 'p':
                        markdown_content.append(f"{element.get_text(strip=True)}")
                    elif element.name == 'ul':
                        for li in element.find_all('li', recursive=False):
                            markdown_content.append(f"- {li.get_text(strip=True)}")
                    elif element.name == 'li':
                        markdown_content.append(f"- {element.get_text(strip=True)}")
                return "\n".join(markdown_content)
            
            main_div = soup.find('div', class_='optional-components paragraphSystem')
            if main_div:
                content_divs = main_div.find_all('div', class_='richText component section richText-copy-block col-xs-12')
                markdown_content = []
                for content_div in content_divs:
                    inner_div = content_div.find('div', class_='component-content')
                    if inner_div:
                        rich_text_div = inner_div.find('div', class_='richText-content')
                        if rich_text_div:
                            markdown_content.append(extract_markdown_content(rich_text_div.children))
                
                page_content = "\n\n".join(markdown_content)
            else:
                page_content = ''
            
            # Extract unique PDF links
            pdf_links = set()
            pdf_divs = soup.find_all('div', class_='fileList-download')
            for pdf_div in pdf_divs:
                pdf_link_tag = pdf_div.find('a', class_='fileList-download-link')
                if pdf_link_tag:
                    pdf_link = pdf_link_tag.get('href')
                    if pdf_link:
                        pdf_links.add(pdf_link)
            
            return author_name, related_topics, list(pdf_links), page_content
        else:
            return '', '', [], ''
    except Exception as e:
        return '', '', [], ''

# Read CSV file
input_file = 'sitemap_data.csv'
output_file = 'blogs_data_500.csv'
df = pd.read_csv(input_file)

# Iterate through each website in the CSV and extract data.
# Results are collected first and attached as whole columns afterwards, so
# the output always has the same four columns (even for an empty input)
# instead of growing the DataFrame one cell at a time.
records = []
for website_url in df['Website']:
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    author_name, related_topics, pdf_links, page_content = extract_blog_content(website_url)
    print(time.perf_counter() - start_time)
    # pdf_links is a list; join it into a single string for the CSV cell.
    records.append((author_name, related_topics, ', '.join(pdf_links), page_content))

# Store extracted data in the DataFrame, one column per extracted field.
result_columns = ['author_name', 'related_topics', 'pdf_links', 'page_content']
for position, column in enumerate(result_columns):
    df[column] = [record[position] for record in records]

# Save the updated DataFrame to a new CSV file
df.to_csv(output_file, index=False)
print("Content extraction complete. Data saved to:", output_file)

1.116750717163086
1.9483673572540283
1.7466766834259033
1.1010446548461914
1.0025506019592285
1.0173020362854004
1.7144320011138916
1.7028050422668457
1.0172526836395264
1.895183801651001
0.8218324184417725
0.8294682502746582
1.2912945747375488
0.9922006130218506
1.033578872680664
1.6947636604309082
1.237975835800171
1.7959012985229492
1.8010694980621338
0.8709752559661865
1.7167997360229492
1.1964993476867676
1.8403661251068115
1.8671514987945557
0.954275369644165
1.719684362411499
0.9095790386199951
1.140958309173584
1.094181776046753
1.194443702697754
0.9561307430267334
1.6822230815887451
1.7614684104919434
0.9520959854125977
1.7609405517578125
13.359940528869629
1.9563071727752686
1.203094720840454
1.7528095245361328
1.7898762226104736
0.840017557144165
1.708045244216919
1.8394908905029297
1.7340612411499023
1.7185873985290527
1.8897607326507568
1.4442863464355469
1.7238855361938477
1.7295401096343994
1.8931655883789062
20.206456422805786
20.01771879196167
1.7380180358886719
0.8448