In [None]:
!pip install python-docx

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from io import BytesIO
from docx.shared import Inches, Pt
from docx import Document

In [None]:
# Fetch the site's front page and collect its <article> elements, which the
# scraping loop in a later cell iterates over.
url = "https://nasvete.com/"
# requests never times out unless told to; without this an unresponsive
# server would hang the cell indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page and
# silently ending up with an empty list of articles.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
articles = soup.find_all('article')

In [None]:
# Create a new python-docx Document; the helper functions and the scraping
# loop in the cells below all append to this one module-level object.
doc = Document()

In [None]:
def add_heading(text):
    """Append a level-1 heading containing `text` to the module-level `doc`.

    Every run of the new heading is given an 18 pt font size.

    Args:
        text: Heading text; may be empty.

    Returns:
        None.
    """
    heading = doc.add_heading(text, level=1)
    # python-docx creates no run when `text` is empty, so indexing `runs[0]`
    # would raise IndexError; looping (as add_paragraph does) covers that case.
    for run in heading.runs:
        run.font.size = Pt(18)

def add_paragraph(text):
    """Append `text` to the module-level `doc` as a paragraph in 12 pt type.

    Args:
        text: Paragraph text.

    Returns:
        None.
    """
    # Font size is a per-run property, so apply it to each run of the new
    # paragraph.
    for text_run in doc.add_paragraph(text).runs:
        text_run.font.size = Pt(12)


In [None]:
def add_image_from_url(doc, image_url, width=6, timeout=30):
    """Download an image and append it to `doc` as a picture.

    Args:
        doc: Document-like object exposing `add_picture(stream, width=...)`.
        image_url: URL of the image to download.
        width: Picture width in inches.
        timeout: Seconds to wait on the server before giving up; without a
            timeout requests would wait indefinitely.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Stop on an HTTP error; otherwise the error page's bytes would be handed
    # to add_picture as if they were image data.
    response.raise_for_status()
    image_data = BytesIO(response.content)
    doc.add_picture(image_data, width=Inches(width))

def add_all_images(html_content):
    """Copy the captioned images found in `html_content` into `doc`.

    Searches the parsed HTML for <div> elements matched by
    class_='wp-caption alignnone'. For each one holding an <img> with a
    `src` attribute, the picture is appended via add_image_from_url, followed
    by the text of its <p class="wp-caption-text"> caption when present, and
    then an empty paragraph as a spacer.

    Args:
        html_content: HTML markup of an article page.

    Returns:
        None.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    for wrapper in parsed.find_all('div', class_='wp-caption alignnone'):
        picture = wrapper.find('img')
        # Wrappers without a usable <img src=...> contribute nothing.
        if not picture or 'src' not in picture.attrs:
            continue

        add_image_from_url(doc, picture['src'])

        caption = wrapper.find('p', class_='wp-caption-text')
        if caption:
            doc.add_paragraph(caption.text)

        # Empty paragraph separates this image from whatever follows it.
        doc.add_paragraph()


In [None]:
# Walk every <article> preview from the front page: write its title and teaser
# text into the document, then follow its "more-link" anchor to pull in the
# images of the full article.
for article in tqdm(articles):

    title_tag = article.find('h2')
    body_tag = article.find(class_="text_box")
    if title_tag is None or body_tag is None:
        # Not shaped like a post preview; skip it rather than crash with
        # AttributeError and lose the document built so far.
        continue

    title = title_tag.get_text()  # title of the article
    # NOTE(review): the slice drops the last 13 characters of the teaser,
    # presumably the trailing "read more" link text -- confirm against the
    # live markup.
    para = body_tag.get_text().strip()[:-13]

    # add both to the doc file
    add_heading(title)
    add_paragraph(para)

    # find the "more-link" anchor and fetch that page to extract its images
    more_link = article.find('a', class_="more-link")
    link = more_link.get('href') if more_link is not None else None
    if link:
        # Timeout so one unresponsive article page cannot hang the whole run.
        content = requests.get(link, timeout=30)
        # Only parse successful responses; an error page has no article images.
        if content.ok:
            add_all_images(content.text)

    # extra line for spacing after each article
    doc.add_paragraph()

100%|██████████| 7/7 [01:40<00:00, 14.34s/it]


In [None]:
doc.save('upwork_document.docx')     # write the assembled document to a .docx file (path is relative to the working directory)