In [1]:
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta


# Listing page that the cells below scrape for article cards.
URL = "https://fijivillage.com/news"

# requests has no default timeout, so a stalled connection would hang the
# kernel indefinitely; raise_for_status() stops an HTTP error page from being
# parsed as if it were the news listing.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Parsed document searched by the later cells (find_all on divs, ids, h3/h6).
wholepage = BeautifulSoup(page.content, "html.parser")


In [10]:
def extractinfo(divdata):
    """Pull the headline fields out of a single news-card element.

    Parameters
    ----------
    divdata
        A card element (or its HTML text). It is converted with ``str()`` and
        re-parsed, so every lookup below only sees this one card.

    Returns
    -------
    dict or None
        Dict with the keys ``title``, ``summary``, ``article_url``,
        ``publish_time`` and ``image_url``. ``publish_time`` is the raw text
        of the first ``<span>`` in the card (not converted to a datetime).
        Any field whose element is absent from the card is ``None``.
        ``None`` is returned only when parsing raises; the error is printed.
    """
    try:
        # Name the parser explicitly: omitting it makes bs4 pick whichever
        # parser is installed (and warn), so results could differ by machine.
        soup = BeautifulSoup(str(divdata), "html.parser")

        # Article URL comes from the first link that has an href; the title
        # from the first <h6>.
        article_link = soup.find('a', href=True)
        article_url = article_link['href'] if article_link is not None else None
        title_tag = soup.find('h6')
        article_title = title_tag.text.strip() if title_tag is not None else None

        # Image URL from the first <img>.
        img_tag = soup.find('img')
        image_url = img_tag.get('src') if img_tag is not None else None

        # A card without a <span> keeps its other fields instead of being
        # discarded by the except clause below.
        time_span = soup.find('span')
        publish_time = time_span.get_text() if time_span is not None else None

        # The summary is whatever node immediately precedes the "clear20"
        # spacer div; both steps are guarded for the same reason as above.
        spacer_div = soup.find('div', class_='clear20')
        preceding = spacer_div.previous_element if spacer_div is not None else None
        summary = preceding.get_text().strip() if preceding is not None else None

        return {
            'title': article_title,
            'summary': summary,
            'article_url': article_url,
            'publish_time': publish_time,
            'image_url': image_url,
        }
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip it.
        print(f"Error parsing HTML snippet: {e}")
        return None

In [16]:
newsdivs = wholepage.find_all("div", class_="col-md-4 pt-2")

# extractinfo() returns None for a card it could not parse; keep only the
# real records so later code never has to deal with None entries.
parsed_cards = (extractinfo(divdata=div) for div in newsdivs)
news = [card for card in parsed_cards if card is not None]

# Guard the preview: news[0] would raise IndexError if no card matched
# (for example after a change in the site's markup).
print(len(news), news[0] if news else None)

104 {'title': 'Rubbish piles up like a seawall – Nasinu residents', 'summary': 'Residents of Nasinu are raising concerns about white goods and green waste not being collected by th', 'article_url': 'https://www.fijivillage.com/news/Rubbish-piles-up-like-a-seawall--Nasinu-residents-rf58x4/', 'publish_time': '33 minutes ago', 'image_url': 'https://www.fijivillage.com/news_images/118733207366de39150a77e541e59b.jpg'}


In [2]:
def parse_relative_time(relative_time_str):
    """Convert a phrase such as "33 minutes ago" into a ``timedelta``.

    The phrase may appear anywhere in the string (leading text or whitespace
    is tolerated) and is matched case-insensitively. The amount is either a
    run of digits or the article "a"/"an", which counts as 1. Recognised
    units, matched by prefix so plurals work: seconds, minutes, hours (also
    "hr"), days and weeks (also "wk").

    Parameters
    ----------
    relative_time_str : str
        Text containing "<amount> <unit> ago".

    Returns
    -------
    datetime.timedelta
        The parsed offset, or a zero ``timedelta`` when the text does not
        contain a recognised "<amount> <unit> ago" phrase.
    """
    # Unit prefix -> timedelta keyword argument.
    unit_keywords = (
        ('sec', 'seconds'),
        ('min', 'minutes'),
        ('hour', 'hours'),
        ('hr', 'hours'),
        ('day', 'days'),
        ('week', 'weeks'),
        ('wk', 'weeks'),
    )

    # search() rather than match(): the phrase need not start the string.
    match = re.search(
        r'\b(\d+|an?)\s+(\w+)\s+ago\b', relative_time_str, re.IGNORECASE
    )
    if not match:
        return timedelta()

    amount_text, unit = match.groups()
    # "a"/"an" is the only non-numeric amount the pattern admits.
    amount = int(amount_text) if amount_text.isdigit() else 1
    unit = unit.lower()

    for prefix, keyword in unit_keywords:
        if unit.startswith(prefix):
            return timedelta(**{keyword: amount})

    # Unknown unit (e.g. months): no offset can be derived.
    return timedelta()

def parse_news_article(html_snippet, position):
    """Extract one article record from a card whose ids embed ``position``.

    Parameters
    ----------
    html_snippet
        The card element (or its HTML text); it is converted with ``str()``
        and re-parsed on its own.
    position
        The number used in the card's element ids: the tags container is
        looked up as ``position{position}tags`` and the time span as
        ``position{position}time``.

    Returns
    -------
    dict or None
        Dict with the keys ``title``, ``summary``, ``article_url``,
        ``publish_time``, ``image_url`` and ``tags``. ``publish_time`` is a
        naive ``datetime`` computed as ``datetime.now()`` minus the parsed
        relative time, or ``None`` when the time span is missing. Fields
        whose elements are missing are ``None`` (``tags`` is ``[]``).
        ``None`` is returned only when parsing raises; the error is printed.
    """
    try:
        # Name the parser explicitly: omitting it makes bs4 pick whichever
        # parser is installed (and warn), so results could differ by machine.
        soup = BeautifulSoup(str(html_snippet), "html.parser")

        # Article URL comes from the first link that has an href; the title
        # from the first <h6>.
        article_link = soup.find('a', href=True)
        article_url = article_link['href'] if article_link is not None else None
        title_tag = soup.find('h6')
        article_title = title_tag.text.strip() if title_tag is not None else None

        # Image URL from the first <img>.
        img_tag = soup.find('img')
        image_url = img_tag.get('src') if img_tag is not None else None

        # Summary: from the tags div, step back one element and then to the
        # nearest preceding <div>. Each hop is checked so that a missing node
        # leaves summary as None instead of discarding the whole article.
        tags_div = soup.find('div', id=f'position{position}tags')
        summary = None
        if tags_div is not None:
            element_before_tags = tags_div.find_previous()
            summary_div = (
                element_before_tags.find_previous('div')
                if element_before_tags is not None
                else None
            )
            if summary_div is not None:
                summary = summary_div.text.strip()

        # Relative time ("... ago") converted to an absolute timestamp.
        # parse_relative_time returns a zero delta for text it cannot parse,
        # in which case publish_time is simply the current time.
        time_tag = soup.find('span', id=f'position{position}time')
        if time_tag is not None:
            relative_time = time_tag.text.strip()
            publish_time = datetime.now() - parse_relative_time(relative_time)
        else:
            publish_time = None

        # Tag names are the link texts inside the tags div.
        tags = [a.text for a in tags_div.find_all('a')] if tags_div is not None else []

        return {
            'title': article_title,
            'summary': summary,
            'article_url': article_url,
            'publish_time': publish_time,
            'image_url': image_url,
            'tags': tags
        }
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip it.
        print(f"Error parsing HTML snippet: {e}")
        return None

In [3]:
# Cards are the elements whose id ends in "position<N>"; the group captures N.
POSITION_ID = re.compile(r"position(\d+)$")

positionelements = wholepage.find_all(id=POSITION_ID)
# Show the count and one sample card instead of dumping every element.
print(len(positionelements), positionelements[:1])
newslist = []

for div in positionelements:
    # Take N from the card's own id rather than its index in the list: the
    # inner ids ("position<N>tags", "position<N>time") use that same N, and
    # the ids do not start at 0, so an enumerate() index would not match.
    position = int(POSITION_ID.search(div["id"]).group(1))
    newslist.append(parse_news_article(div, position))
print(len(newslist),newslist)

[<div class="col-md-3 pt-2" id="position3">
<div class="d-flex align-items-start flex-column h-100" style="background-color: #fafafa;">
<div class="mb-auto">
<a href="https://www.fijivillage.com/news/Man-in-his-40s-dies-after-accident-at-Laqere-Bridge-r548fx/"><img class="img-responsive" src="https://www.fijivillage.com/news_images/212998775166de23100ee0a9fff925.jpg" style="border-radius:10px;" width="100%"/></a>
<div class="clear10"></div>
<a href="https://www.fijivillage.com/news/Man-in-his-40s-dies-after-accident-at-Laqere-Bridge-r548fx/"><h6>Man in his 40’s dies after accident at Laqere Bridge</h6></a>
</div>
<div>
		            A man in his 40’s has died following an accident at the Laqere Bridge in Nasinu yesterday afternoo...			         <div class="clear20"></div>
<div class="tags" id="position3tags">
<a class="border border-info rounded-pill px-1" href="https://www.fijivillage.com/tags/Police/">Police</a>
<a class="border border-info rounded-pill px-1" href="https://www.fijivil

In [4]:
# Collect both heading levels first, then report each as "<count> <tags>".
featured = wholepage.find_all("h3")
results = wholepage.find_all("h6")

for headings in (featured, results):
    print(len(headings), headings)

5 [<h3> <a href="https://www.fijivillage.com/news/Rubbish-piles-up-like-a-seawall--Nasinu-residents-rf58x4/">
				    	Rubbish piles up like a seawall – Nasinu residents
				    	</a>
</h3>, <h3><a href="https://www.fijivillage.com/sports/Bulldogs-enforcer-Kikau-escapes-suspension-with-early-guilty-plea-rx58f4/">Bulldogs enforcer Kikau escapes suspension with early guilty plea</a></h3>, <h3><a href="https://www.fijivillage.com/entertainment/Deepika-and-Ranveer-blessed-with-baby-girl-r8x54f/">Deepika and Ranveer blessed with baby girl</a></h3>, <h3><a href="https://www.fijivillage.com/fashion/Pharrells-Joopiter-puts-on-sale-of-K-pop-star-G-Dragons-prized-possessions-4xfr58/">Pharrell’s Joopiter puts on sale of K-pop star G-Dragon’s prized possessions</a></h3>, <h3><a href="https://www.fijivillage.com/business/Customers-can-now-shop-at-Kasabias-using-the-Sole-app-5fxr84/">Customers can now shop at Kasabias using the Sole app</a></h3>]
40 [<h6>Women are equal partners in a country’s quest

In [5]:
def _hop(node, direction, count):
    """Follow the ``direction`` attribute of ``node`` ``count`` times in a row."""
    for _ in range(count):
        node = getattr(node, direction)
    return node


# For every <h6>: its parent supplies the href, the node four elements before
# that parent supplies the image src, and the node five elements after the
# heading is printed on the unlabeled line.
for res in results:
    print("Title : ", res.text)
    link = res.parent
    print("Link : ", link.get('href'))
    image = _hop(link, 'previous_element', 4)
    print("Image : ", image.get('src'))
    print(" : ", _hop(res, 'next_element', 5))
    print("\n")

Title :  Women are equal partners in a country’s quest for economic and social progress – Prof. Prasad
Link :  https://www.fijivillage.com/news/Women-are-equal-partners-in-a-countrys-quest-for-economic-and-social-progress--Prof-Prasad-rxf584/
Image :  https://www.fijivillage.com/news_images/64736923466de3bc6b3a123139b0d7.jpg
 :  
			        Women are equal partners in a country’s quest for economic and social progress.

Deputy Prime Mi...			        


Title :  Man in his 40’s dies after accident at Laqere Bridge
Link :  https://www.fijivillage.com/news/Man-in-his-40s-dies-after-accident-at-Laqere-Bridge-r548fx/
Image :  https://www.fijivillage.com/news_images/212998775166de23100ee0a9fff925.jpg
 :  
		            A man in his 40’s has died following an accident at the Laqere Bridge in Nasinu yesterday afternoo...			         


Title :   Archbishop Peter Loy Chong meets Pope Francis in PNG
Link :  https://www.fijivillage.com/news/-Archbishop-Peter-Loy-Chong-meets-Pope-Francis-in-PNG-5rfx