<a href="https://colab.research.google.com/github/hjshreya/N-360/blob/main/webScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import requests
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser

pd.set_option('display.max_colwidth', None)

class WebScraper:
    """Fetch one RSS feed, flatten its items into rows and store them in Excel.

    Typical use is ``getHTML`` -> ``readRSS`` -> ``create_df`` ->
    ``update_excelSheet``.
    """

    # Columns that together identify one article.  The publish timestamp alone
    # is not unique: two different articles published in the same second would
    # be collapsed into one row, so the article URL is part of the key.
    DEDUP_KEYS = ("url", "date-time")

    # Seconds to wait for the feed server; without a timeout ``requests`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, rss_url):
        """Remember the URL of the RSS feed to scrape."""
        self.url = rss_url

    # 1. Fetch RSS/XML
    def getHTML(self):
        """Download the feed at ``self.url`` and return it parsed as XML.

        Raises:
            requests.RequestException: on connection errors or timeout.
        """
        page = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(page.content, "xml")

    def clean_text(self, html_text):
        """Strip markup from *html_text* and collapse all whitespace runs.

        Returns ``None`` for empty/``None`` input, otherwise the visible text
        with every run of whitespace (including line breaks) replaced by a
        single space and the ends trimmed.
        """
        if not html_text:
            return None

        soup = BeautifulSoup(html_text, "html.parser")
        text = soup.get_text(separator=" ")
        return re.sub(r"\s+", " ", text).strip()

    @staticmethod
    def _tag_text(tag):
        """Return the stripped text of *tag*, or ``None`` when the tag is absent."""
        if tag is None:
            return None
        return tag.text.strip()

    # 2. Read RSS items
    def readRSS(self, soup):
        """Convert every ``<item>`` of the parsed feed into a dict.

        Args:
            soup: parsed feed as returned by ``getHTML``.

        Returns:
            A list with one dict per item, holding the keys ``title``,
            ``date-time``, ``description``, ``url``, ``author``, ``guid``,
            ``category`` (list of strings), ``image``, ``comments`` and
            ``content``.  Fields missing from the feed are ``None``.
        """
        data = []

        for item in soup.find_all("item"):
            # Matches both plain <category> and prefixed variants whose tag
            # name ends in "category".
            category_elements = item.find_all(
                lambda tag: tag.name and tag.name.endswith("category")
            )
            categories = [cat.text.strip() for cat in category_elements]

            content_encoded = item.find("content:encoded")
            content = (
                self.clean_text(content_encoded.text) if content_encoded else None
            )

            # The image URL is taken from an <img src="..."> nested in <image>.
            image = None
            image_tag = item.find("image")
            if image_tag:
                img_tag = image_tag.find("img")
                if img_tag and img_tag.get("src"):
                    image = img_tag["src"]

            data.append({
                "title": self._tag_text(item.title),
                "date-time": self.format_time(self._tag_text(item.pubDate)),
                "description": self._tag_text(item.description),
                "url": self._tag_text(item.link),
                "author": self._tag_text(item.find("dc:creator")),
                "guid": self._tag_text(item.find("guid")),
                "category": categories,
                "image": image,
                "comments": self._tag_text(item.find("comments")),
                "content": content,
            })

        return data

    # 3. Format date/time
    def format_time(self, publish_DateTime):
        """Reformat a feed timestamp as ``DD-MM-YYYY HH:MM:SS``.

        Returns ``None`` when the value is missing or cannot be parsed, so a
        single malformed ``pubDate`` does not abort the whole feed.
        """
        if not publish_DateTime:
            return None

        try:
            dt = parser.parse(publish_DateTime)
        except (ValueError, OverflowError):
            return None
        return dt.strftime("%d-%m-%Y %H:%M:%S")

    def _drop_duplicate_rows(self, df):
        """Return *df* without rows that repeat an already seen article.

        Only the key columns actually present are used, so an empty frame
        (a feed without items has no columns at all) passes through instead
        of raising ``KeyError``.
        """
        keys = [key for key in self.DEDUP_KEYS if key in df.columns]
        if not keys:
            return df
        return df.drop_duplicates(subset=keys)

    # 4. Create dataframe
    def create_df(self, data):
        """Build a de-duplicated DataFrame from the dicts made by ``readRSS``."""
        return self._drop_duplicate_rows(pd.DataFrame(data))

    # 5. Update or create excel file
    def update_excelSheet(self, df, excel_file="RSS_data.xlsx"):
        """Merge *df* into *excel_file*, creating the file when it is missing.

        Rows already stored in the workbook are kept; rows of *df* that
        repeat a stored article are dropped before the file is rewritten.
        """
        if os.path.exists(excel_file):
            existing_df = pd.read_excel(excel_file)
            updated_df = pd.concat([existing_df, df], ignore_index=True)
            updated_df = self._drop_duplicate_rows(updated_df)
        else:
            updated_df = df

        updated_df.to_excel(excel_file, index=False)
        print(f"Excel file is updated: {excel_file}")

In [63]:
# Feed names to scrape; each one is appended to the feed base URL.
domains = [
    "home", "latest", "analysis", "politics", "cricket", "movies",
    "health", "style", "pakistan", "sindh", "punjab", "balochistan",
    "khyber-pakhtunkhwa", "jammu-kashmir", "gilgit-baltistan",
    "business", "world", "sports", "tech", "games", "talko", "gadget",
    "life-style", "art-books", "music", "film", "fashion", "gossip",
    "tv", "theatre", "entertainment", "opinion", "editorial", "blogs",
]
# Number of feeds expected, and how many have been processed so far.
size = len(domains)
count_current = 0
def get_rss_url(domain):
    """Return the feed URL for *domain*, ignoring surrounding whitespace."""
    feed_name = domain.strip()
    return "https://tribune.com.pk/feed/" + feed_name

# Google Drive path of the workbook that accumulates the scraped rows.
excel_file = "/content/drive/MyDrive/RSS_data.xlsx"

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Scrape every feed independently: a failure in one domain (network error,
# malformed feed, ...) is reported and recorded instead of aborting the
# remaining domains.
failed_domains = []
for domain in domains:
    print(f"Scraping domain: {domain}")
    try:
        rss_url = get_rss_url(domain)
        scraper = WebScraper(rss_url)
        soup = scraper.getHTML()
        data = scraper.readRSS(soup)
        df = scraper.create_df(data)

        scraper.update_excelSheet(df, excel_file)
    except Exception as exc:  # top-level boundary: log and move on
        print(f"Failed to scrape domain {domain}: {exc!r}")
        failed_domains.append(domain)
        continue

    # Only successfully stored domains are counted.
    count_current += 1
    print(f"Count domain: {count_current}")

if size == count_current:
    print("All " + str(size) + " domains scraped.")
else:
    print(f"Error: {len(failed_domains)} of {size} domains failed: {failed_domains}")

Mounted at /content/drive
Scraping domain: home
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 1
Scraping domain: latest
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 2
Scraping domain: analysis
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 3
Scraping domain: politics
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 4
Scraping domain: cricket
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 5
Scraping domain: movies
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 6
Scraping domain: health
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 7
Scraping domain: style
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 8
Scraping domain: pakistan
Excel file is updated: /content/drive/MyDrive/RSS_data.xlsx
Count domain: 9
Scraping domain: sindh
Excel file is updated: /content/drive/MyDrive/R