<a href="https://colab.research.google.com/github/hjshreya/N-360/blob/main/webScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser

class WebScraper:
    """Fetch an RSS feed and persist its items to an Excel sheet.

    Typical flow: ``getHTML`` -> ``readRSS`` -> ``create_df`` ->
    ``update_excelSheet``.
    """

    # Keys of the row dicts produced by ``readRSS``; also used so that an
    # empty feed still yields a DataFrame with the expected columns.
    COLUMNS = ("title", "date", "description", "url")

    # Columns that together identify one feed entry when de-duplicating.
    DEDUP_KEYS = ["title", "url"]

    def __init__(self, rss_url):
        """Remember the URL of the RSS feed to scrape."""
        self.url = rss_url

    # 1. Fetch RSS/XML
    def getHTML(self, timeout=30):
        """Download the feed and return it parsed as XML.

        Args:
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A ``BeautifulSoup`` document built with the ``"xml"`` parser.

        Raises:
            requests.RequestException: On connection problems, timeouts or
                a non-2xx HTTP status.
        """
        page = requests.get(self.url, timeout=timeout)
        # Fail loudly on an error page instead of parsing it as an empty feed.
        page.raise_for_status()
        return BeautifulSoup(page.content, "xml")

    @staticmethod
    def _child_text(item, tag_name):
        """Return the text of the first ``tag_name`` element in ``item``, or None."""
        node = item.find(tag_name)
        return node.text if node is not None else None

    # 2. Read RSS items
    def readRSS(self, soup):
        """Turn every ``<item>`` of the parsed feed into a row dict.

        Args:
            soup: Parsed feed, as returned by ``getHTML``.

        Returns:
            A list of dicts with the keys ``title``, ``date``,
            ``description`` and ``url``. A field whose element is missing
            from the item is ``None`` rather than raising.
        """
        return [
            {
                "title": self._child_text(item, "title"),
                "date": self.format_time(self._child_text(item, "pubDate")),
                "description": self._child_text(item, "description"),
                "url": self._child_text(item, "link"),
            }
            for item in soup.find_all("item")
        ]

    # 3. Format date/time
    def format_time(self, pub_date):
        """Normalise a feed date string to ``YYYY-MM-DD HH:MM:SS``.

        Args:
            pub_date: Raw date text from the feed; may be empty or ``None``.

        Returns:
            The formatted timestamp, or ``None`` when ``pub_date`` is empty
            or cannot be parsed.
        """
        if not pub_date:
            return None
        try:
            dt = parser.parse(pub_date)
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # Best effort: one malformed date must not abort the whole scrape.
            return None

    # 4. Create dataframe
    def create_df(self, data):
        """Build a DataFrame from the row dicts, without duplicate entries.

        Args:
            data: Row dicts as produced by ``readRSS``.

        Returns:
            A ``pandas.DataFrame`` in which each (title, url) pair occurs
            once. An empty ``data`` gives an empty frame that still has the
            ``COLUMNS`` columns.
        """
        df = pd.DataFrame(data)
        if df.columns.empty:
            # Without columns, drop_duplicates(subset=...) raises KeyError.
            df = pd.DataFrame(columns=list(self.COLUMNS))
        return df.drop_duplicates(subset=self.DEDUP_KEYS)

    # 5. Update or create excel file
    @staticmethod
    def update_excelSheet(df, excel_file="RSS_Sheet.xlsx"):
        """Merge ``df`` into the Excel file, creating the file if needed.

        Declared as a static method so it can be called on the class or on
        an instance.

        Args:
            df: New rows to store; must have ``title`` and ``url`` columns.
            excel_file: Path of the workbook to update or create.
        """
        if os.path.exists(excel_file):
            existing_df = pd.read_excel(excel_file)
            updated_df = pd.concat([existing_df, df], ignore_index=True)
            # Existing rows come first, so they win over re-scraped copies.
            updated_df = updated_df.drop_duplicates(subset=WebScraper.DEDUP_KEYS)
        else:
            updated_df = df

        updated_df.to_excel(excel_file, index=False)
        print(f"Excel file updated: {excel_file}")





In [22]:
# Driver cell: scrape one RSS feed and merge its items into an Excel sheet
# stored on Google Drive.
rss_url = "https://tribune.com.pk/feed/analysis"
scraper = WebScraper(rss_url)

# Fetch the feed, extract its items and build a de-duplicated DataFrame.
soup = scraper.getHTML()
data = scraper.readRSS(soup)
df = scraper.create_df(data)

# Mount Google Drive so the path under /content/drive used below is available.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Merge the scraped rows into the sheet (created if it does not exist yet).
excel_file = "/content/drive/MyDrive/RSS_Sheet.xlsx"
WebScraper.update_excelSheet(df, excel_file)

Mounted at /content/drive
Excel file updated: /content/drive/MyDrive/RSS_Sheet.xlsx
