# Homework-2: SparkSQL and SQL on PostgreSQL

Haiya Niraj Shah \
Andrew id - haiyas

# Prerequisites from previous questions

In [None]:
#Setup Spark
import findspark
findspark.init()
findspark.find()

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext
import requests
import xml.etree.ElementTree as ET
# Spark configuration: local master, application name "NewsData",
# and the driver host set to the loopback address.
conf = (
    pyspark.SparkConf()
    .setMaster("local")
    .setAppName("NewsData")
    .set('spark.driver.host', '127.0.0.1')
)

# Reuse a running SparkContext when there is one; otherwise start it from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

# SQLContext wraps the context; the SparkSession obtained through it is the
# `spark` handle used by the later cells.
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession.builder.getOrCreate()

In [None]:
#Database configuration
import os
import warnings


def load_db_config(env=None):
    """Build the JDBC connection settings used by the Spark reader and writer.

    Credentials are read from the environment rather than written into the
    notebook, so the password is never saved or shared with the file.

    Args:
        env: Mapping to read settings from; defaults to ``os.environ``.
            ``PGUSER`` overrides the user name (default ``"postgres"``) and
            ``PGPASSWORD`` supplies the password.

    Returns:
        dict with the keys ``username``, ``password``, ``url``, ``table`` and
        ``driver``.
    """
    if env is None:
        env = os.environ

    password = env.get("PGPASSWORD", "")
    if not password:
        # Do not fail here: warn, and let the database reject the connection
        # if it actually requires a password.
        warnings.warn(
            "PGPASSWORD is not set; connecting to PostgreSQL with an empty password. "
            "Set it before starting the kernel."
        )

    return {
        'username': env.get("PGUSER", "postgres"),
        'password': password,
        'url': "jdbc:postgresql://localhost:5432/postgres",
        'table': "news.google_newsFeed",
        'driver': "org.postgresql.Driver",
    }


db_config = load_db_config()

In [None]:
#Fetching data
# Google News RSS search feed for the query "technology".
rss_url= "https://news.google.com/rss/search?q=technology&hl=en-US&gl=US&ceid=US:en"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(rss_url, timeout=30)

# Stop here on an HTTP error instead of handing an error page to the XML parser.
response.raise_for_status()

# Raw bytes of the feed; parsed by the next cell.
xml_data = response.content

In [None]:
#Standard date conversion
from datetime import datetime

# strptime layout of the feed's timestamps (weekday, day, month name, year,
# time, zone name). NOTE(review): the zone name is presumably always "GMT";
# the parsed datetime carries no tzinfo either way.
RSS_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'


def parse_rss_date(text):
    """Convert an RSS timestamp string into a naive ``datetime``.

    Args:
        text: Timestamp text in ``RSS_DATE_FORMAT``, or ``None``/empty.

    Returns:
        The parsed ``datetime`` (no tzinfo attached), or ``None`` when the
        text is missing or does not match the expected layout.
    """
    if not text:
        return None
    try:
        return datetime.strptime(text, RSS_DATE_FORMAT)
    except ValueError:
        # Only a malformed timestamp is tolerated; other errors should surface.
        return None


def element_text(parent, tag):
    """Return the text of the first child ``tag`` of ``parent``.

    Returns ``None`` when ``parent`` is ``None`` or has no such child, so a
    missing element never raises ``AttributeError``.
    """
    child = parent.find(tag) if parent is not None else None
    return child.text if child is not None else None


root = ET.fromstring(xml_data)

# Feed-level build timestamp; the same value is attached to every news item.
channel = root.find('channel')
build_date_str = element_text(channel, 'lastBuildDate')
build_date = parse_rss_date(build_date_str)

#Extract news
# One dict per <item>; absent elements become None.
news_data = [
    {
        'lastBuildDate': build_date,
        'title': element_text(item, 'title'),
        'link': element_text(item, 'link'),
        'pubDate': parse_rss_date(element_text(item, 'pubDate')),
        'description': element_text(item, 'description'),
        'source': element_text(item, 'source'),
    }
    for item in root.findall('.//item')
]

print(f"Parsed {len(news_data)} news items")

Parsed 100 news items


In [None]:
#Inserting data
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

# Explicit schema: without it Spark infers types from the dicts, which fails
# when the feed is empty or when a column is None in every row (for example
# if no date could be parsed). Fields are listed alphabetically to keep the
# column order that inference from dicts produced.
news_schema = StructType([
    StructField("description", StringType(), True),
    StructField("lastBuildDate", TimestampType(), True),
    StructField("link", StringType(), True),
    StructField("pubDate", TimestampType(), True),
    StructField("source", StringType(), True),
    StructField("title", StringType(), True),
])

df = spark.createDataFrame(news_data, schema=news_schema)
df.show(3, truncate=True)

# mode("overwrite") replaces the existing contents of the target table.
(
    df.write.format("jdbc")
    .mode("overwrite")
    .option("url", db_config['url'])
    .option("dbtable", db_config['table'])
    .option("user", db_config['username'])
    .option("password", db_config['password'])
    .option("driver", db_config['driver'])
    .save()
)

# Reports the size of the DataFrame that was written.
print(f"Inserted {df.count()} records")

+--------------------+-------------------+--------------------+-------------------+------------------+--------------------+
|         description|      lastBuildDate|                link|            pubDate|            source|               title|
+--------------------+-------------------+--------------------+-------------------+------------------+--------------------+
|<a href="https://...|2025-09-18 17:42:32|https://news.goog...|2025-09-18 13:05:56|            GOV.UK|Memorandum of Und...|
|<a href="https://...|2025-09-18 17:42:32|https://news.goog...|2025-09-17 13:37:37|      Fox Business|Expert predicts A...|
|<a href="https://...|2025-09-18 17:42:32|https://news.goog...|2025-09-18 16:57:47|The Times of India|New technology la...|
+--------------------+-------------------+--------------------+-------------------+------------------+--------------------+
only showing top 3 rows
Inserted 100 records


# Function to find news from the last 24 hours

In [None]:
from datetime import datetime, timedelta, timezone


def utc_cutoff(hours=24, now=None):
    """Return the naive-UTC instant ``hours`` before ``now``.

    Args:
        hours: Size of the look-back window in hours.
        now: Reference instant. Defaults to the current time; a timezone-aware
            value is converted to UTC, a naive value is taken as already UTC.

    Returns:
        A ``datetime`` without tzinfo, expressed in UTC.
    """
    if now is None:
        now = datetime.now(timezone.utc)
    if now.tzinfo is not None:
        # Normalise to UTC, then drop tzinfo so the value is comparable with
        # the naive timestamps kept in the table.
        now = now.astimezone(timezone.utc).replace(tzinfo=None)
    return now - timedelta(hours=hours)


def find_recent_news(hours=24):
    """Load the news table over JDBC and keep the recently published rows.

    The cutoff is computed in UTC rather than from the local clock.
    NOTE(review): pubDate values are parsed from the feed's timestamps without
    tzinfo and appear to be GMT wall-clock times; comparing them with local
    ``datetime.now()`` shifts the window by the machine's UTC offset.

    Args:
        hours: Look-back window in hours (default 24).

    Returns:
        Spark DataFrame of rows whose ``pubDate`` is at or after the cutoff.
    """
    cutoff = utc_cutoff(hours)

    all_news = (
        spark.read.format("jdbc")
        .option("url", db_config['url'])
        .option("dbtable", db_config['table'])
        .option("user", db_config['username'])
        .option("password", db_config['password'])
        .option("driver", db_config['driver'])
        .load()
    )

    return all_news.filter(all_news["pubDate"] >= cutoff)

In [None]:
# Run the query, then report how many rows matched and when (local clock) it ran.
recent = find_recent_news()
recent_count = recent.count()
print(f"Found {recent_count} articles from last 24 hours")

executed_at = datetime.now().strftime('%m-%d-%Y at %I:%M %p')
print(f"Query executed at device time: {executed_at}")

Found 96 articles from last 24 hours
Query executed at device time: 09-18-2025 at 01:42 PM


In [None]:
print("Top 5 recent articles:")

# Keep only the display columns and sort newest first.
newest_first = recent.select("title", "source", "pubDate").orderBy("pubDate", ascending=False)

# Show five rows without truncating long titles.
newest_first.show(5, truncate=False)


Top 5 recent articles:
+-------------------------------------------------------------------------------------------------------+------------------+-------------------+
|title                                                                                                  |source            |pubDate            |
+-------------------------------------------------------------------------------------------------------+------------------+-------------------+
|Insta360 Wins Emmy® Award for Innovative 360 Camera Technology and Seamless Software - PR Newswire     |PR Newswire       |2025-09-18 17:39:00|
|Policy Scholars program strengthens researcher's work with eye-tracking technology - Virginia Tech News|Virginia Tech News|2025-09-18 17:30:25|
|Nvidia to become one of Intel’s biggest shareholders with new investment - Al Jazeera                  |Al Jazeera        |2025-09-18 17:29:39|
|Trump, Starmer pledge closer US-UK ties on trade and technology - KSBY News                            |K

### References
### Stack Overflow Solutions
- Spark dataframe to get top 5 rows using sql or pandas dataframe: https://stackoverflow.com/questions/60068524/pyspark-dataframe-to-get-top-5-rows-using-sql-or-pandas-dataframe

### Class Materials
- Lecture_2_Introduction_to_Cloud_And_Spark on scalable data processing with PySpark DataFrames
- Lecture_3_SQL_and_SparkSQL for data manipulation operations
