In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os


# Fetch the dev.to "latest" listing with a randomised User-Agent header.
url = "https://dev.to/latest"
ua = UserAgent()
userAgent = ua.random
headers = {"user-agent": userAgent}
# Without a timeout, requests can block forever on an unresponsive server.
page = requests.get(url, headers = headers, timeout = 30)
# Fail fast on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()

# Parsed listing page; the scraping cell below reads `soup`.
soup = BeautifulSoup(page.content, "html.parser")

In [2]:
# Every story card on the listing page.
blog_box = soup.find_all("div", class_ = "crayons-story__body")

# One entry per card in each list; the string "None" marks a missing field
# (cards without a link are filtered out on that sentinel later).
links = []
titles = []
time_uploaded = []
authors = []
tags = []
reading_times = []


def _clean_text(node, newline_replacement = ""):
    """Return the node's text with newlines replaced and edges stripped.

    Returns the sentinel string "None" when the node was not found.
    """
    if node is None:
        return "None"
    return node.text.replace("\n", newline_replacement).strip()


for box in blog_box:
    # Look every element up once; find() returns None when nothing matches.
    title_node = box.find("h2", class_ = "crayons-story__title")
    time_node = box.find("time", attrs = {"datetime": True})
    author_node = box.find("a", class_ = "crayons-story__secondary fw-medium m:hidden")
    tags_node = box.find("div", class_ = "crayons-story__tags")
    save_node = box.find("div", class_ = "crayons-story__save")

    #links
    # The heading may exist without an <a>, and the <a> may lack an href;
    # both cases fall back to the sentinel instead of raising.
    anchor = title_node.a if title_node is not None else None
    href = anchor.get("href") if anchor is not None else None
    links.append(href.strip() if href else "None")

    #titles
    titles.append(_clean_text(title_node))

    #time_uploaded
    time_uploaded.append(time_node["datetime"] if time_node is not None else "None")

    #authors
    authors.append(_clean_text(author_node))

    #tags (newlines become spaces so adjacent tags stay separated)
    tags.append(_clean_text(tags_node, " "))

    #reading_times
    reading_times.append(_clean_text(save_node))

In [3]:
# Assemble the scraped columns into one frame, one row per story card.
scraped_columns = {
    "Link": links,
    "Title": titles,
    "Time_Uploaded": time_uploaded,
    "Author": authors,
    "Tag": tags,
    "Reading_Time": reading_times,
}
blog_df = pd.DataFrame(scraped_columns)

# Cards for which no link was found carry the "None" sentinel; drop them.
has_link = blog_df["Link"].ne("None")
blog_df = blog_df[has_link]

In [4]:
# Show the scraped article URLs as a plain Python list.
blog_df["Link"].tolist()


['https://dev.to/bekahhw/taking-time-to-breathe-a-new-chapter-begins-42nn',
 'https://dev.to/devops_fundamental/aws-fundamentals-aws-marketplace-kim',
 'https://dev.to/mukilaperiyasamy/today-i-learned-objectthis-keyword-and-hoisting-in-reactjs-6dl',
 'https://dev.to/devops_fundamental/azure-fundamentals-microsoftapp-51dg',
 'https://dev.to/devops_fundamental/gcp-fundamentals-ad-exchange-buyer-api-ii-5fk9',
 'https://dev.to/nextblockcms/we-just-launched-our-dev-docs-for-nextblock-cms-nextjs-supabase-3h10',
 'https://dev.to/lovestaco/locking-it-down-with-redis-acls-a-devs-guide-to-secure-access-1935',
 'https://dev.to/vaib/unlocking-the-power-of-public-cloud-essential-deployment-resources-1big',
 'https://dev.to/ivanrochacardoso/dashboard-em-tempo-real-com-vuejs-quasar-mqtt-usando-hivemq-2d47',
 'https://dev.to/saber9-8/your-guide-to-basic-linux-commands-day05-3ake',
 'https://dev.to/jpraiseofficial/translating-my-javascript-project-to-typescript-lessons-surprises-and-a-few-gotchas-apc',

In [5]:
# Accumulators filled by get_full_content(); kept in step with each other
# (article[i] is the text fetched from article_link[i]).
article = []
article_link = []

def get_full_content(url2):
    """Download one article page and record the text of its paragraphs.

    The text of every <p> inside the "crayons-article__main" container is
    joined with single spaces and appended to the module-level `article`
    list; `url2` is appended to `article_link`.

    Parameters
    ----------
    url2 : str
        URL of the article page to fetch.

    Returns
    -------
    None
        Results are stored in `article` / `article_link`. When the page has
        no article container, nothing is appended and a notice is printed.
    """
    ua = UserAgent()
    headers = {"user-agent": ua.random}
    # Without a timeout a single stalled request would hang the whole scrape.
    page = requests.get(url2, headers = headers, timeout = 30)

    soup2 = BeautifulSoup(page.content, "html.parser")

    content = soup2.find("div", class_ = "crayons-article__main")
    if content is None:
        # find() returns None when the container is absent (error page,
        # different layout); skip the URL rather than crash on find_all.
        print(f"Skipping {url2}: article body not found (HTTP {page.status_code})")
        return

    contents = [paragraph.text.replace("\n", " ") for paragraph in content.find_all("p")]

    article.append(" ".join(contents))
    article_link.append(url2)
    
# Fetch every scraped link; get_full_content() stores its results in the
# `article` and `article_link` lists.
for article_url in blog_df["Link"]:
    get_full_content(article_url)

# Pair each fetched URL with its text so it can be joined back on "Link".
article_columns = {"Link": article_link, "Article_Content": article}
article_df = pd.DataFrame(article_columns)

In [6]:
# Attach the article text to the listing metadata; the inner join keeps only
# links present in both frames.
merged_df = pd.merge(blog_df, article_df, how = "inner", on = "Link")

In [7]:
pip install langid pycountry nltk

Note: you may need to restart the kernel to use updated packages.


In [10]:
from nltk.corpus import stopwords
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK resources used by this notebook (same order as before).
for resource_name in ("stopwords", "punkt", "wordnet", "vader_lexicon", "punkt_tab"):
    nltk.download(resource_name)


def count_words_without_stopwords(text):
    """Count the tokens of `text` that are not English stop words.

    Parameters
    ----------
    text : str or bytes or any
        Text to analyse. Bytes are decoded as UTF-8 (undecodable bytes are
        replaced). Any other type, e.g. NaN or None, counts as empty.

    Returns
    -------
    int
        Number of tokens from nltk.word_tokenize whose lower-cased form is
        not in NLTK's English stop-word list; 0 for non-text input.
    """
    if isinstance(text, bytes):
        # str(b"...") would tokenize the "b'...'" repr, not the content.
        text = text.decode("utf-8", errors="replace")
    if not isinstance(text, str):
        # Previously this branch evaluated a bare `0` and returned None.
        return 0

    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    return sum(1 for word in words if word.lower() not in stop_words)
        
# Stop-word-free token count for every article body.
word_counts = merged_df["Article_Content"].apply(count_words_without_stopwords)
merged_df['Word_Count'] = word_counts

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Single VADER analyzer instance; get_sentiment() below reads it as a global.
sent = SentimentIntensityAnalyzer()

def get_sentiment(record, analyzer=None, threshold=0.05):
    """Score a text and bucket it as Positive / Negative / Neutral.

    Parameters
    ----------
    record : str
        Text passed to the analyzer's ``polarity_scores``.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a 'compound' entry. Defaults to the module-level ``sent``.
    threshold : float, optional
        Cutoff applied symmetrically: compound >= threshold is 'Positive',
        compound <= -threshold is 'Negative', anything between is 'Neutral'.
        Defaults to 0.05, the value previously hard-coded here.

    Returns
    -------
    tuple
        ``(compound_score, sentiment_label)``.
    """
    if analyzer is None:
        # Resolved at call time so the global can be (re)created after this def.
        analyzer = sent

    compound_score = analyzer.polarity_scores(record)['compound']

    if compound_score >= threshold:
        sentiment = 'Positive'
    elif compound_score <= -threshold:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    return compound_score, sentiment

# Score every article body; wrapping each (score, label) pair in a Series
# makes apply() expand the result into two columns.
article_text = merged_df['Article_Content'].astype(str)
sentiment_frame = article_text.apply(lambda body: pd.Series(get_sentiment(body)))
merged_df[['Compound_Score' ,'Sentiment']] = sentiment_frame

In [14]:
import langid
import pycountry

def detect_language(text):
    """Return the language that langid assigns to `text`.

    Missing values (as judged by pd.notna) are classified as an empty
    string; everything else is converted with str() first. Only the first
    element of the pair returned by langid.classify is kept.
    """
    if pd.notna(text):
        cleaned = str(text)
    else:
        cleaned = ''

    # classify() yields a two-element result; the second element is unused.
    return langid.classify(cleaned)[0]

def _language_name(code):
    """Return pycountry's name for an alpha_2 code, or the code itself if pycountry has no match."""
    match = pycountry.languages.get(alpha_2 = code)
    return match.name if match else code

# First store the detected code, then replace it with a readable name.
merged_df['Language'] = merged_df['Article_Content'].apply(detect_language)
merged_df['Language'] = merged_df['Language'].map(_language_name)

In [15]:
# Keep only the articles detected as English.
english_df = merged_df[merged_df['Language'] == 'English'].reset_index(drop = True)

# Pull the first integer out of strings such as "4 min read". Values with no
# digits (e.g. the scraper's "None" sentinel) become NaN instead of making
# astype(int) raise.
reading_minutes = pd.to_numeric(
    english_df['Reading_Time'].astype(str).str.extract(r'(\d+)', expand = False),
    errors = 'coerce',
)

# Rows without a parseable reading time are dropped so the column stays a
# plain integer column for the database insert.
filtered_df = (
    english_df.loc[reading_minutes.notna()]
    .assign(Reading_Time = reading_minutes.dropna().astype(int))
    .reset_index(drop = True)
)
filtered_df.head()

Unnamed: 0,Link,Title,Time_Uploaded,Author,Tag,Reading_Time,Article_Content,Word_Count,Compound_Score,Sentiment,Language
0,https://dev.to/bekahhw/taking-time-to-breathe-...,Taking Time to Breathe: A New Chapter Begins,2025-06-19T17:25:07Z,BekahHW,#community #career,2,"As of June 1st, I'm no longer with the Linux F...",165,0.9807,Positive,English
1,https://dev.to/devops_fundamental/aws-fundamen...,AWS Fundamentals: Aws Marketplace,2025-06-19T17:18:14Z,DevOps Fundamental,#aws #cloudcomputing #devops #awsmarketplace,6,"In today's fast-paced, digital world, cloud se...",311,0.9942,Positive,English
2,https://dev.to/mukilaperiyasamy/today-i-learne...,"Today I learned-Object,this keyword and hoisti...",2025-06-19T17:16:33Z,P Mukila,#objectreact #hoistingreact #thiskeyword,2,1.What is an Object in React.js? In JavaScript...,88,0.0,Neutral,English
3,https://dev.to/devops_fundamental/azure-fundam...,Azure Fundamentals: Microsoft.App,2025-06-19T17:14:23Z,DevOps Fundamental,#azure #microsoft #devops #microsoftapp,4,Picture this: You're a startup CTO racing to d...,275,0.9427,Positive,English
4,https://dev.to/devops_fundamental/gcp-fundamen...,GCP Fundamentals: Ad Exchange Buyer API II,2025-06-19T17:09:37Z,DevOps Fundamental,#gcp #googlecloud #devops #adexchangebuyerapiii,3,"Digital advertising is a fast-paced, data-driv...",282,0.9274,Positive,English


In [19]:
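# Target table for the insert cell below (run once in PostgreSQL).
# Link must carry a PRIMARY KEY or UNIQUE constraint, because the insert
# statement uses ON CONFLICT (Link).
# CREATE TABLE IF NOT EXISTS articles(
# Link TEXT PRIMARY KEY,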
# Title TEXT,
# Time_Uploaded TIMESTAMP,
# Author TEXT,
# Tag TEXT,
# Reading_Time INTEGER,
# Article_Content TEXT,
# Word_Count INTEGER,
# Compound_Score NUMERIC,
# Sentiment TEXT,
# Language TEXT
# );

!pip install psycopg2

Collecting psycopg2
  Downloading psycopg2-2.9.10-cp310-cp310-win_amd64.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 136.7 kB/s eta 0:00:00
Installing collected packages: psycopg2
Successfully installed psycopg2-2.9.10


In [22]:
import psycopg2

# Connection settings are read from the standard libpq environment variables
# so that no secret lives in the notebook. Set PGPASSWORD before running.
# The password that used to be hard-coded here should be rotated.
# The non-secret defaults are the values this notebook used before.
db_params = {
    "dbname": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres.mkykuvtjidbtsasqrnfn"),
    "password": os.environ.get("PGPASSWORD", ""),
    "host": os.environ.get("PGHOST", "aws-0-eu-west-1.pooler.supabase.com"),
    "port": os.environ.get("PGPORT", "5432")
}

# Bind both handles up front so the finally clause can tell whether they were
# ever opened (connect() itself may raise).
conn = None
cursor = None

try:
    # Connect to PostgreSQL
    conn = psycopg2.connect(**db_params)
    cursor = conn.cursor()
    
    # SQL Insert Query
    insert_query = """
    INSERT INTO articles (Link, Title, Time_Uploaded, Author, Tag, Reading_Time, Article_Content, Word_Count, Compound_Score, Sentiment, Language)
    VALUES (%s, %s, %s, %s, %s, %s, %s,%s, %s, %s, %s)
    ON CONFLICT (Link) DO NOTHING;  -- Avoids duplicate primary key errors
    """
    
    # Insert DataFrame records one by one; values are passed as query
    # parameters, never formatted into the SQL string.
    for _, row in filtered_df.iterrows():
        cursor.execute(insert_query, (
            row['Link'], row['Title'], row['Time_Uploaded'],  row['Author'], row['Tag'], row['Reading_Time'],
            row['Article_Content'],row['Word_Count'],row['Compound_Score'],row['Sentiment'],row['Language']
        ))

    # All rows go in as a single transaction.
    conn.commit()
    print("Data inserted successfully!")

except Exception as e:
    # Nothing has been committed at this point; closing the connection below
    # discards the open transaction.
    print(e)

finally:
    # Release whatever was actually opened. The original `conn.close` lacked
    # the call parentheses, so the connection was never closed.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

Data inserted successfully!
