In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import os


# Fetch the dev.to "latest" listing page, sending a randomly chosen browser user agent.
url = "https://dev.to/latest"
ua = UserAgent()
userAgent = ua.random
headers = {"user-agent": userAgent}

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of parsing an error page.
page = requests.get(url, headers = headers, timeout = 30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [2]:
# Every story card on the listing page is a div with class "crayons-story__body".
blog_box = soup.find_all("div", class_ = "crayons-story__body")

links = []
titles = []
time_uploaded = []
authors = []
tags = []
reading_times = []

# Placeholder stored when a card lacks a field. It must stay the literal string
# "None" because the DataFrame cell filters rows on that exact value.
MISSING = "None"


def _clean_text(node, newline_replacement = ""):
    """Return node's text with newlines replaced and both ends stripped.

    Returns MISSING when ``node`` is None (element not found in the card).
    """
    if node is None:
        return MISSING
    return node.text.replace("\n", newline_replacement).strip()


for box in blog_box:
    # Look the title element up once; it feeds both the link and the title.
    title_node = box.find("h2", class_ = "crayons-story__title")

    #links
    # Guard each step: a title without an <a>, or an <a> without an href,
    # is recorded as missing instead of raising TypeError/KeyError.
    anchor = title_node.a if title_node is not None else None
    href = anchor.get("href") if anchor is not None else None
    links.append(href.strip() if href is not None else MISSING)

    #titles
    titles.append(_clean_text(title_node))

    #time_uploaded
    time_node = box.find("time", attrs = {"datetime": True})
    time_uploaded.append(time_node["datetime"] if time_node is not None else MISSING)

    #authors
    authors.append(_clean_text(box.find("a", class_ = "crayons-story__secondary fw-medium m:hidden")))

    #tags
    # Tags sit on separate lines, so newlines become spaces to keep them apart.
    tags.append(_clean_text(box.find("div", class_ = "crayons-story__tags"), " "))

    #reading_times
    reading_times.append(_clean_text(box.find("div", class_ = "crayons-story__save")))

In [3]:
# Assemble the per-card lists into one table, then keep only the rows whose
# link was actually found (missing links were stored as the string "None").
blog_df = pd.DataFrame(
    dict(
        Link = links,
        Title = titles,
        Time_Uploaded = time_uploaded,
        Author = authors,
        Tag = tags,
        Reading_Time = reading_times,
    )
).loc[lambda frame: frame["Link"].ne("None")]

In [4]:
# Show the scraped article URLs as a plain Python list.
blog_df["Link"].tolist()


['https://dev.to/bhuvi_d/my-food-choices-scared-me-so-i-coded-a-solution-5ffi',
 'https://dev.to/codeqwertyuiop/the-heartbeat-of-modern-web-applications1750065547885000-6ge',
 'https://dev.to/rac/introduction-to-algorithms-what-youll-learn-in-an-algorithm-course-33b7',
 'https://dev.to/codeqwertyuiop/peak-performance-understated-power1750064940729100-41gd',
 'https://dev.to/member_b06955cb/the-poetry-and-horizon-of-code-framework1750064930038900-3a96',
 'https://dev.to/brains_behind_bots/getting-started-with-langchain-build-smarter-ai-apps-with-llms-53go',
 'https://dev.to/codeqwertyuiop/the-heartbeat-of-modern-web-applications1750064330497700-4fga',
 'https://dev.to/member_b06955cb/peak-performance-understated-power1750064322082300-h5f',
 'https://dev.to/kitsunem/archlinuxwori-chang-pctositeinsutorusitemita-2kid',
 'https://dev.to/codeqwertyuiop/peak-performance-understated-power1750064027229400-3lad',
 'https://dev.to/member_b06955cb/junior-year-self-study-notes-my-journey-with-the-f

In [5]:
# Accumulators filled by get_full_content(): article[i] holds the text scraped
# from article_link[i].
article = []
article_link = []

def get_full_content(url2, timeout = 30):
    """Download one article page and record its paragraph text.

    The text of every <p> inside the "crayons-article__main" container is joined
    with single spaces and appended to the module-level ``article`` list; the
    URL is appended to ``article_link`` at the same position.

    Parameters
    ----------
    url2 : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        Results are recorded in ``article`` / ``article_link``. When the request
        fails or the page has no article container, ``None`` is recorded as the
        content so that one bad page does not abort the whole scrape.
    """
    ua = UserAgent()
    headers = {"user-agent": ua.random}

    full_content = None
    try:
        page = requests.get(url2, headers = headers, timeout = timeout)
        page.raise_for_status()
    except requests.RequestException:
        # Network failure or HTTP error status: leave the content as missing.
        page = None

    if page is not None:
        soup2 = BeautifulSoup(page.content, "html.parser")
        content = soup2.find("div", class_ = "crayons-article__main")
        # find() returns None when the container is absent; calling
        # find_all on it would raise AttributeError.
        if content is not None:
            full_content = " ".join(
                paragraph.text.replace("\n", " ") for paragraph in content.find_all("p")
            )

    article.append(full_content)
    article_link.append(url2)
    
# Scrape every listed article; each call appends to `article` and `article_link`.
for story_url in blog_df["Link"]:
    get_full_content(story_url)

# Pair each scraped URL with the text collected for it.
article_df = pd.DataFrame(dict(Link = article_link, Article_content = article))

In [6]:
# Attach the article text to the listing metadata; the inner join keeps only
# links present in both tables.
merged_df = pd.merge(blog_df, article_df, on = "Link", how = "inner")

In [7]:
pip install langid pycountry nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
from nltk.corpus import stopwords
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the NLTK data packages: stop words, tokenizer models, WordNet and
# the VADER sentiment lexicon.
for resource in ("stopwords", "punkt", "wordnet", "vader_lexicon", "punkt_tab"):
    nltk.download(resource)


def count_words_without_stopwords(text):
    """Count the tokens in ``text`` that are not English stop words.

    Parameters
    ----------
    text : str, bytes or any other object
        Text to analyse. ``bytes`` are decoded as UTF-8 first.

    Returns
    -------
    int
        Number of tokens from ``nltk.word_tokenize`` whose lower-cased form is
        not in NLTK's English stop-word list; 0 when ``text`` is not textual
        (for example ``None`` or NaN).
    """
    if isinstance(text, bytes):
        # str(b"...") would produce the repr "b'...'", so decode instead.
        text = text.decode("utf-8", errors = "replace")
    if not isinstance(text, str):
        # Explicitly return 0; a bare `0` expression would fall through and
        # give None for missing content.
        return 0

    stop_words = set(stopwords.words('english'))
    return sum(1 for word in nltk.word_tokenize(text) if word.lower() not in stop_words)
        
# Store the stop-word-free token count of every article.
merged_df["word_count"] = merged_df["Article_content"].map(count_words_without_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
# Shared VADER analyzer, created once and reused for every article.
sent = SentimentIntensityAnalyzer()

def get_sentiment(record):
    """Score ``record`` with VADER and label it by its compound score.

    Returns a ``(compound_score, sentiment)`` tuple where ``sentiment`` is
    'Positive' for scores >= 0.05, 'Negative' for scores <= -0.05 and
    'Neutral' otherwise.
    """
    compound_score = sent.polarity_scores(record)['compound']

    if compound_score >= 0.05:
        return compound_score, 'Positive'
    if compound_score <= -0.05:
        return compound_score, 'Negative'
    return compound_score, 'Neutral'

# Score every article once and build both columns from the (score, label)
# tuples. A list of tuples avoids constructing a pandas Series per row and
# still produces the two columns when merged_df has no rows.
sentiment_results = [get_sentiment(text) for text in merged_df['Article_content'].astype(str)]
merged_df[['Compound_Score', 'Sentiment']] = pd.DataFrame(
    sentiment_results,
    columns = ['Compound_Score', 'Sentiment'],
    index = merged_df.index,  # align the new columns with the existing rows
)

In [10]:
import langid
import pycountry

def detect_language(text):
    """Return the language code that langid assigns to ``text``.

    Missing values (NaN/None) are classified as an empty string; the
    confidence value reported by langid is discarded.
    """
    # Convert NaN to an empty string so langid always receives a str
    cleaned = '' if not pd.notna(text) else str(text)

    # langid.classify returns a (language, confidence) pair
    return langid.classify(cleaned)[0]

def _language_name(code):
    """Return pycountry's language name for a two-letter code, or the code itself if unknown."""
    match = pycountry.languages.get(alpha_2 = code)
    return match.name if match else code

# Detect each article's language code, then translate the code to a readable name.
merged_df['Language'] = merged_df['Article_content'].apply(detect_language).map(_language_name)

In [11]:
# Keep only the articles detected as English and renumber the rows from 0.
filtered_df = merged_df.loc[merged_df['Language'].eq('English')].reset_index(drop = True)
filtered_df

Unnamed: 0,Link,Title,Time_Uploaded,Author,Tag,Reading_Time,Article_content,word_count,Compound_Score,Sentiment,Language
0,https://dev.to/bhuvi_d/my-food-choices-scared-...,My food choices scared me so I coded a solution,2025-06-16T09:19:58Z,Bhuvi D,#showdev #beginners #react #webdev,2 min read,You know how most of us have those incomplete ...,198,0.9931,Positive,English
1,https://dev.to/codeqwertyuiop/the-heartbeat-of...,The Heartbeat of Modern Web Applications（17500...,2025-06-16T09:19:08Z,codeqwertyuiop,#webdev #programming #rust #java,8 min read,As a third-year student deeply passionate abou...,1632,0.9999,Positive,English
2,https://dev.to/rac/introduction-to-algorithms-...,Introduction to Algorithms: What You’ll Learn ...,2025-06-16T09:17:48Z,Zack Rac,#webdev #programming #ai #beginners,2 min read,Algorithms are the backbone of computer scienc...,402,0.9927,Positive,English
3,https://dev.to/codeqwertyuiop/peak-performance...,Peak Performance Understated Power（17500649407...,2025-06-16T09:09:01Z,codeqwertyuiop,#webdev #programming #rust #java,5 min read,As a junior pursuing a degree in Computer Scie...,887,0.953,Positive,English
4,https://dev.to/member_b06955cb/the-poetry-and-...,The Poetry and Horizon of Code Framework（17500...,2025-06-16T09:08:50Z,member_b06955cb,#webdev #programming #rust #java,8 min read,"As a third-year computer science student, code...",1471,0.9999,Positive,English
5,https://dev.to/brains_behind_bots/getting-star...,Getting Started with LangChain: Build Smarter ...,2025-06-16T09:04:17Z,Chanchal Singh,#langchain #automation #rag #llm,3 min read,Category: LLMs / LangChain / GenAI Published: ...,337,0.9938,Positive,English
6,https://dev.to/codeqwertyuiop/the-heartbeat-of...,The Heartbeat of Modern Web Applications（17500...,2025-06-16T08:58:50Z,codeqwertyuiop,#webdev #programming #rust #java,8 min read,As a third-year student deeply passionate abou...,1632,0.9999,Positive,English
7,https://dev.to/member_b06955cb/peak-performanc...,Peak Performance Understated Power（17500643220...,2025-06-16T08:58:43Z,member_b06955cb,#webdev #programming #rust #java,5 min read,As a junior pursuing a degree in Computer Scie...,887,0.953,Positive,English
8,https://dev.to/codeqwertyuiop/peak-performance...,Peak Performance Understated Power（17500640272...,2025-06-16T08:53:48Z,codeqwertyuiop,#webdev #programming #rust #java,5 min read,As a junior pursuing a degree in Computer Scie...,887,0.953,Positive,English
9,https://dev.to/member_b06955cb/junior-year-sel...,Junior Year Self-Study Notes My Journey with t...,2025-06-16T08:53:40Z,member_b06955cb,#webdev #programming #rust #java,3 min read,Introducing Hyperlane: The Next-Gen Rust Web F...,313,0.9819,Positive,English
