In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import json
import pandas as pd

def get_dev_articles(tag, limit=1000):
    """Collect article titles and links from the dev.to search page for *tag*.

    A headless Firefox session opens ``https://dev.to/search?q=<tag>``, waits
    up to 10 seconds for at least one result title link to appear, and reads
    the title text and ``href`` of every title link present at that moment.

    Only the results that are in the DOM after that first wait are collected:
    this function does not scroll or paginate, so the number of returned
    entries can be far smaller than *limit*.

    Args:
        tag: Search term. It is converted to ``str`` and percent-encoded
            before being placed in the query string.
        limit: Maximum number of result links to keep (applied as a slice).

    Returns:
        A list of ``{'query': <title>, 'url': <link>}`` dicts. Any error
        raised while loading or reading the page is printed and whatever
        was collected up to that point (possibly an empty list) is returned.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    # Percent-encode the term: without this, characters such as '&', '#',
    # '+' or '%' would terminate or corrupt the ``q`` parameter.
    search_url = f"https://dev.to/search?q={quote(str(tag), safe='')}"
    title_link_locator = (By.CSS_SELECTOR, '.crayons-story__title a')

    articles = []
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    try:
        driver.get(search_url)
        # Results are rendered client-side; block until at least one exists.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(title_link_locator)
        )

        story_cards = driver.find_elements(*title_link_locator)[:limit]

        for card in story_cards:
            title = card.text.strip()
            link = card.get_attribute('href')
            articles.append({'query': title, 'url': link})

    except Exception as e:
        # Best-effort scraper: report the problem and return what we have.
        print(f"An unexpected error occurred: {e}")

    finally:
        # Always release the browser process, even after a failure.
        driver.quit()

    return articles

def get_article_content(url):
    """Return the paragraph text of the article at *url*.

    A fresh headless Firefox session loads the page, waits up to 10 seconds
    for the ``.crayons-article__body`` element, then joins the text of every
    ``<p>`` inside it with newlines.

    Args:
        url: Address of the article page to load.

    Returns:
        The newline-joined paragraph text, or an empty string when loading
        or reading the page raised an error (the error is printed).
    """
    firefox_options = Options()
    firefox_options.add_argument('--headless')
    browser = webdriver.Firefox(options=firefox_options)

    body_locator = (By.CSS_SELECTOR, '.crayons-article__body')
    paragraph_locator = (By.CSS_SELECTOR, '.crayons-article__body p')

    try:
        browser.get(url)
        # Wait for the article body before querying its paragraphs.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(body_locator)
        )
        return "\n".join(
            node.text for node in browser.find_elements(*paragraph_locator)
        )
    except Exception as e:
        # Best-effort: report the failure and fall back to empty content.
        print(f"An unexpected error occurred: {e}")
        return ""
    finally:
        # The browser is closed on every path, success or failure.
        browser.quit()
# Driver script: search dev.to for `tag`, fetch each hit's body text, echo
# progress to stdout, and save the (title, text) pairs as CSV and JSON.
tag = 'web scraping'
articles = get_dev_articles(tag)

divider = '-' * 80
output_data = []
for article in articles:
    title, link = article['query'], article['url']
    print(f"Query: {title}\nURL: {link}")
    content = get_article_content(link)
    print(f"Answer: {content}\n{divider}")
    # Only the title and the body text are persisted; the URL is not saved.
    output_data.append({'query': title, 'answer': content})

# Tabular copy of the results.
df = pd.DataFrame(output_data)
df.to_csv('dev_articles.csv', index=False)

# JSON copy of the same records, pretty-printed with a 4-space indent.
with open('dev_articles.json', 'w') as f:
    f.write(json.dumps(output_data, indent=4))


Query: Web scraping with Node.js and Typescript - the scraper part (1/3)
URL: https://dev.to/uiii/web-scraping-with-nodejs-and-typescript-the-scraper-part-ffn
Answer: Internet is full of information these days. Almost every website display them to the user in a human readable form. But what if you want to process these data programmatically, do some analysis, present them in a different form or store them in a database to make queries on them later? E.g. collect all the product names with a description, image and a price from your favorite online store. Well, you can open the page by page and copy&paste the data you need, but you won't 🤦‍♂️. What you definitely can and should is to check if the page has an API which will provide you the data easily. If not, I'm sorry bro there is no way to ... just kidding! 😝
... the web scraping comes into play. Yay!
👉 In this article series (3 parts) I will guide you through the whole process of building a web scraper in Node.js and Typescript.
In th