### Scrape Reddit without pagination

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the subreddit
subreddit = "machinelearning"
url = f"https://old.reddit.com/r/{subreddit}/"

# Headers to mimic a browser
headers = {"User-Agent": "Mozilla/5.0"}

# Request the page. The timeout keeps a stalled connection from hanging the
# notebook; raise_for_status() fails loudly on 4xx/5xx instead of silently
# parsing an error page into zero posts.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Extract post titles and links
posts = []
for post in soup.find_all("div", class_="thing"):
    # Look the title anchor up once; find() returns None when there is no
    # match, so skip entries without a usable title link instead of crashing.
    title_link = post.find("a", class_="title")
    if title_link is None or not title_link.get("href"):
        continue
    title = title_link.text
    link = title_link["href"]
    posts.append({"title": title, "link": link})

# Print results
for p in posts[:5]:
    print(p)


{'title': '[D] Self-Promotion Thread', 'link': '/r/MachineLearning/comments/1j1hc0o/d_selfpromotion_thread/'}
{'title': "[D] Monthly Who's Hiring and Who wants to be Hired?", 'link': '/r/MachineLearning/comments/1ie5qoh/d_monthly_whos_hiring_and_who_wants_to_be_hired/'}
{'title': '[R] Transformers without Normalization (FAIR Meta, New York University, MIT, Princeton University)', 'link': '/r/MachineLearning/comments/1jbs7xg/r_transformers_without_normalization_fair_meta/'}
{'title': 'AirPods 4 with Active Noise Cancellation helps remove unwanted noise.', 'link': 'https://alb.reddit.com/cr?za=HK9Xb8uoAd9pkEmFi_WNR5_KzOzdMUiaLHibP_yl1qEcJWtJVhS3HMFDVOHtj_rEm2BXGX4YR938Ulxt9eLox9cmLvWR3jpeu_B34nWbeb1DzO1-6IIXAWfzc-ImRZpSRseHgGsL9l-7Xz20nlxOnQg3ueVooTdz8ZVUP_6G_anhrxb_7lqtg1UKkchhbyRvf7tGXHuT4IZtKVQorDKfEykKf7hhdbTAAYUEV1Knal_AANc4xg0ckO6jE-WKh1XbTSuAbZFr7AIY0Ukjm-Z8R30bZgE4QNpIRNqiG0Krhv1-l6s49mOvtOckeCD6EVw8anto1MvRJHE-zuqBi1ykPPwkuWc9Cg81UnK7JSn0rTS67jGMrkmmaSWPmcPJWq9vdqz0V1R9gqsfvp10u

### Scrape Reddit with pagination

In [2]:
import requests
from bs4 import BeautifulSoup

def scrape_reddit(subreddit, pages=2):
    """Collect post titles and links from a subreddit's old.reddit.com listing.

    Starts at ``https://old.reddit.com/r/{subreddit}/`` and follows the
    "next" button for at most ``pages`` listing pages.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix (the prefix is
            added when the URL is built).
        pages: Maximum number of listing pages to fetch.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts in listing order.
        Links are returned exactly as found in the ``href`` attribute, so
        they may be relative. If a request fails, a message is printed and
        the posts gathered so far are returned.
    """
    base_url = f"https://old.reddit.com/r/{subreddit}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    next_page = base_url
    all_posts = []

    for _ in range(pages):
        try:
            # The timeout keeps a stalled connection from hanging forever.
            response = requests.get(next_page, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Treat network errors like a bad status: report and stop.
            print(f"Failed to fetch data: {exc}")
            break
        if response.status_code != 200:
            print("Failed to fetch data")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract posts
        for post in soup.find_all("div", class_="thing"):
            # find() returns None when nothing matches; skip such entries
            # rather than raising AttributeError on `.text`.
            title_link = post.find("a", class_="title")
            if title_link is None or not title_link.get("href"):
                continue
            all_posts.append({"title": title_link.text, "link": title_link["href"]})

        # Find the next page button
        next_button = soup.find("span", class_="next-button")
        next_link = next_button.find("a") if next_button is not None else None
        if next_link is None or not next_link.get("href"):
            break  # No more pages
        next_page = next_link["href"]

    return all_posts

# Usage: scrape up to two listing pages of r/machinelearning.
subreddit = "machinelearning"
posts = scrape_reddit(subreddit, pages=2)

# Print first 10 posts (one dict per line)
for p in posts[:10]:
    print(p)

{'title': '[D] Self-Promotion Thread', 'link': '/r/MachineLearning/comments/1j1hc0o/d_selfpromotion_thread/'}
{'title': "[D] Monthly Who's Hiring and Who wants to be Hired?", 'link': '/r/MachineLearning/comments/1ie5qoh/d_monthly_whos_hiring_and_who_wants_to_be_hired/'}
{'title': '[P] New Python library for axis labeling algorithms', 'link': '/r/MachineLearning/comments/1jchg8d/p_new_python_library_for_axis_labeling_algorithms/'}
{'title': 'Meet iPhone 16e. Built for Apple Intelligence and powered by A18 — the latest-generation chip — it comes packed with a 48MP Fusion camera, supersized battery life and a durable design.', 'link': 'https://alb.reddit.com/cr?za=9sQ0BUkSwF6i_aWlpRv9V3zluEavx5F9SYTztsKlBvOO12C6lQHPeRCwXsDCehZgVU4kIdekEE-KFjtJga4dqU_UvOTRhrPMb8XgA9h4wmWObY1gus9F1NnUvxKoWvcHWHQC2o4obIypXqSugqarS_TRp3OEpM0r05EbkVHNJvSEcf7rEjDAeAiXBdDFG_kH0KhfA_zQA6R6ualxfpgPPfUMK7pHTK1iKmKz8zlpvKgHuddkiPDISQQ9hROHJZhWw7u1soe_E5yC1KdQ4cPbLyFhmjZ5c8UeUe7UEsWFf_7T5le-kh_8vCgCykIUARPuoNx0jaufp8

In [5]:
# Inspect the last post collected by the paginated scrape.
posts[-1]

{'title': '[D] Could an AI Model Truly Evolve Beyond Predefined Learning?',
 'link': '/r/MachineLearning/comments/1jaf1xm/d_could_an_ai_model_truly_evolve_beyond/'}

### Scrape with pagination and post data [title, link, votes, comments, author, date]

In [43]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_reddit(subreddit, pages=1):
    """Scrape post metadata from a subreddit's old.reddit.com listing pages.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix; it is inserted
            into ``https://old.reddit.com/r/{subreddit}/``.
        pages: Maximum number of listing pages to fetch, following the
            "next" button from one page to the next.

    Returns:
        A list of dicts with the keys "title", "link", "votes", "comments",
        "author" and "date". Values are the raw strings found in the HTML;
        missing fields fall back to "N/A" (votes), "0 comments" (comments)
        and "Unknown" (author, date). Links are made absolute against the
        subreddit URL. If a request fails, a message is printed and the
        posts collected so far are returned.
    """
    base_url = f"https://old.reddit.com/r/{subreddit}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    next_page = base_url
    all_posts = []

    for _ in range(pages):
        try:
            # The timeout keeps a stalled connection from hanging forever.
            response = requests.get(next_page, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Treat network errors like a bad status: report and stop.
            print(f"Failed to fetch data: {exc}")
            break
        if response.status_code != 200:
            print("Failed to fetch data")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract posts
        for post in soup.find_all("div", class_="thing"):
            # find() returns None when nothing matches; skip entries without
            # a usable title link rather than raising on `.text`.
            title_link = post.find("a", class_="title")
            if title_link is None or not title_link.get("href"):
                continue

            votes = post.find("div", class_="score unvoted")
            # Prefer an anchor carrying the "comments" class (assumed to be
            # how old.reddit.com marks the comments link - TODO confirm).
            # The text match alone returns the first anchor containing
            # "comment", which can be the title or author link. It is kept
            # as the fallback so behaviour is unchanged when no such class
            # exists.
            comments = post.find("a", class_="comments") or post.find(
                "a", string=lambda text: text and "comment" in text.lower()
            )
            author = post.find("a", class_="author")
            date = post.find("time")

            all_posts.append({
                "title": title_link.text,
                "link": urljoin(base_url, title_link["href"]),  # Ensures absolute URL
                "votes": votes.text if votes is not None else "N/A",
                "comments": comments.text if comments is not None else "0 comments",
                "author": author.text if author is not None else "Unknown",
                "date": date.get("datetime", "Unknown") if date is not None else "Unknown",
            })

        # Find the next page button
        next_button = soup.find("span", class_="next-button")
        next_link = next_button.find("a") if next_button is not None else None
        if next_link is None or not next_link.get("href"):
            break  # No more pages
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        next_page = urljoin(base_url, next_link["href"])

    return all_posts

In [44]:
# NOTE(review): presumably the target number of posts to collect; no visible
# cell reads it - confirm before relying on it.
POST_LIMIT = 500

# Subreddits of interest, each stored with its "r/" prefix.
# scrape_reddit() adds "/r/" itself when it builds the URL, so strip the
# prefix (e.g. entry[2:]) before passing an entry to it.
subreddit_list = [
    f"r/{name}"
    for name in (
        "india", "IndianGaming", "IndianFood", "desis", "IndiaSpeaks",
        "bollywood", "IndianMusic", "IndianFashionAddicts",
        "IndianPeopleFacebook", "AskIndia", "IndianDiaspora", "Sikh",
        "hindustan", "TwoXIndia", "Chennai", "Bangalore", "Mumbai",
        "Kolkata", "delhi", "indiauncensored", "IndiaInvestments",
        "IndianArt", "IndianProgramming", "SouthAsianFood",
        "IndianFootball", "IndianMusicExchange", "indiadiscussion",
        "IndiaSocial", "IndianMemes", "IndianHistory", "IndianPolitics",
        "Cricket", "IndianStockMarket", "HindutvaWatch", "NorthEastIndia",
    )
]

In [50]:
# Scrape up to three listing pages of r/india.
posts = scrape_reddit("india", pages=3)

In [51]:
# Number of posts the scrape returned.
len(posts)

84

In [35]:
# Collect at least MIN_POSTS posts from r/india.
#
# scrape_reddit(..., pages=n) always starts from the first listing page, so
# every call already contains what the previous call returned. The result
# therefore replaces the accumulated list; extending it would store the
# earlier pages again on each iteration.
MIN_POSTS = 10
total_post = []

page_number = 1
while len(total_post) < MIN_POSTS:
    posts = scrape_reddit("india", pages=page_number)
    if len(posts) <= len(total_post):
        # Nothing new came back (failed fetch or listing exhausted); stop
        # instead of looping forever.
        break
    total_post = list(posts)
    page_number += 1

In [42]:
# Look at the tenth post from the end of the accumulated list.
total_post[-10]

{'title': 'Surrender India Passport in India : Pune Regional passport office',
 'link': 'https://old.reddit.com/r/india/comments/1jdvlm6/surrender_india_passport_in_india_pune_regional/',
 'votes': '221',
 'comments': '27 comments',
 'author': 'jayrohi18',
 'date': '2025-03-18T03:07:56+00:00'}

### SQLite program

In [None]:
import sqlite3
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from hashlib import sha256
from datetime import datetime

# SQLite database file opened by the sqlite3.connect(DB_NAME) calls in this notebook.
DB_NAME = "reddit_posts.db"

In [None]:
# 📌 Create SQLite Database and Table
def create_database():
    """Create the ``posts`` table in the SQLite file ``DB_NAME`` if missing.

    The table has TEXT columns ``title``, ``link``, ``author`` and ``date``,
    INTEGER columns ``votes``, ``comments`` and ``age``, and a ``hash``
    column declared ``TEXT UNIQUE``. The statement uses
    ``CREATE TABLE IF NOT EXISTS``, so calling this again leaves an
    existing table untouched.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS posts (
                title TEXT,
                link TEXT,
                votes INTEGER,
                comments INTEGER,
                author TEXT,
                date TEXT,
                age INTEGER,
                hash TEXT UNIQUE
            )
        """)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()

In [None]:
# 📌 Insert a Post into the Database (Ensuring Uniqueness)
def insert_post(title, link, votes, comments, author, date):
    """Insert one scraped post into the ``posts`` table, skipping duplicates.

    Args:
        title: Post title. Its SHA-256 hex digest is stored in the UNIQUE
            ``hash`` column, so a later post with the same title is skipped.
        link: Post URL, stored as given.
        votes: Score text as scraped (e.g. ``"221"``); text that is not a
            plain number is stored as 0.
        comments: Comment-link text (e.g. ``"27 comments"``); the leading
            number is stored, or 0 when there is none.
        author: Author name, stored as given.
        date: ISO-8601 timestamp such as ``"2025-03-18T03:07:56+00:00"``,
            or ``"Unknown"``. Stored as given; it is also used to compute
            the ``age`` column.

    The ``age`` column receives the whole number of days between ``date``
    and now, or 0 when ``date`` is ``"Unknown"`` or cannot be parsed.
    """
    # Convert votes and comments to integers
    votes_text = str(votes).strip()
    votes = int(votes_text) if votes_text.isdecimal() else 0
    # Guard against empty text: "".split() is [] and has no first token.
    comment_tokens = str(comments).split()
    comments = int(comment_tokens[0]) if comment_tokens and comment_tokens[0].isdecimal() else 0

    # Calculate Age (days since post)
    age_days = 0
    if date != "Unknown":
        try:
            # fromisoformat accepts the "+00:00" offset in the scraped
            # timestamps; a strptime pattern without %z rejects it.
            post_date = datetime.fromisoformat(date)
        except (TypeError, ValueError):
            post_date = None
        if post_date is not None:
            # Use the timestamp's own tzinfo so aware and naive values are
            # never mixed in the subtraction.
            now = datetime.now(post_date.tzinfo)
            age_days = max((now - post_date).days, 0)

    # Generate Unique Hash (SHA256 of Title)
    post_hash = sha256(title.encode()).hexdigest()

    conn = sqlite3.connect(DB_NAME)
    try:
        # The posts table defines an `age` column and no `scrape_time`
        # column, so the insert targets `age`.
        conn.execute(
            """
            INSERT INTO posts (title, link, votes, comments, author, date, age, hash)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (title, link, votes, comments, author, date, age_days, post_hash),
        )
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"Skipping duplicate post: {title}")
    finally:
        # Close the connection on every path, including unexpected errors.
        conn.close()

In [None]:
# 📌 Search Posts by Title Keyword
def search_posts(keyword):
    """Return every row of ``posts`` whose title contains ``keyword``.

    The keyword is wrapped in ``%`` wildcards and bound as a parameter to
    ``title LIKE ?``, so it is never spliced into the SQL text. Any ``%``
    or ``_`` inside the keyword is still treated by LIKE as a wildcard.

    Args:
        keyword: Substring to look for in post titles.

    Returns:
        A list of row tuples as returned by ``cursor.fetchall()``.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.execute(
            "SELECT * FROM posts WHERE title LIKE ?", ('%' + keyword + '%',)
        )
        return cursor.fetchall()
    finally:
        # Close the connection even when the query fails.
        conn.close()

In [None]:
# 📌 Scrape and Store Posts in Database
def scrape_reddit(subreddit, pages=1):
    """Scrape a subreddit's old.reddit.com listing and store each post.

    Every post found is handed to ``insert_post`` as raw strings; nothing is
    returned.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix; it is inserted
            into ``https://old.reddit.com/r/{subreddit}/``.
        pages: Maximum number of listing pages to fetch, following the
            "next" button from one page to the next.

    Missing fields fall back to "0" (votes), "0 comments" (comments) and
    "Unknown" (author, date). If a request fails, a message is printed and
    scraping stops; posts already inserted stay in the database.
    """
    base_url = f"https://old.reddit.com/r/{subreddit}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    next_page = base_url

    for _ in range(pages):
        try:
            # The timeout keeps a stalled connection from hanging forever.
            response = requests.get(next_page, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Treat network errors like a bad status: report and stop.
            print(f"Failed to fetch data: {exc}")
            break
        if response.status_code != 200:
            print("Failed to fetch data")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract posts
        for post in soup.find_all("div", class_="thing"):
            # find() returns None when nothing matches; skip entries without
            # a usable title link rather than raising on `.text`.
            title_link = post.find("a", class_="title")
            if title_link is None or not title_link.get("href"):
                continue
            title = title_link.text
            link = urljoin(base_url, title_link["href"])

            votes = post.find("div", class_="score unvoted")
            votes = votes.text if votes is not None else "0"
            # Prefer an anchor carrying the "comments" class (assumed to be
            # how old.reddit.com marks the comments link - TODO confirm).
            # The text match alone returns the first anchor containing
            # "comment", which can be the title or author link. It is kept
            # as the fallback so behaviour is unchanged when no such class
            # exists.
            comments = post.find("a", class_="comments") or post.find(
                "a", string=lambda text: text and "comment" in text.lower()
            )
            comments = comments.text if comments is not None else "0 comments"
            author = post.find("a", class_="author")
            author = author.text if author is not None else "Unknown"
            time_tag = post.find("time")
            date = time_tag.get("datetime", "Unknown") if time_tag is not None else "Unknown"

            # Insert into database
            insert_post(title, link, votes, comments, author, date)

        # Find the next page button
        next_button = soup.find("span", class_="next-button")
        next_link = next_button.find("a") if next_button is not None else None
        if next_link is None or not next_link.get("href"):
            break  # No more pages
        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        next_page = urljoin(base_url, next_link["href"])