### Scrape Reddit without pagination

In [3]:
import requests
from bs4 import BeautifulSoup

# URL of the subreddit (old.reddit.com listing page)
subreddit = "machinelearning"
url = f"https://old.reddit.com/r/{subreddit}/"

# Headers to mimic a browser
headers = {"User-Agent": "Mozilla/5.0"}

# Request the page. The timeout keeps the cell from hanging forever on a
# stalled connection (requests waits indefinitely by default).
response = requests.get(url, headers=headers, timeout=10)

# Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
response.raise_for_status()

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Extract post titles and links. Every listing entry is a <div class="thing">;
# entries without a usable title anchor are skipped rather than crashing the cell.
# NOTE(review): promoted/sponsored entries also match this selector and are kept.
posts = []
for post in soup.find_all("div", class_="thing"):
    title_link = post.find("a", class_="title")
    href = title_link.get("href") if title_link is not None else None
    if not href:
        continue
    posts.append({"title": title_link.text, "link": href})

# Print results (first five entries only, to keep the output short)
for p in posts[:5]:
    print(p)


{'title': '[D] Self-Promotion Thread', 'link': '/r/MachineLearning/comments/1j1hc0o/d_selfpromotion_thread/'}
{'title': "[D] Monthly Who's Hiring and Who wants to be Hired?", 'link': '/r/MachineLearning/comments/1ie5qoh/d_monthly_whos_hiring_and_who_wants_to_be_hired/'}
{'title': '[R] SEA-VL: A Large-Scale Culturally-Relevant Vision-Language Dataset for Southeast Asian Languages', 'link': '/r/MachineLearning/comments/1ja9stg/r_seavl_a_largescale_culturallyrelevant/'}
{'title': 'AirPods 4 with Active Noise Cancellation helps remove unwanted noise.', 'link': 'https://alb.reddit.com/cr?za=YiALcnXTDSEB2A1tofiXUDcK9gYTbE-0_vtQt1IKX8Kspoo_5h6r5taJ8cqYzvf11Z9MoPgHR_8Odt8e6idsOw19Y_bW6CRf_uccnagIBlu4QiaBYu_XmPUqpiYiRfME5lPYFPupWGTGAucPLWuFi58fwp1YsKQNVPWoI9Hu3m2TEKk8kgqKd8YVXkWW5_NcKo4jZeuUfuDzmDEzKzbVgzEnj8Ew7zpY6sTwWITZs1BhALUckF5oBKjySHECPNisuZZ0tTuf-a149_OGETFNBv4QXFqA2UCGNl-LzJTgTv7Hg8FEiEMRqA1DxTK60fmVXl83TuX4_oSUsSdarCJhfYTX_LPZnbjhNWC13GVohXjKMYjUBCwRtx_4rfWmUCH-bCUP0GqsXFMEzdHSuwT90sf

### Scrape Reddit with pagination

In [5]:
import requests
from bs4 import BeautifulSoup

def scrape_reddit(subreddit, pages=2, timeout=10):
    """Collect post titles and links from a subreddit's old.reddit.com listing.

    Args:
        subreddit: Subreddit name. Surrounding whitespace/slashes and a leading
            "r/" are ignored, so "india", "r/india" and "/r/india/" all target
            the same listing.
        pages: Maximum number of listing pages to fetch; pagination follows the
            "next" button of each page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts in page order. ``link``
        is the raw ``href`` of the title anchor and may be site-relative.
        Fetching stops early (returning what was collected so far) on a
        non-200 response or when no further page is linked.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Normalise the name so "r/india" does not become ".../r/r/india/".
    name = str(subreddit).strip().strip("/")
    if name[:2].lower() == "r/":
        name = name[2:]

    base_url = f"https://old.reddit.com/r/{name}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    next_page = base_url
    all_posts = []

    for _ in range(pages):
        # Without a timeout a stalled connection would block forever.
        response = requests.get(next_page, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print("Failed to fetch data")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract posts; skip listing entries that lack a usable title anchor
        # instead of raising AttributeError/KeyError on them.
        for post in soup.find_all("div", class_="thing"):
            title_link = post.find("a", class_="title")
            href = title_link.get("href") if title_link is not None else None
            if not href:
                continue
            all_posts.append({"title": title_link.text, "link": href})

        # Follow the next-page button; stop when it (or its link) is missing.
        next_button = soup.find("span", class_="next-button")
        next_link = next_button.find("a") if next_button is not None else None
        next_href = next_link.get("href") if next_link is not None else None
        if not next_href:
            break  # No more pages
        next_page = next_href

    return all_posts

# Example run: crawl up to two listing pages of r/machinelearning.
subreddit = "machinelearning"
posts = scrape_reddit(subreddit, pages=2)

# Preview at most ten of the scraped entries, one dict per line.
for p in posts[0:10]:
    print(p)

{'title': '[D] Self-Promotion Thread', 'link': '/r/MachineLearning/comments/1j1hc0o/d_selfpromotion_thread/'}
{'title': "[D] Monthly Who's Hiring and Who wants to be Hired?", 'link': '/r/MachineLearning/comments/1ie5qoh/d_monthly_whos_hiring_and_who_wants_to_be_hired/'}
{'title': '[R] SEA-VL: A Large-Scale Culturally-Relevant Vision-Language Dataset for Southeast Asian Languages', 'link': '/r/MachineLearning/comments/1ja9stg/r_seavl_a_largescale_culturallyrelevant/'}
{'title': 'Gamers on Reddit rely on Reddit for trustworthy information. Plus, they’re 27% more likely to buy products they see advertised.', 'link': 'https://alb.reddit.com/cr?za=HCHlQ7lZjpfIo8i5IC0TpWfWvnaqfhFYV5tKq1pmafv5r7hFz08uKkDIKdBL4OjceaXUYrauq8lT8eJ8BQaJ0bn2AMJpZYL4_a7oR8qmAK0g0oIkhwsc7jihAm4Fkj_TnbID8ydDZ9nwKlb-Zl2hSowfbaWYmppYuwR3meFjrleXgzaH5jGcZEOwBxzFK-Pto-jaLS7Fsbb3q3OeC9_a6E3tLHYNGgVgn8tsTPFU8RDrvfdWVpVTVhRIUqn9EDsWHYuMBWEySiT3qC5QD5utSHJemr9wFjwV0R2XM_K4iAGHtevUzfvKjVrbMeqziQ0GmPRFnOLiIbjBQnV72cBrlYsXAahAh

### Scrape Reddit with pagination and post metadata [title, link, votes, comments, author, date]

In [8]:
import requests
from bs4 import BeautifulSoup

def _find_comments_link(post, title_link):
    """Return the anchor holding a post's comment count, or None if absent."""
    # Prefer the anchor tagged with the "comments" CSS class.
    # NOTE(review): assumes old.reddit marks the comments link this way; the
    # text search below covers markup where it does not.
    link = post.find("a", class_="comments")
    if link is not None:
        return link

    def mentions_comment(text):
        return bool(text) and "comment" in text.lower()

    # Fallback: first anchor whose text mentions "comment". The title anchor
    # is excluded, otherwise a title containing that word would be reported
    # as the comment count.
    for candidate in post.find_all("a", string=mentions_comment):
        if candidate is not title_link:
            return candidate
    return None


def scrape_reddit(subreddit, pages=1, timeout=10):
    """Collect post metadata from a subreddit's old.reddit.com listing.

    Args:
        subreddit: Subreddit name. Surrounding whitespace/slashes and a leading
            "r/" are ignored, so "india", "r/india" and "/r/india/" all target
            the same listing.
        pages: Maximum number of listing pages to fetch; pagination follows the
            "next" button of each page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts in page order with the keys "title", "link", "votes",
        "comments", "author" and "date". All values are strings taken from the
        page as-is; missing fields fall back to "N/A" (votes), "0 comments"
        (comments) and "Unknown" (author, date). Fetching stops early
        (returning what was collected so far) on a non-200 response or when no
        further page is linked.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Normalise the name so "r/india" does not become ".../r/r/india/".
    name = str(subreddit).strip().strip("/")
    if name[:2].lower() == "r/":
        name = name[2:]

    base_url = f"https://old.reddit.com/r/{name}/"
    headers = {"User-Agent": "Mozilla/5.0"}
    next_page = base_url
    all_posts = []

    for _ in range(pages):
        # Without a timeout a stalled connection would block forever.
        response = requests.get(next_page, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print("Failed to fetch data")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract posts; skip listing entries that lack a usable title anchor
        # instead of raising AttributeError/KeyError on them.
        for post in soup.find_all("div", class_="thing"):
            title_link = post.find("a", class_="title")
            href = title_link.get("href") if title_link is not None else None
            if not href:
                continue

            score = post.find("div", class_="score unvoted")
            comments_link = _find_comments_link(post, title_link)
            author_link = post.find("a", class_="author")
            time_tag = post.find("time")
            # A <time> tag without a datetime attribute is treated as missing.
            timestamp = time_tag.get("datetime") if time_tag is not None else None

            all_posts.append({
                "title": title_link.text,
                "link": href,
                "votes": score.text if score is not None else "N/A",
                "comments": comments_link.text if comments_link is not None else "0 comments",
                "author": author_link.text if author_link is not None else "Unknown",
                "date": timestamp or "Unknown",
            })

        # Follow the next-page button; stop when it (or its link) is missing.
        next_button = soup.find("span", class_="next-button")
        next_link = next_button.find("a") if next_button is not None else None
        next_href = next_link.get("href") if next_link is not None else None
        if not next_href:
            break  # No more pages
        next_page = next_href

    return all_posts

In [None]:
# Presumably the maximum number of posts to collect; it is not used in the
# cells visible here -- TODO confirm against the code that consumes it.
POST_LIMIT = 500

# India-related subreddits, each written with its "r/" prefix.
indian_subreddits = [
    "r/india", "r/IndianGaming", "r/IndianFood", "r/desis", "r/IndiaSpeaks",
    "r/bollywood", "r/IndianMusic", "r/IndianFashionAddicts",
    "r/IndianPeopleFacebook", "r/AskIndia", "r/IndianDiaspora", "r/Sikh",
    "r/hindustan", "r/TwoXIndia", "r/Chennai", "r/Bangalore", "r/Mumbai",
    "r/Kolkata", "r/delhi", "r/indiauncensored", "r/IndiaInvestments",
    "r/IndianArt", "r/IndianProgramming", "r/SouthAsianFood",
    "r/IndianFootball", "r/IndianMusicExchange", "r/indiadiscussion",
    "r/IndiaSocial", "r/IndianMemes", "r/IndianHistory", "r/IndianPolitics",
    "r/Cricket", "r/IndianStockMarket", "r/HindutvaWatch", "r/NorthEastIndia",
]

# Second name bound to the very same list object (not a copy).
subreddit_list = indian_subreddits