### Installing Required Packages

In [1]:
%pip install praw python-dotenv

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m189.3/189.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


### Import Required Packages

In [7]:
import praw  # Reddit API client used to build the `reddit` object
import csv  # writes the collected posts to CSV files
import os  # reads environment variables and checks/inspects files
from dotenv import load_dotenv  # loads variables from an env file

# Confirmation that every import above resolved.
print("‚úÖ Libraries imported successfully!")

‚úÖ Libraries imported successfully!


### Mounting Google Drive

In [3]:
# Mount Google Drive at /content/drive; this import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load Environment Variables and Authenticate with the Reddit API
Load the Reddit API credentials from the environment file, then establish a connection to Reddit using PRAW with the loaded credentials.

In [39]:
from dotenv import load_dotenv
import os

ENV_FILE = "reddit_api.env"  # change the name to your actual reddit env file name

# load_dotenv reports whether it loaded anything. Only warn here, because the
# credentials may already be present in the process environment.
if not load_dotenv(ENV_FILE):
    print(f"Warning: nothing was loaded from '{ENV_FILE}'; using existing environment variables.")

# Fail early, naming the missing settings and the file they were expected in.
# REDDIT_CLIENT_SECRET is not checked: PRAW accepts None there for installed apps.
missing = [name for name in ("REDDIT_CLIENT_ID", "REDDIT_USER_AGENT") if not os.getenv(name)]
if missing:
    raise RuntimeError(
        f"Missing Reddit credentials: {', '.join(missing)}. "
        f"Define them in '{ENV_FILE}' or in the environment."
    )

reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    user_agent=os.getenv("REDDIT_USER_AGENT")
)
# NOTE(review): no request has been sent at this point, so this message only
# confirms that the client object was created; bad credentials will surface on
# the first API call.
print("‚úÖ Reddit API authenticated successfully!")
print(f"Read-only mode: {reddit.read_only}")


‚úÖ Reddit API authenticated successfully!
Read-only mode: True


## Define Data Collection Function

Create a function that downloads the current hot posts from one or more subreddits and saves them to a single CSV file, with basic error handling.


In [25]:
def download_recent_posts(subreddits, limit=100, filename="reddit_data.csv"):
    """
    Download 'hot' posts from one or more subreddits and write to a single CSV.

    Every post is fetched before the output file is opened, so a failure while
    talking to Reddit leaves an existing file at `filename` untouched instead of
    truncating it.

    Args:
        subreddits (str | list[str] | tuple[str, ...]): subreddit name or names
        limit (int): posts per subreddit
        filename (str): output CSV file

    Returns:
        bool: True when the CSV was written; False when fetching or writing
            failed (the error is printed, not raised).

    Raises:
        ValueError: if no subreddit name is provided.
    """
    # normalize input: a single name or a tuple becomes a list
    if isinstance(subreddits, str):
        subreddits = [subreddits]
    elif isinstance(subreddits, tuple):
        subreddits = list(subreddits)
    if not isinstance(subreddits, list) or not subreddits:
        raise ValueError("Provide at least one subreddit name.")

    header = [
        "Title", "Score", "Upvote_ratio", "Num_comments", "Author", "Subreddit",
        "URL", "Permalink", "Created_utc", "Is_self", "Selftext", "Flair", "Domain", "Search_query"
    ]

    try:
        # Phase 1: fetch everything into memory.
        rows = []
        for sub in subreddits:
            print(f"üì• Collecting hot posts from r/{sub} ...")
            for post in reddit.subreddit(sub).hot(limit=limit):
                # A falsy author is recorded with a placeholder.
                author = str(post.author) if post.author else "[deleted]"
                # Keep at most the first 500 characters of the body text.
                selftext = post.selftext[:500] if getattr(post, "selftext", None) else None

                rows.append([
                    post.title,
                    post.score,
                    getattr(post, "upvote_ratio", None),
                    post.num_comments,
                    author,
                    str(post.subreddit),
                    post.url,
                    f"https://reddit.com{post.permalink}",
                    int(post.created_utc),
                    post.is_self,
                    selftext,
                    getattr(post, "link_flair_text", None),
                    getattr(post, "domain", None),
                    None  # hot posts have no search query
                ])

        # Phase 2: write the file only once all fetching has succeeded.
        with open(filename, mode="w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(header)  # header once
            writer.writerows(rows)

        print(f"‚úÖ Saved {len(rows)} posts from {len(subreddits)} subreddits to '{filename}'.")
        return True

    except praw.exceptions.PRAWException as e:
        print(f"‚ùå Reddit API error: {e}")
        return False
    except Exception as e:
        # Best-effort by design: report the problem and signal failure to the caller.
        print(f"‚ùå Unexpected error: {e}")
        return False


print("‚úÖ Function defined successfully!")


‚úÖ Function defined successfully!


## Define Search Query Function

In [16]:
def search_posts(query, subreddits, limit=50, filename="reddit_search.csv"):
    """
    Search Reddit posts across one or more subreddits and save results to a CSV file.

    All matches are fetched before the output file is opened, so a failure while
    searching leaves an existing file at `filename` untouched instead of
    truncating it.

    Args:
        query (str): Keyword to search for (e.g., "GPT-4")
        subreddits (str | list[str] | tuple[str, ...]): Subreddit name or names
            (e.g., ["datascience", "MachineLearning"])
        limit (int): Number of posts per subreddit to fetch
        filename (str): Output CSV file name

    Returns:
        bool: True when the CSV was written; False when searching or writing
            failed (the error is printed, not raised).

    Raises:
        ValueError: if `query` is not a non-empty string or no subreddit is given.
    """
    if not query or not isinstance(query, str):
        raise ValueError("Query must be a non-empty string.")
    # A single name or a tuple becomes a list.
    if isinstance(subreddits, str):
        subreddits = [subreddits]
    elif isinstance(subreddits, tuple):
        subreddits = list(subreddits)
    if not isinstance(subreddits, list) or len(subreddits) == 0:
        raise ValueError("Please provide at least one subreddit name.")

    header = [
        "Title", "Score", "Upvote_ratio", "Num_comments", "Author", "Subreddit",
        "URL", "Permalink", "Created_utc", "Is_self", "Selftext",
        "Flair", "Domain", "Search_query"
    ]

    try:
        # Phase 1: run every search and keep the rows in memory.
        rows = []
        for sub in subreddits:
            subreddit = reddit.subreddit(sub)
            print(f"üîç Searching '{query}' in r/{sub} ...")

            for post in subreddit.search(query, limit=limit):
                # A falsy author is recorded with a placeholder.
                author_name = str(post.author) if post.author else "[deleted]"
                # Keep at most the first 500 characters of the body text.
                selftext = post.selftext[:500] if post.selftext else None

                rows.append([
                    post.title,
                    post.score,
                    getattr(post, "upvote_ratio", None),
                    post.num_comments,
                    author_name,
                    str(post.subreddit),
                    post.url,
                    f"https://reddit.com{post.permalink}",
                    int(post.created_utc),
                    post.is_self,
                    selftext,
                    getattr(post, "link_flair_text", None),
                    getattr(post, "domain", None),
                    query  # records which keyword produced this row
                ])

        # Phase 2: write the file only once all searches have succeeded.
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)

        print(f"‚úÖ Completed search for '{query}'. Total posts saved: {len(rows)}")
        return True

    except Exception as e:
        # Best-effort by design: report the problem and signal failure to the caller.
        print(f"‚ùå Error during search: {e}")
        return False
print("‚úÖ Function defined successfully!")

‚úÖ Function defined successfully!


## Configure Parameters

Set the parameters for data collection: the subreddits to read, the number of posts to fetch from each, and the output filename.

In [32]:
# Configuration parameters
# NOTE: `subreddit` holds a list of names; the name is kept because the
# collection cell passes this variable to download_recent_posts.
subreddit = ["datascience", "MachineLearning", "learnpython"]  # Subreddits to download posts from
limit = 100  # Number of posts to download from each subreddit
filename = "reddit_data.csv"  # Name of the CSV file

print(f"üìã Configuration:")
# Join the names so the list renders as "r/a, r/b" instead of a Python list repr.
print(f"   Subreddits: {', '.join('r/' + name for name in subreddit)}")
print(f"   Posts per subreddit: {limit}")
print(f"   Output file: {filename}")


üìã Configuration:
   Subreddit: r/['datascience', 'MachineLearning', 'learnpython']
   Posts to download: 100
   Output file: reddit_data.csv


## Execute Data Collection

Run the data collection function to download the hot posts from the configured subreddits.

In [34]:
# Run the hot-post download with the values from the configuration cell.
success = download_recent_posts(subreddit, limit, filename)

# One print call; the message is chosen from the flag the function returned.
print(
    "\nüéâ Data collection completed successfully!"
    if success
    else "\n‚ùå Data collection failed. Please check the error messages above."
)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üì• Collecting hot posts from r/datascience ...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üì• Collecting hot posts from r/MachineLearning ...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üì• Collecting hot posts from r/learnpython ...
‚úÖ Saved 300 posts from 3 subreddits to 'reddit_data.csv'.

üéâ Data collection completed successfully!


## Run the Search Query

In [35]:
# Search settings: keyword, subreddits to search in, and the output file.
filename_search = "reddit_search.csv"
subreddits = ["datascience", "MachineLearning", "learnpython"]
query = "GPT-4"

# Fetch up to 30 matches per subreddit and keep the success flag.
success = search_posts(query=query, subreddits=subreddits, limit=30, filename=filename_search)

# One print call; the message is chosen from the flag the function returned.
print(
    f"üéâ Search results saved to {filename_search}"
    if success
    else "‚ùå Something went wrong during search."
)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîç Searching 'GPT-4' in r/datascience ...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîç Searching 'GPT-4' in r/MachineLearning ...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîç Searching 'GPT-4' in r/learnpython ...
‚úÖ Completed search for 'GPT-4'. Total posts saved: 90
üéâ Search results saved to reddit_search.csv


## Merge two datasets

In [36]:
import pandas as pd

# Load both CSVs
df_hot = pd.read_csv("reddit_data.csv")
df_search = pd.read_csv("reddit_search.csv")

# Combine: hot posts first, then search results.
df_combined = pd.concat([df_hot, df_search], ignore_index=True)

# A post present in both files appears twice, and the hot copy (kept by
# drop_duplicates) has an empty Search_query. Copy the first non-null query of
# each permalink onto all of its rows so the label survives deduplication.
df_combined["Search_query"] = (
    df_combined.groupby("Permalink")["Search_query"].transform("first")
)

# Deduplicate without mutating in place; the first occurrence of each post wins.
df_final = df_combined.drop_duplicates(subset=["Permalink"]).reset_index(drop=True)

# Save the final combined file.
# NOTE: this overwrites the hot-posts file; the analysis cells read the merged
# data back from this same path.
df_final.to_csv("reddit_data.csv", index=False)

print(f"‚úÖ Final reddit_data.csv saved with {len(df_final)} unique posts.")

‚úÖ Final reddit_data.csv saved with 387 unique posts.


## Data Analysis

In [37]:
import pandas as pd
filename = "reddit_data.csv"


def preview_text(text, width):
    """Return `text` as a string cut to `width` characters.

    The "..." suffix is added only when something was actually cut off. Non-string
    values (for example a missing title read as NaN) are converted with str().
    """
    text = str(text)
    return text if len(text) <= width else text[:width] + "..."


# Check if file exists and load data
if os.path.exists(filename):
    # Load the CSV file
    df = pd.read_csv(filename)

    print(f"üìä Dataset Overview:")
    print(f"   Total posts: {len(df)}")
    print(f"   Columns: {list(df.columns)}")
    print(f"   File size: {os.path.getsize(filename)} bytes")

    print(f"\nüìù Sample Posts:")
    print("=" * 50)

    # Display first 3 posts, numbered by position rather than by index label
    for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
        print(f"\nPost {position}:")
        print(f"Title: {preview_text(row['Title'], 80)}")
        print(f"Author: {row['Author']}")
        print(f"Score: {row['Score']}")
        print(f"URL: {row['URL']}")
        print("-" * 30)

    # Basic statistics
    print(f"\nüìà Basic Statistics:")
    print(f"   Average score: {df['Score'].mean():.2f}")
    print(f"   Highest score: {df['Score'].max()}")
    print(f"   Lowest score: {df['Score'].min()}")

else:
    print(f"‚ùå File '{filename}' not found!")


üìä Dataset Overview:
   Total posts: 387
   Columns: ['Title', 'Score', 'Upvote_ratio', 'Num_comments', 'Author', 'Subreddit', 'URL', 'Permalink', 'Created_utc', 'Is_self', 'Selftext', 'Flair', 'Domain', 'Search_query']
   File size: 286177 bytes

üìù Sample Posts:

Post 1:
Title: Weekly Entering & Transitioning - Thread 03 Nov, 2025 - 10 Nov, 2025...
Author: AutoModerator
Score: 1
URL: https://www.reddit.com/r/datascience/comments/1on34xg/weekly_entering_transitioning_thread_03_nov_2025/
------------------------------

Post 2:
Title: How would you turn a working Jupyter pipeline into a small web app?...
Author: Proof_Wrap_2150
Score: 26
URL: https://www.reddit.com/r/datascience/comments/1ommxv4/how_would_you_turn_a_working_jupyter_pipeline/
------------------------------

Post 3:
Title: Is it too early to accept an internship offer?...
Author: LilParkButt
Score: 18
URL: https://www.reddit.com/r/datascience/comments/1om9zgm/is_it_too_early_to_accept_an_internship_offer/
------------

## Additional Analysis

In [38]:
# Additional analysis (optional)
# Runs only when the data-analysis cell has already created a non-empty `df`.
if 'df' in locals() and not df.empty:
    print("üîç Additional Analysis:")
    print("=" * 40)

    # Top scoring posts
    top_posts = df.nlargest(5, 'Score')[['Title', 'Score', 'Author']]
    print("\nüèÜ Top 5 Posts by Score:")
    for i, (_, row) in enumerate(top_posts.iterrows(), 1):
        # Add the ellipsis only when the title was actually shortened.
        title = str(row['Title'])
        shown = title if len(title) <= 60 else title[:60] + "..."
        print(f"{i}. {shown} (Score: {row['Score']}, Author: {row['Author']})")

    # Most active authors
    author_counts = df['Author'].value_counts().head(5)
    print("\nüë• Most Active Authors:")
    for author, count in author_counts.items():
        print(f"   {author}: {count} posts")

    # Posts with text content.
    # The .str accessor fails on a column that is entirely NaN (float dtype), so
    # convert to str first and use notna() to exclude the missing values.
    selftext = df['Selftext']
    has_text = selftext.notna() & (selftext.astype(str).str.len() > 0)
    posts_with_text = df[has_text]
    print(f"\nüìù Posts with text content: {len(posts_with_text)} out of {len(df)}")

else:
    print("‚ùå No data available for analysis")


üîç Additional Analysis:

üèÜ Top 5 Posts by Score:
1. Why do new analysts often ignore R?... (Score: 2473, Author: ElectrikMetriks)
2. My Data Science Manifesto from a Self Taught Data Scientist... (Score: 2078, Author: irndk10)
3. [D] Anyone else witnessing a panic inside NLP orgs of big te... (Score: 1381, Author: thrwsitaway4321)
4. I investigated the Underground Economy of Glassdoor Reviews... (Score: 1159, Author: ibsurvivors)
5. [N] OpenAI may have benchmarked GPT-4‚Äôs coding ability on it... (Score: 1002, Author: Balance-)

üë• Most Active Authors:
   AutoModerator: 11 posts
   DeepAnalyze: 4 posts
   WarChampion90: 4 posts
   seraine: 3 posts
   Singularian2501: 3 posts

üìù Posts with text content: 357 out of 387
