In [1]:
import praw
from prawcore.exceptions import NotFound, Forbidden
import datetime
import csv
import pandas as pd
import os

# Fill below fields
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    redirect_uri="",
    password="",
    user_agent="",
    username="",
)
print(reddit.user.me())

Ill_Action_9778


# Get list of subreddits

In [2]:
# Grab list of submissions id's
submissions_ids = pd.read_csv("submissions.csv")['id']
comments_orig_post_ids = pd.read_csv("comments.csv")['link_id'].str.replace("t3_", "")
all_post_ids = pd.concat([submissions_ids, comments_orig_post_ids], ignore_index=True).drop_duplicates()
print(submissions_ids.size)
print(comments_orig_post_ids.size)
print(all_post_ids)

11053
6711
0        8b7ryg
1        511r3a
2        4xyikx
3        4va15u
4        4v9u9s
          ...  
17758    1zst6o
17759    1zsvn0
17760    1zs1v4
17761    1zskf0
17763    1zpe4j
Length: 14553, dtype: object


In [6]:
data_list = []
csv_name = None
# Capturing 4 month period of events from 2020 elections, Jan. 6 riot
# start_date = datetime.datetime(2023, 9, 8)
# end_date = datetime.datetime(2023, 10, 8)

def save_data_list():
    global data_list; global csv_name
    print(f"[{datetime.datetime.now()}] Saving {len(data_list)} entries...")
    df = pd.DataFrame(data_list)
    df.drop_duplicates(subset="id", keep="last", inplace=True)
    # Display the DataFrame
    if os.path.isfile(f"{csv_name}.csv"):
        df.to_csv(f"{csv_name}.csv", mode='a', header=False)
    else:
        df.to_csv(f"{csv_name}.csv")
    # Erase to save memory
    data_list.clear()
    df.iloc[0:0]

# Function to recursively extract comments and replies
def extract_comments(comment, lvl=0):
    global data_list; global csv_name
    if lvl == 2:
         return [] # stop from nesting too much

    comments_list = []

    # Comment attributes can collect: https://praw.readthedocs.io/en/stable/code_overview/models/comment.html
    try:
        comments_list.append({
            'type': 'comment',
            'author': comment.author, 
            'body': comment.body, 
            'created_utc': comment.created_utc, 
            'id': comment.id, 
            'link_id': comment.link_id, 
            'parent_id': comment.parent_id, 
            'score': comment.score, 
            'subreddit': comment.subreddit.display_name, 
        })

        for reply in comment.replies:
            # Recursively extract replies to the reply
            comments_list.extend(extract_comments(reply, lvl=lvl+1))
            if len(comments_list) > 100: # Comment list getting large, write to file to save memory
                data_list.extend(comments_list)
                save_data_list()
                comments_list.clear()
    except Exception as e:
        print(f"[{datetime.datetime.now()}] Skipping adding this comment due to: {e}")
             
    return comments_list

def scrape_subreddits_to_csv(ids, name, save_every_n_posts=100): # sortmode options: https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#obtain-submission-instances-from-a-subreddit
    global data_list; global csv_name
    start_time = datetime.datetime.now()
    print(f"[{start_time}] Attempting to scrape posts for {name}...")
    # print(f"Only saving posts created between {start_date} and {end_date}")
    
    csv_name = f"{name}"
    # Extract and append posts and comments to the list
    data_list = []
    skip_list = []
    count_posts = 0
    skip_recent_posts = 0
    exit = False
    done = False
    try:
        while not done:
            for id in ids:
                post = reddit.submission(id)
                if exit == True:
                    break
                retry_post = 0
                while True:
                    try:
                        # # Filter range of time to grab
                        # if start_date.timestamp() > post.created_utc:
                        #     exit = True
                        #     break
                        # if end_date.timestamp() < post.created_utc:
                        #     skip_recent_posts += 1
                        #     if skip_recent_posts % 10000 == 0:
                        #         print(f"[{datetime.datetime.now()}] [{count_posts}] Still skipping, currently at: {datetime.datetime.fromtimestamp(post.created_utc)}")
                        #     break
                        count_posts += 1
                        if count_posts % 500 == 0:
                            print(f"[{datetime.datetime.now()}] [{count_posts}] Post created date: {datetime.datetime.fromtimestamp(post.created_utc)}")
                        
                        # Below are all the fields we'll request from PRAW for each post
                        # Submission attributes can collect: https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
                        post_dict = {
                            'type': 'post',
                            'author': post.author, 
                            'created_utc': post.created_utc, 
                            'id': post.id, 
                            'locked': post.locked, 
                            'name': post.name, 
                            'num_comments': post.num_comments, 
                            'score': post.score, 
                            'selftext': post.selftext, 
                            'subreddit': post.subreddit.display_name, 
                            'title': post.title, 
                            'upvote_ratio': post.upvote_ratio,
                        }
                        data_list.append(post_dict)
                        num_retries=0
                        while True:
                            try:
                                post.comments.replace_more(limit=None)
                                break
                            except praw.exceptions.DuplicateReplaceException as e:
                                print(f"[{datetime.datetime.now()}] [{count_posts}] [{post.id}] Ran into DuplicateReplaceException, continue to next post")
                                break
                            except Exception as e:
                                print(f"[{datetime.datetime.now()}] [{count_posts}] [{post.id}] [{num_retries}] {e}")
                                if num_retries == 10:
                                    skip_list.append(id)
                                    print("(Continuing to next post...)")
                                    break
                                else:
                                    num_retries += 1
                                    continue
                        for comment in post.comments.list():
                            data_list.extend(extract_comments(comment))
                        if count_posts % save_every_n_posts == 0 and len(data_list) > 0:
                            print(f"[{datetime.datetime.now()}] Saving #{count_posts // save_every_n_posts} batch of {save_every_n_posts} posts...")
                            save_data_list()
                        break
                    except (NotFound, Forbidden) as e:
                        skip_list.append(id)
                        print(f"[{retry_post}][{datetime.datetime.now()}] [{count_posts}] {e} (Continuing to next post...)")
                        break
                    except Exception as e:
                        retry_post += 1
                        print(f"[{retry_post}][{datetime.datetime.now()}] [{count_posts}] {e} ")
                        if retry_post == 10:
                            skip_list.append(id)
                            print("(Continuing to next post...)")
                            break
                        continue
            done = True
    except Exception as e:
        print(f"[{datetime.datetime.now()}] [{count_posts}] {e}")
        pass

    end_time = datetime.datetime.now()
    print(f"Elapsed time: {end_time - start_time}")
    print(f"Skipped {skip_recent_posts} posts that were created in the last 24 hours.")
    if len(data_list) > 0:
        save_data_list()
    return skip_list


In [18]:
skipped = scrape_subreddits_to_csv(all_post_ids.to_list(), "sus_usrs_posts")
print(f"Skipped {len(skipped)}")

[2023-10-19 20:35:56.292328] Attempting to scrape posts for sus_usrs_posts...
[0][2023-10-19 20:35:57.732783] [7] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:35:59.352352] [16] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:36:00.952183] [24] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:36:09.547216] [74] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:36:10.709764] [81] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:36:10.861123] [82] received 403 HTTP response (Continuing to next post...)
[2023-10-19 20:36:14.053229] Saving #1 batch of 100 posts...
[2023-10-19 20:36:14.053229] Saving 124 entries...
[0][2023-10-19 20:37:02.773453] [164] received 403 HTTP response (Continuing to next post...)
[0][2023-10-19 20:37:27.003382] [188] received 403 HTTP response (Continuing to next post...)
[2023-10-19 20:37:38.966032] Saving #2 batch of 100 posts...
[202

KeyboardInterrupt: 

In [7]:
skipped = scrape_subreddits_to_csv(all_post_ids.to_list()[13632+163:], "sus_usrs_posts_13632+163")
print(f"Skipped {len(skipped)}")

[2023-10-25 08:20:49.070388] Attempting to scrape posts for sus_usrs_posts_13632+163...
[0][2023-10-25 08:21:09.384524] [4] received 403 HTTP response (Continuing to next post...)
[2023-10-25 08:23:14.872327] Saving 3894 entries...
[2023-10-25 08:24:02.421253] [13] [4qfy9d] [0] received 413 HTTP response
[2023-10-25 08:24:02.941253] [13] [4qfy9d] [1] received 413 HTTP response
[2023-10-25 08:24:03.428276] [13] [4qfy9d] [2] received 413 HTTP response
[2023-10-25 08:24:03.913276] [13] [4qfy9d] [3] received 413 HTTP response
[2023-10-25 08:24:04.374277] [13] [4qfy9d] [4] received 413 HTTP response
[2023-10-25 08:24:04.832300] [13] [4qfy9d] [5] received 413 HTTP response
[2023-10-25 08:24:05.327300] [13] [4qfy9d] [6] received 413 HTTP response
[2023-10-25 08:24:05.794300] [13] [4qfy9d] [7] received 413 HTTP response
[2023-10-25 08:24:06.259307] [13] [4qfy9d] [8] received 413 HTTP response
[2023-10-25 08:24:06.737303] [13] [4qfy9d] [9] received 413 HTTP response
[2023-10-25 08:24:07.196300]

In [8]:
print(os.listdir('.'))

['comments.csv', 'data_1month', 'data_exploration_reborn2.ipynb', 'environment.yml', 'fetch_sus_usrs_post_history.ipynb', 'submissions.csv', 'subreddits', 'sus_usrs_posts.csv', 'sus_usrs_posts_13632+13.csv', 'sus_usrs_posts_13632+163.csv', 'sus_usrs_posts_13632.csv', 'users.csv']


In [9]:
# Combine and cleanup
df_list = []
for file in os.listdir('.'):
    if 'sus_usrs' in file and file.endswith(".csv"):
        df_list.append(pd.read_csv(file))
sus_usrs_df = pd.concat(df_list, ignore_index=True).drop_duplicates(subset=['id']).reset_index(drop=True)
sus_usrs_df.to_csv("sus_usrs_posts.csv")

  df_list.append(pd.read_csv(file))
  df_list.append(pd.read_csv(file))
  df_list.append(pd.read_csv(file))


In [10]:
sus_usrs_df.shape

(2299936, 16)