In [2]:
import praw
from prawcore.exceptions import NotFound, Forbidden
import datetime
import csv
import pandas as pd
import os

# Fill below fields
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    redirect_uri="",
    password="",
    user_agent="",
    username="",
)
print(reddit.user.me())

Ill_Action_9778


# Get list of subreddits

In [3]:
# Grab subreddits from suspicious users list
sus_usr_ser = pd.read_csv("sus_usrs_subreddits.csv")['subreddit_name_prefixed']
sus_usr_ser

0       Bad_Cop_No_Donut
1                   news
2       BlackLivesMatter
3             The_Donald
4           blackculture
              ...       
1099       redditrequest
1100             toosoon
1101          dogemarket
1102          dogemining
1103         provenlands
Name: subreddit_name_prefixed, Length: 1104, dtype: object

In [4]:
# Grab political subreddits: https://web.archive.org/web/20190526161627/https://www.reddit.com/r/politics/wiki/relatedsubs
left_ser = pd.read_csv("political_left.csv")['Subreddit']
right_ser = pd.read_csv("political_right.csv")['Subreddit']
gen_political_ser = pd.read_csv("political_other.csv")['Subreddit']
all_political_ser = pd.concat([left_ser, right_ser, gen_political_ser])
print(all_political_ser.shape)
all_political_ser.drop_duplicates(inplace=True)
print(all_political_ser.shape)


(116,)
(113,)


In [5]:
# From https://subredditstats.com/list/post-votes
pop_ser = pd.read_csv("popular_by_subscribers.csv")['Subreddit']
pop_ser = pop_ser[~pop_ser.isin(all_political_ser)]
pop_ser

0                  funny
1              AskReddit
2                 gaming
3                    aww
4              worldnews
             ...        
95                AskMen
96    BlackPeopleTwitter
97           programming
98                   PS4
99     malefashionadvice
Name: Subreddit, Length: 100, dtype: object

In [6]:
sus_usr_ser = sus_usr_ser[~sus_usr_ser.isin(all_political_ser) & ~sus_usr_ser.isin(pop_ser)]
sus_usr_ser

2       BlackLivesMatter
3             The_Donald
4           blackculture
5             blackpower
6            AsABlackMan
              ...       
1099       redditrequest
1100             toosoon
1101          dogemarket
1102          dogemining
1103         provenlands
Name: subreddit_name_prefixed, Length: 1021, dtype: object

In [7]:
def construct_combo_subreddit_str(subreddits_list):
    subreddits_str = ''
    for sub in subreddits_list:
        subreddits_str += f"{sub}+"
    print(subreddits_str[:-1])
    return subreddits_str[:-1]

In [8]:
political_subreddits = reddit.subreddit(construct_combo_subreddit_str(all_political_ser.to_list()))
popular_subreddits = reddit.subreddit(construct_combo_subreddit_str(pop_ser.to_list()))
sus_usrs_subreddits = reddit.subreddit(construct_combo_subreddit_str(sus_usr_ser.to_list()))

AllTheLeft+Classical_Liberals+CornbreadLiberals+Democrats+Demsocialist+GreenParty+Labor+Leftcommunism+Leninism+Liberal+NeoProgs+Obama+Progressive+SocialDemocracy+Socialism+Conservative+Conservatives+Monarchism+New_Right+Objectivism+Paleoconservative+Republican+Republicans+Romney+TrueObjectivism+ACTA+AntiWar+Bad_Cop_No_Donut+BadGovNoFreedom+BasicIncome+CISPA+Democracy+EndlessWar+EnoughPaulSpam+EnoughObamaSpam+Environment+FairTax+FlushTheTPP+evolutionReddit+FirstAmendment+Good_Cop_Free_Donut+Greed+Green+GunPolitics+GunsAreCool+HumanRights+Liberty+NSA+NSALeaks+PoliticalActivism+Privacy+ProChoice+ProGun+RestoreTheFourth+Rootstrikers+SaveTheConstitution+SOPA+SupportTheTPP+Wikileaks+2012Elections+2016_elections+AmericansElect+Campaigns+ElectionPolls+ElectionReform+Elections+Forecast2016+PeoplesParty+RunForIt+Voting+AmericanGovernment+AmericanPolitics+AnythingGoesNews+Ask_Political_Science+Ask_Politics+Authoritarian+Comparative+Conspiracy+Conspiratard+DescentIntoTyranny+debatefascism+DoctorsW

In [10]:
data_list = []
csv_name = None
# Capturing 4 month period of events from 2020 elections, Jan. 6 riot
start_date = datetime.datetime(2023, 9, 8)
end_date = datetime.datetime(2023, 10, 8)

def save_data_list():
    global data_list; global csv_name
    print(f"[{datetime.datetime.now()}] Saving {len(data_list)} entries...")
    df = pd.DataFrame(data_list)
    df.drop_duplicates(subset="id", keep="last", inplace=True)
    # Display the DataFrame
    if os.path.isfile(f"{csv_name}.csv"):
        df.to_csv(f"{csv_name}.csv", mode='a', header=False)
    else:
        df.to_csv(f"{csv_name}.csv")
    # Erase to save memory
    data_list.clear()
    df.iloc[0:0]

# Function to recursively extract comments and replies
def extract_comments(comment, lvl=0):
    global data_list; global csv_name
    if lvl == 2:
         return [] # stop from nesting too much

    comments_list = []

    # Comment attributes can collect: https://praw.readthedocs.io/en/stable/code_overview/models/comment.html
    try:
        comments_list.append({
            'type': 'comment',
            'author': comment.author, 
            'body': comment.body, 
            'created_utc': comment.created_utc, 
            'id': comment.id, 
            'link_id': comment.link_id, 
            'parent_id': comment.parent_id, 
            'score': comment.score, 
            'subreddit': comment.subreddit.display_name, 
        })

        for reply in comment.replies:
            # Recursively extract replies to the reply
            comments_list.extend(extract_comments(reply, lvl=lvl+1))
            if len(comments_list) > 100: # Comment list getting large, write to file to save memory
                data_list.extend(comments_list)
                save_data_list()
                comments_list.clear()
    except Exception as e:
        print(f"[{datetime.datetime.now()}] Skipping adding this comment due to: {e}")
             
    return comments_list

def scrape_subreddits_to_csv(subreddits, name, sortmode='new', target_num_posts=100, save_every_n_posts=100): # sortmode options: https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#obtain-submission-instances-from-a-subreddit
    global data_list; global csv_name
    start_time = datetime.datetime.now()
    print(f"[{start_time}] Attempting to scrape posts for {name}, sorted by {sortmode}...")
    print(f"Only saving posts created between {start_date} and {end_date}")
    
    csv_name = f"{name}_{sortmode}"
    # Extract and append posts and comments to the list
    data_list = []
    subreddit_sort = getattr(subreddits, sortmode)
    posts = subreddit_sort(limit=None)
    count_posts = 0
    skip_recent_posts = 0
    exit = False
    done = False
    try:
        while not done:
            for post in posts:
                if exit == True:
                    break
                retry_post = 0
                while True:
                    try:
                        # Filter range of time to grab
                        if start_date.timestamp() > post.created_utc:
                            exit = True
                            break
                        if end_date.timestamp() < post.created_utc:
                            skip_recent_posts += 1
                            if skip_recent_posts % 10000 == 0:
                                print(f"[{datetime.datetime.now()}] [{count_posts}] Still skipping, currently at: {datetime.datetime.fromtimestamp(post.created_utc)}")
                            break
                        count_posts += 1
                        if count_posts % 500 == 0:
                            print(f"[{datetime.datetime.now()}] [{count_posts}] Post created date: {datetime.datetime.fromtimestamp(post.created_utc)}")
                        
                        # Below are all the fields we'll request from PRAW for each post
                        # Submission attributes can collect: https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
                        post_dict = {
                            'type': 'post',
                            'author': post.author, 
                            'created_utc': post.created_utc, 
                            'id': post.id, 
                            'locked': post.locked, 
                            'name': post.name, 
                            'num_comments': post.num_comments, 
                            'score': post.score, 
                            'selftext': post.selftext, 
                            'subreddit': post.subreddit.display_name, 
                            'title': post.title, 
                            'upvote_ratio': post.upvote_ratio,
                        }
                        data_list.append(post_dict)
                        num_retries=0
                        while True:
                            try:
                                post.comments.replace_more(limit=None)
                                break
                            except praw.exceptions.DuplicateReplaceException as e:
                                print(f"[{datetime.datetime.now()}] [{count_posts}] [{post.id}] Ran into DuplicateReplaceException, continue to next post")
                                break
                            except Exception as e:
                                print(f"[{datetime.datetime.now()}] [{count_posts}] [{post.id}] [{num_retries}] {e}")
                                num_retries += 1
                                continue
                        for comment in post.comments.list():
                            data_list.extend(extract_comments(comment))
                        if count_posts % save_every_n_posts == 0 and len(data_list) > 0:
                            print(f"[{datetime.datetime.now()}] Saving #{count_posts // save_every_n_posts} batch of {save_every_n_posts} posts...")
                            save_data_list()
                        break
                    except (NotFound, Forbidden) as e:
                        print(f"[{retry_post}][{datetime.datetime.now()}] [{count_posts}] {e} (Continuing to next post...)")
                        break
                    except Exception as e:
                        retry_post += 1
                        print(f"[{retry_post}][{datetime.datetime.now()}] [{count_posts}] {e} ")
                        if retry_post == 10:
                            print("(Continuing to next post...)")
                            break
                        continue
            done = True
    except Exception as e:
        print(f"[{datetime.datetime.now()}] [{count_posts}] {e}")
        pass

    end_time = datetime.datetime.now()
    print(f"Elapsed time: {end_time - start_time}")
    print(f"Skipped {skip_recent_posts} posts that were created in the last 24 hours.")
    if len(data_list) > 0:
        save_data_list()

    
# scrape_subreddits_to_csv(political_subreddits, "political_subreddits", target_num_posts=4, save_every_n_posts=2)
# scrape_subreddits_to_csv(political_subreddits, "political_subreddits")
# scrape_subreddits_to_csv(popular_subreddits, "popular_subreddits")

In [11]:
sus_usr_sers = [sus_usr_ser[i:i+100] for i in range(0, len(sus_usr_ser), 100)]

In [12]:
for i in range(1, len(sus_usr_sers)):
    scrape_subreddits_to_csv(reddit.subreddit(construct_combo_subreddit_str(sus_usr_sers[i].to_list())), f"sus_usr_subreddits_{i}")

TheAmericanPresidency+HillaryForPrison2016+Israel+policeporn+Police_v_Video+AmIFreeToGo+Blackpeople+privacy+100BlackMen+PINAC+blackinamerica+women+DisneyMovies+YoungRepublicans+EverythingScience+blackactivism+TalkShowGirls+Patriots+MovieDownloads+Pictures+TheGameOfThronesGame+policebrutality+education+selfiehunters+climatechange+racism_immigration+transgender+ar15+FunnyandSad+USNEWS+olympics+copwatch+human_rights+humanrights+activism+Boxer+Foodnews+spicy+merica+socialism+republicanmemes+blackpeoplegifs+ProtectAndServe+jealousobamas+abortion+NewsOfTheStupid+cats+humor+EnoughLibertarianSpam+ContemporaryArt+statistics+AnimalGIFs+raditt+oklahoma+knives+hockeyplayers+hockey+sissykik+Gaykikpals+StreetArtPorn+Presidents+QuotesPorn+TinyHouses+misc+lylestevik+DoeCases+itookapicture+Celebs+Bad_Cop_No_Donut_meta+LGBTnews+Funnypics+ftm+GaySoundsShitposts+Veterans+CryptoCurrencies+doge+football+gay+HistoryPorn+USHistory+BenFranklinFanClub+adtech+Ripple+funnysigns+badcopnodoughnut+CopsBeingJerks+aww