In [1]:
import pandas as pd
import numpy as np
import pprint as pp
import re, string
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import praw

import credentials as creds

In [2]:
# https://praw.readthedocs.io/en/latest/getting_started/
# https://www.reddit.com/dev/api/

In [3]:
CLIENT_ID = creds.client_id()
CLIENT_SECRET_KEY = creds.client_secret_key()


r = praw.Reddit(client_id = CLIENT_ID,
                client_secret = CLIENT_SECRET_KEY,
                user_agent = 'RedditorMatch')

print(r.read_only)

True


In [4]:
def getThreads(subreddit, limit = 1):
    thread_ids = []
    
    hotThreads = subreddit.hot(limit = limit)
    time.sleep(2)
    topThreads = subreddit.top(limit = limit)
    time.sleep(2)
    newThreads = subreddit.new(limit = limit)    
    
    for t in hotThreads:
        if (t.id not in thread_ids):
            thread_ids.append(t.id)
        
    for t in topThreads:
        if (t.id not in thread_ids):
            thread_ids.append(t.id)
        
    for t in newThreads:
        if (t.id not in thread_ids):
            thread_ids.append(t.id)
    
    return(thread_ids)

In [5]:
def getUsersFromSubreddit(subredditIds):
    if(len(subredditIds) < 1):
        return None
    
    usernames = []
    
    ## this process takes a while...    

    for thread_id in subredditIds:        
        thread = r.submission(id = thread_id)
        time.sleep(2)
        
        try:
            username = thread.author.name # gets the author of the thread 
            if (username is not None and username not in usernames):
                print("--- Adding user:", username)
                usernames.append(username)
        except:
            print("------ Unable to get thread author. Moving on...")      

        # get all authors of the comments in the thread
        allComments = thread.comments.list()
        for c in allComments:           
            username = str(c.author)
            if (username is not None and username not in usernames):
                print("--- Adding user:", username)
                usernames.append(username)
                
        time.sleep(2)
                
    return(usernames)      

In [6]:
def cleanText(text):
    text = text.replace(',', ' ').replace(':', ' ')\
                .replace('...', ' ')\
                .replace('?', ' ')\
                .replace('!', ' ')\
                .replace(';', ' ')\
                .replace('\n', ' ').replace('\r', '') # replace newlines and page breaks     
                
    text = re.sub(r'([^\s\w]|_)+', '', text) # remove non-alphanumeric characters but leave the spaces
    text = re.sub(' +',' ', text) # remove double spaces        
    return(text)

In [7]:
def getRedditor(username):
    try:     
        username = username.strip()
        user = r.redditor(username)
    except Exception as e: 
        print(e)
        return None
    
    return(user)

In [8]:
def getUserComments(username, commentsLimit = 1000):   
    userComments = []
    

    counter = 0
    while (counter <= 3):
        user = getRedditor(username)
        if (user is not None):
            break
        else:
            counter = counter + 1               
            print("------------ Retry #", counter, "for user:", username)
            time.sleep(15)
        
    if (user is None):
        print("------ ERROR: unable to retrieve user info:", username)
        return None  
    
    time.sleep(2)
    
    try:
        newComments = user.comments.new(limit = commentsLimit)
    except Exception as e: 
        print(e)
        print("------ ERROR: unable to retrieve new comments. moving on...")
        newComments = None
    
    time.sleep(2)
    
    try:
        hotComments = user.comments.hot(limit = commentsLimit)
    except Exception as e: 
        print(e)
        print("------ ERROR: unable to retrieve hot comments. moving on...")
        hotComments = None
    
    time.sleep(2)
    
    try:
        controversialComments = user.comments.controversial(limit = commentsLimit)
    except Exception as e: 
        print(e)
        print("------ ERROR: unable to retrieve controversial comments. moving on...")
        controversialComments = None

    # new comments    
    try:
        if (newComments is not None):
            for c in newComments:
                comment = cleanText(c.body)        
                if (comment not in userComments):
                    userComments.append(comment)
    except:
        print("Failed...moving on...")
    
    # hot comments
    try:
        if (hotComments is not None):
            for c in hotComments:  
                comment = cleanText(c.body)        
                if (comment not in userComments):
                    userComments.append(comment)
    except:
        print("Failed...moving on...")

    # controversial comments
    try:
        if (controversialComments is not None):
            for c in controversialComments:
                comment = cleanText(c.body)     
                if (comment not in userComments):
                    userComments.append(comment)
    except:
        print("Failed...moving on...")
        
    print("------ Retrieved", len(userComments), "comments for:", username)
    return(userComments)

In [9]:
def subExists(subreddit):
    exists = True
    try:
        r.subreddits.search_by_name(subreddit, exact = True)
    except:
        exists = False
    return(exists)

In [None]:
def scrapeCommentsFromSubreddit(subreddit):
    
    UserArray = []
    CommentArray = []  
    
    exists = subExists(subreddit)
    
    if (exists == False):
        print("ERROR: subreddit '", subreddit, "' does not exist...")
        return None
    
    sub = r.subreddit(subreddit)
    
    threads = getThreads(sub)
    
    if (len(threads) < 1):
        print("ERROR: unable to retrieve threads for subreddit:", subreddit)
    else:
        print("Retrieved ", len(threads), "threads...")
    
    users = getUsersFromSubreddit(threads)
    
    if (len(users) < 1):
        print("ERROR: unable to retrieve users for subreddit:", subreddit)
    else:
        print("Retrieved ", len(users), "users...")
    
    for user in users:
        try:
            userComments = getUserComments(user)

            if (userComments is not None):
                userComments = ".".join(userComments)
                UserArray.append(user)
                CommentArray.append(userComments)
        except:
            print("ERROR: something failed badly. moving on...")
        
    df = pd.DataFrame(UserArray, columns = ["Username"])
    df["Comments"] = CommentArray
    
    print("Outputting dataframe with ", len(df), "records")
    
    return(df)

In [None]:
mizzou = scrapeCommentsFromSubreddit("mizzou")

mizzou.to_csv("data/mizzou.csv", index = False)

Retrieved  3 threads...
--- Adding user: WorseThanHipster
--- Adding user: oluek
--- Adding user: RedInsulatedPatriot
--- Adding user: sahtopi
--- Adding user: rickjuly252012
--- Adding user: BottomsMU
--- Adding user: None
--- Adding user: BenisonBT101
--- Adding user: EasPerFunSkAt
--- Adding user: UpboatOrNoBoat
--- Adding user: danwin
--- Adding user: RobinIsAGoodfellow
--- Adding user: RichardTBarber
--- Adding user: Apatches
--- Adding user: senorworldwide
--- Adding user: v1ct0r1us
--- Adding user: Fredfries333
--- Adding user: herple_derpskin
--- Adding user: notfarenough
--- Adding user: Ben--Affleck
--- Adding user: ReasonOz
--- Adding user: Squidler31
--- Adding user: TheSliceman
--- Adding user: billy492
--- Adding user: tigris1427
--- Adding user: Jackaboonie
--- Adding user: razeal113
--- Adding user: bat_mayn
--- Adding user: Feartality
--- Adding user: CoMoBroLetsGo
--- Adding user: mimipath123456
--- Adding user: sparkyumr98
--- Adding user: xAIRGUITARISTx
--- Adding u

In [None]:
umich = scrapeCommentsFromSubreddit("uofm")

umich.to_csv("data/umich.csv", index = False)

In [None]:
chicago = scrapeCommentsFromSubreddit("chicago")

chicago.to_csv("data/chicago.csv", index = False)

In [None]:
columbia = scrapeCommentsFromSubreddit("columbiamo")

columbia.to_csv("data/columbia.csv", index = False)

In [None]:
dems = scrapeCommentsFromSubreddit("democrats")

dems.to_csv("data/dems.csv", index = False)

In [None]:
reps = scrapeCommentsFromSubreddit("republicans")

reps.to_csv("data/reps.csv", index = False)

In [None]:
bball = scrapeCommentsFromSubreddit("CollegeBasketball")

bball.to_csv("data/bball.csv", index = False)

In [None]:
austin = scrapeCommentsFromSubreddit("austin")

austin.to_csv("data/austin.csv", index = False)