In [1]:
import datetime as dt
import logging
from logging import INFO
import os
import pandas as pd
import praw
import pprint
from psaw import PushshiftAPI
import sys

logging.basicConfig(format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
                    level=INFO,
                    stream=sys.stderr)
logger: logging.Logger = logging

CLIENT_ID = os.environ.get("REDDSCRP_PU_SCRIPT")
SECRET_TOKEN = os.environ.get("REDDSCRP_SECRET")
headers = {"User-Agent": "reddscrape/0.0.1"}
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)

In [347]:
# Fetch posts with minimal PRAW usage

def data_prep_posts(subreddit, query,  start_time, end_time, filters, limit):
    if(len(filters) == 0):
        filters = ['id', 'title', 'created_utc', 'author', 
                'score', 'upvote_ratio', 'num_comments', 'url']                 
                #We set by default some useful columns
                
    api = PushshiftAPI()
    posts = list(api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=start_time,      #Start date
        before=end_time,       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit))          ##Max number of posts
    print(api.metadata_.get('shards'))
    print(posts[0])
    # print(posts[0].d_)
    # id = f"{posts[0].d_['id']}"
    # title = f"{posts[0].d_['title']}"
    # date = int(posts[0].d_['created_utc'])
    # date = dt.datetime.fromtimestamp(date)
    # author = f"{posts[0].d_['author']}"
    # upvotes = f"{posts[0].d_['score']}"
    # upvote_ratio = f"{posts[0].d_['upvote_ratio']}"
    # num_comments = f"{posts[0].d_['num_comments']}"
    # url = f"{posts[0].d_['url']}"

    # print(id)
    # print(title)
    # print(date)
    # print(author)
    # print(upvotes)
    # print(upvote_ratio)
    # print(num_comments)
    # print(url)

    data_dict = {"ID": [], "Title" : [], "Date": [], "Author": [], "Upvotes": [], "Num_Comments": [], "URL": []}

    for post in posts:
        # print(f"POST: {post}")
        # print(f"RATIO: {post.d_['upvote_ratio']}")
        # break
        id = f"{post.d_['id']}"
        fullname = reddit.info(fullnames=[f"t3_{id}"])
        for name in fullname:
            upvotes = name.score
            num_comments = name.num_comments
            url = f"https://www.reddit.com{name.permalink}"
        title = f"{post.d_['title']}"
        date = int(post.d_['created_utc'])
        date = dt.datetime.fromtimestamp(date)
        author = f"{post.d_['author']}"
        # upvotes = f"{post.d_['score']}"
        # upvote_ratio = f"{post.d_['upvote_ratio']}"
        # break
        # num_comments = f"{post.d_['num_comments']}"
        # url = f"{post.d_['url']}"

        data_dict['ID'] += [id]
        data_dict['Title'] += [title]
        data_dict['Date'] += [date]
        data_dict['Author'] += [author]
        data_dict['Upvotes'] += [upvotes]
        # data_dict['Upvote_Ratio'] += [upvote_ratio]
        data_dict['Num_Comments'] += [num_comments]
        data_dict['URL'] += [url]



    # print(f"Title: {posts[0]['title']}")
    # print(f"Date: {posts[0]['created_utc']}")
    # print(f"Author: {posts[0]['author']}")
    # print(f"Score: {posts[0]['score']}")
    # print(f"Upvote_Ratio: {posts[0]['upvote_ratio']}")
    # print(f"Num_Comments: {posts[0]['num_comments']}")
    # print(f"URL: {posts[0]['url']}")
    return pd.DataFrame(data_dict) #Return dataframe for analysis

def data_prep_comments(subreddit, term, filters, limit, end_time=None, start_time=None):
    if (len(filters) == 0):
        filters = ['title', 'id', 'author', 'created_utc',
                'body', 'permalink', 'subreddit']
                #We set by default some useful columns 
    
    comments = list(api.search_comments(
        subreddit=subreddit,
        q=term,                 #Subreddit we want to audit
        after=start_time,       #Start date
        before=end_time,        #End date
        filter=filters,         #Column names we want to retrieve
        limit=limit))           #Max number of comments
    return pd.DataFrame(comments) #Return dataframe for analysis

In [348]:
subreddit = "Tennis"
q = "serena williams"
start_time = int(dt.datetime(2015, 1, 1).timestamp())
end_time = int(dt.datetime.now().timestamp())
filters = []
limit = 1

p_df = data_prep_posts(subreddit, q, start_time, end_time, filters, limit)
# c_df = data_prep_comments(subreddit, q, filters, limit, end_time)

[INFO ][2022-10-26 22:29:42,778][PushshiftAPI:0185] : https://api.pushshift.io/meta
[INFO ][2022-10-26 22:29:43,317][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?subreddit=Tennis&q=serena+williams&after=1420099200&before=1666848582&filter=id&filter=title&filter=created_utc&filter=author&filter=score&filter=upvote_ratio&filter=num_comments&filter=url&limit=1&metadata=true&sort=desc


{'failed': 0, 'skipped': 0, 'successful': 20, 'total': 24}
submission(author='eleanor_james', created_utc=1666787382, id='ydwx25', num_comments=0, score=1, title="Serena Williams teases tennis fans as she says 'I'm not retired'", upvote_ratio=1.0, url='https://edition.cnn.com/2022/10/25/tennis/serena-williams-tennis-return-spt-intl/index.html', created=1666812582.0, d_={'author': 'eleanor_james', 'created_utc': 1666787382, 'id': 'ydwx25', 'num_comments': 0, 'score': 1, 'title': "Serena Williams teases tennis fans as she says 'I'm not retired'", 'upvote_ratio': 1.0, 'url': 'https://edition.cnn.com/2022/10/25/tennis/serena-williams-tennis-return-spt-intl/index.html', 'created': 1666812582.0})




In [290]:
p_df
# scoreover1 = p_df.loc[p_df["score"] > 1]
# scoreover1

Unnamed: 0,ID,Title,Date,Author,Upvotes,Num_Comments,URL
0,ydwx25,Serena Williams teases tennis fans as she says...,2022-10-26 05:29:42,eleanor_james,0,1,https://www.reddit.com/r/tennis/comments/ydwx2...
1,ydf44j,Two Americans are ranked in the top four for f...,2022-10-25 13:39:10,estreetpanda,72,5,https://www.reddit.com/r/tennis/comments/ydf44...
2,yd7iw1,Serena Williams says she's not retired and the...,2022-10-25 08:24:46,thythr,0,37,https://www.reddit.com/r/tennis/comments/yd7iw...
3,ya2wgt,If Simona Halep is found guilty of doping duri...,2022-10-21 12:50:28,reddit8019,1,0,https://www.reddit.com/r/tennis/comments/ya2wg...
4,y7c76y,Who do you think was Serena Williams toughest ...,2022-10-18 09:47:57,tennisfan120,63,99,https://www.reddit.com/r/tennis/comments/y7c76...
...,...,...,...,...,...,...,...
95,x18skd,"Robert Gordon: It is August 29, 2022. Serena W...",2022-08-29 20:45:31,akitakiteriyaki,38,1,https://www.reddit.com/r/tennis/comments/x18sk...
96,x17ta9,🥎 Serena Williams @US Open Tennis Championship...,2022-08-29 19:56:58,LiveWalkingNYC,1,0,https://www.reddit.com/r/tennis/comments/x17ta...
97,x1753n,Venus Williams,2022-08-29 19:25:19,KaxOn440,9,7,https://www.reddit.com/r/tennis/comments/x1753...
98,x163nr,Round 2 on the great Serena Williams farewell ...,2022-08-29 18:37:12,Julian81295,117,19,https://www.reddit.com/r/tennis/comments/x163n...


In [12]:
from datetime import datetime

def convert_utc(df: pd.DataFrame) -> pd.DataFrame:

    df["Date"] = df["created_utc"].map(lambda t: datetime.fromtimestamp(t))
    df.sort_values(by=["Date"])
    df.drop(["created_utc", "created", "d_"], axis=1, inplace=True)
    # df.so
    # for value in df['created_utc']:
    #     value = datetime.fromtimestamp(value)
    #     print(value)
    return df

# c_df.dtypes
# c_df = c_df.astype({'created_utc': 'float64'})
c_df.dtypes

# c_df = datetime.fromtimestamp(c_df["created_utc"])
# c_df['created_utc'] = pd.to_datetime(c_df.created_utc).dt.tz_convert(None)
c_df = convert_utc(c_df)
c_df

Unnamed: 0,author,body,id,permalink,subreddit,Date
0,tonybotz,You realize the majority of players come from ...,itm421h,/r/tennis/comments/ybxdh4/jessica_pegula_4_def...,tennis,2022-10-24 10:32:40
1,The_Entheogenist,Which of his other pupils have been caught (or...,itbfdzf,/r/tennis/comments/yahopw/was_simona_halep_fra...,tennis,2022-10-22 03:41:02
2,escherbach,Serena never tested positive for a banned subs...,italx8f,/r/tennis/comments/y9u1b1/simona_halep_suspend...,tennis,2022-10-21 21:10:41
3,mastershake714,No ill will at all towards players who have ac...,itajj4o,/r/tennis/comments/ya08e6/how_come_elena_didnt...,tennis,2022-10-21 20:46:30
4,reddit8019,Should Sloane Stephen's and Serena Williams be...,itaapc7,/r/tennis/comments/y9vk57/who_doped_simona_hal...,tennis,2022-10-21 19:26:10
...,...,...,...,...,...,...
995,rubikkon,I went to the Fed Cup (what it used to be call...,i18vqop,/r/tennis/comments/thn1yf/billie_jean_king_cup...,tennis,2022-03-18 21:39:21
996,tar4ntula,i wouldn’t be surprised.\n\n&gt;\tWhen leaving...,i18qlfh,/r/tennis/comments/tgw8wi/rtennis_discussion_f...,tennis,2022-03-18 20:49:29
997,Neverslept2mins,Better smash by Halep. When ppl fail a smash r...,i18fkjx,/r/tennis/comments/tgw8wi/rtennis_discussion_f...,tennis,2022-03-18 19:12:31
998,Albiceleste_D10S,"I mean, it's pretty clearly not in the US top ...",i17ni99,/r/tennis/comments/thfhvv/taylor_fritz_on_his_...,tennis,2022-03-18 15:30:50


In [13]:
# p_df.to_csv("./data/posts.csv", index=False)
c_df.to_csv("./data/comments.csv", index=False)

In [228]:
def fetch_ids(subreddit, query, limit=None):
    filters = ['id', "permalink"]                 
                #We set by default some useful columns

    posts = api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=int(dt.datetime(2012, 1, 1).timestamp()),      #Start date
        before=int(dt.datetime.now().timestamp()),       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit)          ##Max number of posts
    # print(api.metadata_.get('shards'))
    # data = next(posts)
    # print(data.d_)
    
    return pd.DataFrame({"ID": data} for data in posts) #Return dataframe for analysis

subreddit = "tennis"
q = "serena williams"
# after = int(dt.datetime(2015, 1, 1).timestamp())
# before = int(dt.datetime.now().timestamp())
limit = 100

posts_df = fetch_ids(subreddit, q, limit)

[INFO ][2022-10-26 17:21:29,031][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?subreddit=tennis&q=serena+williams&after=1325404800&before=1666830088&filter=id&filter=permalink&filter=created_utc&limit=100&metadata=true&sort=desc


In [231]:
posts_df.iloc[0:4].ID

0    (1666787382, ydwx25, /r/tennis/comments/ydwx25...
1    (1666730350, ydf44j, /r/tennis/comments/ydf44j...
2    (1666711486, yd7iw1, /r/tennis/comments/yd7iw1...
3    (1666381828, ya2wgt, /r/tennis/comments/ya2wgt...
Name: ID, dtype: object

In [365]:
# ids = ["t3_ydf44j", "t3_h77d4u", "t3_f8zvyi"]
ids = ["t1_itm421h"]
subs = reddit.info(fullnames=ids)
subs
sub_list = [sub for sub in subs]
sub_list
submission1 = sub_list[0]
submission1
# submission1.id
submission1.body
# submission1.author
# submission1.title
# submission1.permalink
# submission1.score
# submission1_comments = submission1.comments.list()
# submission1_comments[0].body

'You realize the majority of players come from wealthy families, don’t you? For every Serena Williams there are ten Ernes Gulbis’'

In [308]:
# Fetch threads with more PRAW usage

def fetch_threads(subreddit, query, limit=None):
    filters = ['id']                 
                #We set by default some useful columns
                
    api = PushshiftAPI(reddit)
    posts = api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=int(dt.datetime(2012, 1, 1).timestamp()) - 1,      #Start date
        before=int(dt.datetime.now().timestamp()),       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit)          ##Max number of posts
    # print(api.metadata_.get('shards'))
    # data = next(posts)
    # print(data.d_)
    # metadata = [post.__dict__["_reddit"] for post in posts]
    # pprint.pprint(vars(post) for post in posts)
    # print(f"meta {metadata}")
    # print(api.metadata_.get("shards"))
    # print(vars(metadata))
    # meta_list = [post.__dict__ for post in posts]
    # print(f"length of meta_list: {len(meta_list)}")
    # print(meta_list)
    # data_dict = meta_list
    # thread1_data = meta_list[0]
    # thread1_id = thread1_data["id"]
    # thread1_title = thread1_data["title"]
    # thread1_subreddit = thread1_data["subreddit_name_prefixed"]
    # thread1_date = dt.datetime.fromtimestamp(thread1_data["created_utc"])
    # thread1_author = thread1_data["author"]
    # thread1_upvotes = thread1_data["ups"]
    # thread1_ratio = thread1_data["upvote_ratio"]
    # thread1_num_comments = thread1_data["num_comments"]
    # thread1_url = thread1_data["url"]
    data_dict = {"ID": [], "Title" : [], "Subreddit": [], "Date": [], "Author": [], "Upvotes": [], "Ratio": [], "Num_Comments": [], "URL": []}
    meta_list = [post.__dict__ for post in posts]
    # print(meta_list)
    for meta_dict in meta_list:
        # thread_data = meta_dict[-1]
        # for thread in thread_data:
        data_dict["ID"] += [meta_dict["id"]]
        data_dict["Title"] += [meta_dict["title"]]
        data_dict["Subreddit"] += [meta_dict["subreddit_name_prefixed"]]
        data_dict["Date"] += [dt.datetime.fromtimestamp(meta_dict["created_utc"])]
        data_dict["Author"] += [meta_dict["author"]]
        data_dict["Upvotes"] += [meta_dict["ups"]]
        data_dict["Ratio"] += [meta_dict["upvote_ratio"]]
        data_dict["Num_Comments"] += [meta_dict["num_comments"]]
        data_dict["URL"] += [f"https://www.reddit.com{meta_dict['permalink']}"]

    # print(data_dict.keys())
    # print(thread1_data)
    # print(thread1_id)
    # print(thread1_title)
    # print(thread1_subreddit)
    # print(thread1_date)
    # print(thread1_author)
    # print(thread1_upvotes)
    # print(thread1_ratio)
    # print(thread1_num_comments)
    # print(thread1_url)

    # df = pd.DataFrame(data) #Return dataframe for analysis
    # df = pd.DataFrame({"ID": meta_dict["id"], "Title" : meta_dict["title"], "Date": meta_dict["created_utc"], "Author": meta_dict["author"], "Upvotes": meta_dict["score"], "Upvote_Ratio": meta_dict["upvote_ratio"], "Total_Comments": meta_dict["num_comments"], "URL": meta_dict["url"]})
    df = pd.DataFrame(data_dict)
    return df
    
# subreddit = "boxing"
# q = "mills lane"
# after = int(dt.datetime(2012, 1, 1).timestamp())
# before = int(dt.datetime.now().timestamp())
# limit = None

subreddit = "Tennis"
q = "serena williams"
start_time = int(dt.datetime(2015, 1, 1).timestamp())
end_time = int(dt.datetime.now().timestamp())
filters = []
limit = 1000

In [309]:
posts_df = fetch_threads(subreddit, q, limit=limit)

[INFO ][2022-10-26 21:34:07,237][PushshiftAPI:0185] : https://api.pushshift.io/meta
[INFO ][2022-10-26 21:34:07,705][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=Tennis&q=serena+williams&after=1325404799&before=1666845247&limit=1000&metadata=true&sort=desc
[INFO ][2022-10-26 21:34:31,035][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=Tennis&q=serena+williams&after=1325404799&before=1651784686&limit=750&metadata=true&sort=desc
[INFO ][2022-10-26 21:35:03,771][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=Tennis&q=serena+williams&after=1325404799&before=1613468280&limit=500&metadata=true&sort=desc
[INFO ][2022-10-26 21:35:14,642][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=Tennis&q=serena+williams&after=1325404799&before=1577427096&l

In [310]:
posts_df

Unnamed: 0,ID,Title,Subreddit,Date,Author,Upvotes,Ratio,Num_Comments,URL
0,ydwx25,Serena Williams teases tennis fans as she says...,r/tennis,2022-10-26 05:29:42,eleanor_james,0,0.33,1,https://www.reddit.com/r/tennis/comments/ydwx2...
1,ydf44j,Two Americans are ranked in the top four for f...,r/tennis,2022-10-25 13:39:10,estreetpanda,73,0.91,5,https://www.reddit.com/r/tennis/comments/ydf44...
2,yd7iw1,Serena Williams says she's not retired and the...,r/tennis,2022-10-25 08:24:46,thythr,0,0.46,37,https://www.reddit.com/r/tennis/comments/yd7iw...
3,ya2wgt,If Simona Halep is found guilty of doping duri...,r/tennis,2022-10-21 12:50:28,reddit8019,1,1.00,0,https://www.reddit.com/r/tennis/comments/ya2wg...
4,y7c76y,Who do you think was Serena Williams toughest ...,r/tennis,2022-10-18 09:47:57,tennisfan120,61,0.88,99,https://www.reddit.com/r/tennis/comments/y7c76...
...,...,...,...,...,...,...,...,...,...
995,cajiot,Match Thread: Serena Williams vs Carla Suarez-...,r/tennis,2019-07-08 03:49:38,dgfhghjgf,0,0.43,12,https://www.reddit.com/r/tennis/comments/cajio...
996,cajhcw,[Match@Thread]: Serena Williams vs Carla Suare...,r/tennis,2019-07-08 03:45:19,jonny-bairstow,0,0.33,0,https://www.reddit.com/r/tennis/comments/cajhc...
997,cai6o5,John McEnroe's comments on Serena Williams in ...,r/tennis,2019-07-08 00:57:23,,1,0.53,18,https://www.reddit.com/r/tennis/comments/cai6o...
998,cagxx4,Serena Williams: I had a lot of fun with Andy ...,r/tennis,2019-07-07 22:22:14,tennisworldusa,1,1.00,0,https://www.reddit.com/r/tennis/comments/cagxx...


In [368]:
# Fetch comments with PRAW usage
def fetch_comments(subreddit, query, limit=None):
    
    data_dict = {"ID": [], "Thread_Title": [], "Comment": [], "Date": [], "Author": [], "Upvotes": [], "Downvotes": [], "Subreddit": [], "URL": []}

    api = PushshiftAPI(reddit)
    comments = list(api.search_comments(
        subreddit=subreddit,
        q=query,                 #Subreddit we want to audit
        after=int(dt.datetime(2012, 1, 1).timestamp()) - 1,       #Start date
        before=int(dt.datetime.now().timestamp()),        #End date
        limit=limit))           #Max number of comments

    for comment in comments:
        # print(comment.__dict__)
        # print(comment.link_id)
        # print(comment.body)
        # print(comment.subreddit)
        # print(dt.datetime.fromtimestamp(int(comment.created_utc)))
        # print(comment.author)
        # print(comment.score)
        # print(comment.downs)
        # thread_id = reddit.info(fullnames=[comment.link_id])
        # for item in thread_id:
        #     print(item.title)
        # print(f"https://reddit.com{comment.permalink}")

        data_dict["ID"] += [f"t1_{comment.id}"]
        thread_id = reddit.info(fullnames=[comment.link_id])
        for item in thread_id:
            data_dict["Thread_Title"] += [item.title]
        data_dict["Comment"] += [comment.body]
        data_dict["Date"] += [dt.datetime.fromtimestamp(int(comment.created_utc))]
        data_dict["Author"] += [comment.author]
        data_dict["Upvotes"] += [comment.score]
        data_dict["Downvotes"] += [comment.downs]
        data_dict["Subreddit"] += [comment.subreddit]
        data_dict["URL"] += [f"https://reddit.com{comment.permalink}"]
    return pd.DataFrame(data_dict) #Return dataframe for analysis

In [369]:
subreddit = "Tennis"
q = "serena williams"
limit=10
c_df = fetch_comments(subreddit, q, limit)

[INFO ][2022-10-26 22:55:29,647][PushshiftAPI:0185] : https://api.pushshift.io/meta
[INFO ][2022-10-26 22:55:31,158][PushshiftAPI:0185] : https://api.pushshift.io/reddit/comment/search?filter=id&filter=created_utc&subreddit=Tennis&q=serena+williams&after=1325404799&before=1666850129&limit=10&metadata=true&sort=desc


In [370]:
c_df

Unnamed: 0,ID,Thread_Title,Comment,Date,Author,Upvotes,Downvotes,Subreddit,URL
0,t1_itm421h,"Jessica Pegula [4] def. Maria Sakkari 6-2, 6-3...",You realize the majority of players come from ...,2022-10-24 10:32:40,tonybotz,2,0,tennis,https://reddit.com/r/tennis/comments/ybxdh4/je...
1,t1_itbfdzf,Was Simona Halep Framed?,Which of his other pupils have been caught (or...,2022-10-22 03:41:02,The_Entheogenist,5,0,tennis,https://reddit.com/r/tennis/comments/yahopw/wa...
2,t1_italx8f,Simona Halep suspended for positive doping test,Serena never tested positive for a banned subs...,2022-10-21 21:10:41,escherbach,2,0,tennis,https://reddit.com/r/tennis/comments/y9u1b1/si...
3,t1_itajj4o,How come Elena didn’t get the same reception R...,No ill will at all towards players who have ac...,2022-10-21 20:46:30,mastershake714,4,0,tennis,https://reddit.com/r/tennis/comments/ya08e6/ho...
4,t1_itaapc7,Who Doped Simona Halep?,Should Sloane Stephen's and Serena Williams be...,2022-10-21 19:26:10,reddit8019,5,0,tennis,https://reddit.com/r/tennis/comments/y9vk57/wh...
5,t1_ita10l8,How come Elena didn’t get the same reception R...,"At least in the US market, Iga and Elena suffe...",2022-10-21 18:06:41,electricblueguava,6,0,tennis,https://reddit.com/r/tennis/comments/ya08e6/ho...
6,t1_it9t7bu,How come Elena didn’t get the same reception R...,[deleted],2022-10-21 17:04:16,,0,0,tennis,https://reddit.com/r/tennis/comments/ya08e6/ho...
7,t1_it8rhx3,Simona's statement,">To be honest she was ungodly ripped, strong a...",2022-10-21 12:29:18,reddit8019,8,0,tennis,https://reddit.com/r/tennis/comments/y9u4fe/si...
8,t1_it8e4hr,Simona Halep suspended for positive doping test,[deleted],2022-10-21 10:59:33,,5,0,tennis,https://reddit.com/r/tennis/comments/y9u1b1/si...
9,t1_it8csaw,Simona Halep suspended for positive doping test,"Serena Williams retired, what the hell are you...",2022-10-21 10:50:48,Guilty-Knee-8521,1,0,tennis,https://reddit.com/r/tennis/comments/y9u1b1/si...


In [42]:
posts_df.to_csv("./data/psawprawposts.csv", index=False)

In [225]:
def get_threads(reddit_group: str, subject: str, csv=None) -> pd.DataFrame:
    """
    Loops through the threads returned from a subreddit's search sub_datas to retrieve the titles, links, and other metadata.\n
    Outputs to a csv\n
    "reddit_group" is the subreddit to search through. ex: 'boxing'\n
    "subject" is the topic you want to search about. ex: 'ali'
    """

    data_dict = {"ID": [], "Title" : [], "Date": [], "Author": [], "Upvotes": [], "Upvote_Ratio": [], "Total_Comments": [], "URL": []}

    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)
    # subreddit = reddit.subreddit(reddit_group)

    id_df = fetch_ids(subreddit=reddit_group, query=subject)
    for row in id_df.itertuples():
        submission = reddit.submission(id=row.ID)
    # logger.info(f"Searching subreddit '{reddit_group}' for '{subject}'")
    # for submission in subreddit.search(subject, limit=None):
        submission_id = submission.id
        submission_title = submission.title
        date = datetime.fromtimestamp(submission.created_utc)
        author = submission.author
        upvotes = submission.score
        upvote_ratio = submission.upvote_ratio
        num_comments = submission.num_comments
        url = "https://www.reddit.com" + submission.permalink

        data_dict["ID"] += [submission_id]
        data_dict["Title"] += [submission_title]
        data_dict["Date"] += [date]
        data_dict["Author"] += [author]
        data_dict["Upvotes"] += [upvotes]
        data_dict["Upvote_Ratio"] += [upvote_ratio]
        data_dict["Total_Comments"] += [num_comments]
        data_dict["URL"] += [url]
    logger.info(f"Creating dataframe for {subject} threads")
    df = pd.DataFrame(data=data_dict)
    if csv:
        logger.info("Saving to dataframe.")
        df.to_csv(f"./data/{subject}_threads.csv", index=False)
    print(reddit)
    return df

In [226]:
ml_df = get_threads("boxing", "mills lane")
ml_df

[INFO ][2022-10-25 21:19:28,974][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1666757968&metadata=true&sort=desc
[INFO ][2022-10-25 21:19:29,680][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1449516114&metadata=true&sort=desc
[INFO ][2022-10-25 21:19:33,864][3314228745:0036] : Creating dataframe for mills lane threads


<praw.reddit.Reddit object at 0x7f67379718e0>


Unnamed: 0,ID,Title,Date,Author,Upvotes,Upvote_Ratio,Total_Comments,URL
0,r1bi3x,Mills Lane,2021-11-24 10:18:03,enfamous03,1,1.0,1,https://www.reddit.com/r/Boxing/comments/r1bi3...
1,h77d4u,Holyfield wasn’t intentionally headbutting Tys...,2020-06-11 14:49:56,KingAsgoreJr,0,0.36,36,https://www.reddit.com/r/Boxing/comments/h77d4...
2,f8zvyi,"TIL Mills Lane was a lawyer, prosecutor, judge...",2020-02-24 14:54:39,NotCausedManatee,1,1.0,0,https://www.reddit.com/r/Boxing/comments/f8zvy...
3,bvf7ie,Remember when Mills Lane knocked Bernard Hopki...,2019-05-31 17:52:58,drpvn,126,0.96,31,https://www.reddit.com/r/Boxing/comments/bvf7i...
4,9ar7p2,"Larry Holmes vs Marvis Frazier, Holmes signals...",2018-08-27 10:58:20,Moveinslience,81,0.96,17,https://www.reddit.com/r/Boxing/comments/9ar7p...
5,8qbc8m,Top 5 - Craziest incidents,2018-06-11 10:39:53,,128,0.95,44,https://www.reddit.com/r/Boxing/comments/8qbc8...
6,8q2r9n,Top 5 - Disqualifications,2018-06-10 11:28:07,,130,0.96,72,https://www.reddit.com/r/Boxing/comments/8q2r9...
7,8o0g6u,Top 5 - Boxing controversies,2018-06-02 06:12:00,,618,0.98,120,https://www.reddit.com/r/Boxing/comments/8o0g6...
8,5v76cl,I'm reading Tyson's biography and I had some q...,2017-02-20 13:04:02,,13,0.89,28,https://www.reddit.com/r/Boxing/comments/5v76c...
9,5h1m8f,[GIF] “Mayweather’s backpedaling for eight rou...,2016-12-07 10:37:32,FuturisticChinchilla,160,0.94,39,https://www.reddit.com/r/Boxing/comments/5h1m8...


In [74]:
jl_df.to_csv("./data/deontay_wilder_threads_psaw.csv", index=False)

In [97]:
def comment_data(submission, csv=None) -> list:
    """
    Retrieve all of the reddit comments for a single submission(thread)
    Requires a reddit submission URL or ID
    """
    data_dict = {"Title": [], "Comment": [], "Author": [], "Upvotes": [], "Downvotes": []}

    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)
    submission = reddit.submission(url=submission) if "https:" in str(submission) else reddit.submission(id=submission)
    submission.comments.replace_more(limit=None)

    if submission.selftext != "":
        submission.selftext = submission.selftext.replace("\n", "")
        submission.selftext = submission.selftext.replace("\r", "")
        data_dict["Title"] += [submission.title]
        data_dict["Comment"] += [submission.selftext]
        data_dict["Author"] += [submission.author]
        data_dict["Upvotes"] += [submission.score]
        data_dict["Downvotes"] += [submission.downs]

    for comment in submission.comments.list():
        comment.body = comment.body.replace("\n", "")
        comment.body = comment.body.replace("\r", "")
        data_dict["Title"] += [submission.title]
        data_dict["Comment"] += [comment.body]
        data_dict["Author"] += [comment.author]
        data_dict["Upvotes"] += [comment.score]
        data_dict["Downvotes"] += [comment.downs]

    df = pd.DataFrame(data=data_dict)
    if csv:
        logger.info("Saving to csv file")
        df.to_csv(f"./data/{submission.title}.csv", index=False)
    return df
    # return data_dict["Title"], data_dict["Comment"], data_dict["Author"], data_dict["Upvotes"], data_dict["Downvotes"]

In [134]:
for id in posts_df.itertuples():
    print(id.ID)

r1bi3x
h77d4u
f8zvyi
bvf7ie
9ar7p2
8qbc8m
8q2r9n
8o0g6u
5v76cl
5h1m8f
3vtrns


In [108]:
ids = posts_df["ID"]
comm_data = ids.apply(comment_data)
data_dict = {"Title": comm_data[0], "Comment": comm_data[1], "Author": comm_data[2], "Upvotes": comm_data[3], "Downvotes": comm_data[4]}
comm_data[1]
# df = pd.DataFrame(data=data_dict)
# df
# ids
# comment_data("h77d4u")

Unnamed: 0,Title,Comment,Author,Upvotes,Downvotes
0,Holyfield wasn’t intentionally headbutting Tys...,I love Tyson and Holyfield. This is not a bias...,KingAsgoreJr,0,0
1,Holyfield wasn’t intentionally headbutting Tys...,I would accept this as reasonable if holyfield...,darkman3535,24,0
2,Holyfield wasn’t intentionally headbutting Tys...,"Fields, is that you?",Nihlus11,22,0
3,Holyfield wasn’t intentionally headbutting Tys...,It was definitely on purpose it was a tactic o...,ZachariahTheMessiah,10,0
4,Holyfield wasn’t intentionally headbutting Tys...,"It doesn’t justify biting an ear, no. But a he...",Ty4ys78,9,0
5,Holyfield wasn’t intentionally headbutting Tys...,On Mike's podcast Holyfield says he's mad peop...,DivingDays,2,0
6,Holyfield wasn’t intentionally headbutting Tys...,Still sticking with my opinion. Watch the full...,KingAsgoreJr,1,0
7,Holyfield wasn’t intentionally headbutting Tys...,Foreman said holyfield was the dirtiest fighte...,belladoyle,1,0
8,Holyfield wasn’t intentionally headbutting Tys...,Wasnt tyson doing much better in round 3? He e...,,1,0
9,Holyfield wasn’t intentionally headbutting Tys...,Lol. The reason tyson bit Holyfields ear was b...,jesusatemybaby,1,0


In [124]:
data_dict = {"Title": comm_data[0], "Comment": comm_data[1], "Author": comm_data[2], "Upvotes": comm_data[3], "Downvotes": comm_data[4]}
len(comm_data)
comm_data[0:10]
# df = pd.DataFrame(data=comm_data[0:10])
# df
# ids
# comment_data("h77d4u")

0            Title                                 ...
1                                                  ...
2    Empty DataFrame
Columns: [Title, Comment, Auth...
3                                                  ...
4                                                  ...
5                             Title  \
0   Top 5 - ...
6                            Title  \
0   Top 5 - D...
7                                Title  \
0    Top ...
8                                                  ...
9                                                  ...
Name: ID, dtype: object

In [1]:
query = "joe louis"
query = query.replace(" ", "_")
print(query)

joe_louis
