In [1]:
import datetime as dt
import logging
from logging import INFO
import os
import pandas as pd
import praw
import pprint
from psaw import PushshiftAPI
import sys

logging.basicConfig(format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
                    level=INFO,
                    stream=sys.stderr)
logger: logging.Logger = logging

CLIENT_ID = os.environ.get("REDDSCRP_PU_SCRIPT")
SECRET_TOKEN = os.environ.get("REDDSCRP_SECRET")
headers = {"User-Agent": "reddscrape/0.0.1"}
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)

In [2]:
api = PushshiftAPI(reddit)

[INFO ][2022-10-26 12:09:46,696][PushshiftAPI:0185] : https://api.pushshift.io/meta


In [10]:
def data_prep_posts(subreddit, query,  start_time, end_time, filters, limit):
    if(len(filters) == 0):
        filters = ['id', 'author', 'created_utc',
                'permalink', 'title', 'num_comments', 'score']                 
                #We set by default some useful columns

    posts = list(api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=start_time,      #Start date
        before=end_time,       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit))          ##Max number of posts
    print(api.metadata_.get('shards'))

    return pd.DataFrame(posts) #Return dataframe for analysis

def data_prep_comments(subreddit, term, filters, limit, end_time=None, start_time=None):
    if (len(filters) == 0):
        filters = ['title', 'id', 'author', 'created_utc',
                'body', 'permalink', 'subreddit']
                #We set by default some useful columns 
    
    comments = list(api.search_comments(
        subreddit=subreddit,
        q=term,                 #Subreddit we want to audit
        after=start_time,       #Start date
        before=end_time,        #End date
        filter=filters,         #Column names we want to retrieve
        limit=limit))           #Max number of comments
    return pd.DataFrame(comments) #Return dataframe for analysis

In [11]:
subreddit = "Tennis"
q = "serena williams"
start_time = int(dt.datetime(2015, 1, 1).timestamp())
end_time = int(dt.datetime.now().timestamp())
filters = []
limit = 1000

# p_df = data_prep_posts(subreddit, q, start_time, end_time, filters, limit)
c_df = data_prep_comments(subreddit, q, filters, limit, end_time)



In [44]:
p_df
# scoreover1 = p_df.loc[p_df["score"] > 1]
# scoreover1

Unnamed: 0,author,created_utc,id,num_comments,permalink,score,title,created,d_
0,Lumpy-Neck3647,1665849553,y4ri80,0,/r/Boxing/comments/y4ri80/joe_louis/,1,Joe Louis,1.665875e+09,"{'author': 'Lumpy-Neck3647', 'created_utc': 16..."
1,momo903,1665300549,xzfgmz,0,/r/Boxing/comments/xzfgmz/tyson_fury_has_state...,1,‼️ Tyson Fury has stated that after he finishe...,1.665326e+09,"{'author': 'momo903', 'created_utc': 166530054..."
2,godisaprankster,1664648649,xt2g5u,0,/r/Boxing/comments/xt2g5u/boxer_joe_louis_in_t...,1,Boxer Joe Louis in Training. 1937.,1.664674e+09,"{'author': 'godisaprankster', 'created_utc': 1..."
3,MrPeanutbutter14,1664504982,xrpx1u,0,/r/Boxing/comments/xrpx1u/joe_louis_vs_usyk_at...,1,Joe Louis vs Usyk at cruiserweight,1.664530e+09,"{'author': 'MrPeanutbutter14', 'created_utc': ..."
4,ambitiousfinanceguy,1663333346,xfqqhy,0,/r/Boxing/comments/xfqqhy/joe_louis_calls_muha...,1,Joe Louis Calls Muhammad Ali A Bum | Who Would...,1.663359e+09,"{'author': 'ambitiousfinanceguy', 'created_utc..."
...,...,...,...,...,...,...,...,...,...
266,KluxKhan,1430958978,354ego,39,/r/Boxing/comments/354ego/your_top_5_heavyweig...,4,Your Top 5 Heavy-Weight Boxers of All Time ?,1.430984e+09,"{'author': 'KluxKhan', 'created_utc': 14309589..."
267,haliastales,1430532789,34l14n,3,/r/Boxing/comments/34l14n/joe_louis_boxing_glo...,26,Joe Louis boxing glove. Smithsonian Museum in ...,1.430558e+09,"{'author': 'haliastales', 'created_utc': 14305..."
268,scoopmalinowski,1424811545,2x153r,2,/r/Boxing/comments/2x153r/new_book_about_muham...,4,New book about Muhammad Ali excerpt,1.424837e+09,"{'author': 'scoopmalinowski', 'created_utc': 1..."
269,HannibalofBarca,1423876747,2vtv4n,76,/r/Boxing/comments/2vtv4n/wladimir_on_track_to...,13,Wladimir on track to break a record this year,1.423902e+09,"{'author': 'HannibalofBarca', 'created_utc': 1..."


In [12]:
from datetime import datetime

def convert_utc(df: pd.DataFrame) -> pd.DataFrame:

    df["Date"] = df["created_utc"].map(lambda t: datetime.fromtimestamp(t))
    df.sort_values(by=["Date"])
    df.drop(["created_utc", "created", "d_"], axis=1, inplace=True)
    # df.so
    # for value in df['created_utc']:
    #     value = datetime.fromtimestamp(value)
    #     print(value)
    return df

# c_df.dtypes
# c_df = c_df.astype({'created_utc': 'float64'})
c_df.dtypes

# c_df = datetime.fromtimestamp(c_df["created_utc"])
# c_df['created_utc'] = pd.to_datetime(c_df.created_utc).dt.tz_convert(None)
c_df = convert_utc(c_df)
c_df

Unnamed: 0,author,body,id,permalink,subreddit,Date
0,tonybotz,You realize the majority of players come from ...,itm421h,/r/tennis/comments/ybxdh4/jessica_pegula_4_def...,tennis,2022-10-24 10:32:40
1,The_Entheogenist,Which of his other pupils have been caught (or...,itbfdzf,/r/tennis/comments/yahopw/was_simona_halep_fra...,tennis,2022-10-22 03:41:02
2,escherbach,Serena never tested positive for a banned subs...,italx8f,/r/tennis/comments/y9u1b1/simona_halep_suspend...,tennis,2022-10-21 21:10:41
3,mastershake714,No ill will at all towards players who have ac...,itajj4o,/r/tennis/comments/ya08e6/how_come_elena_didnt...,tennis,2022-10-21 20:46:30
4,reddit8019,Should Sloane Stephen's and Serena Williams be...,itaapc7,/r/tennis/comments/y9vk57/who_doped_simona_hal...,tennis,2022-10-21 19:26:10
...,...,...,...,...,...,...
995,rubikkon,I went to the Fed Cup (what it used to be call...,i18vqop,/r/tennis/comments/thn1yf/billie_jean_king_cup...,tennis,2022-03-18 21:39:21
996,tar4ntula,i wouldn’t be surprised.\n\n&gt;\tWhen leaving...,i18qlfh,/r/tennis/comments/tgw8wi/rtennis_discussion_f...,tennis,2022-03-18 20:49:29
997,Neverslept2mins,Better smash by Halep. When ppl fail a smash r...,i18fkjx,/r/tennis/comments/tgw8wi/rtennis_discussion_f...,tennis,2022-03-18 19:12:31
998,Albiceleste_D10S,"I mean, it's pretty clearly not in the US top ...",i17ni99,/r/tennis/comments/thfhvv/taylor_fritz_on_his_...,tennis,2022-03-18 15:30:50


In [13]:
# p_df.to_csv("./data/posts.csv", index=False)
c_df.to_csv("./data/comments.csv", index=False)

In [3]:
def fetch_ids(subreddit, query, limit=None):
    filters = ['id']                 
                #We set by default some useful columns

    posts = api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=int(dt.datetime(2012, 1, 1).timestamp()),      #Start date
        before=int(dt.datetime.now().timestamp()),       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit)          ##Max number of posts
    # print(api.metadata_.get('shards'))
    # data = next(posts)
    # print(data.d_)
    
    return pd.DataFrame({"ID": data} for data in posts) #Return dataframe for analysis

subreddit = "boxing"
q = "mills lane"
# after = int(dt.datetime(2012, 1, 1).timestamp())
# before = int(dt.datetime.now().timestamp())
# limit = None

posts_df = fetch_ids(subreddit, q)

[INFO ][2022-10-26 12:10:11,902][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1666811411&metadata=true&sort=desc
[INFO ][2022-10-26 12:10:12,634][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1449516114&metadata=true&sort=desc


In [4]:
posts_df

Unnamed: 0,ID
0,r1bi3x
1,h77d4u
2,f8zvyi
3,bvf7ie
4,9ar7p2
5,8qbc8m
6,8q2r9n
7,8o0g6u
8,5v76cl
9,5h1m8f


In [27]:
ids = ["t3_h77d4u", "t3_f8zvyi"]
subs = reddit.info(fullnames=ids)
subs
sub_list = [sub for sub in subs]
submission1 = sub_list[0].comments.list()
submission1
submission1[0].body
submission1[0].author

Redditor(name='darkman3535')

In [53]:
def fetch_threads(subreddit, query, limit=None):
    filters = ['id']                 
                #We set by default some useful columns

    posts = api.search_submissions(
        subreddit=subreddit,
        q=query,   #Subreddit we want to audit
        after=int(dt.datetime(2012, 1, 1).timestamp()) - 1,      #Start date
        before=int(dt.datetime.now().timestamp()),       #End date
        filter=filters,        #Column names we want to retrieve
        limit=limit)          ##Max number of posts
    # print(api.metadata_.get('shards'))
    # data = next(posts)
    # print(data.d_)
    # metadata = [post.__dict__["_reddit"] for post in posts]
    # pprint.pprint(vars(post) for post in posts)
    # print(f"meta {metadata}")
    # print(api.metadata_.get("shards"))
    # print(vars(metadata))
    meta_list = [post.__dict__ for post in posts]
    print(f"length of meta_list: {len(meta_list)}")
    data_dict = meta_list
    thread1_data = meta_list[0]
    thread1_id = thread1_data["id"]
    thread1_title = thread1_data["title"]
    thread1_subreddit = thread1_data["subreddit_name_prefixed"]
    thread1_date = dt.datetime.fromtimestamp(thread1_data["created_utc"])
    thread1_author = thread1_data["author"]
    thread1_upvotes = thread1_data["ups"]
    thread1_ratio = thread1_data["upvote_ratio"]
    thread1_num_comments = thread1_data["num_comments"]
    thread1_url = thread1_data["url"]

    # print(data_dict.keys())
    print(thread1_data)
    print(thread1_id)
    print(thread1_title)
    print(thread1_subreddit)
    print(thread1_date)
    print(thread1_author)
    print(thread1_upvotes)
    print(thread1_ratio)
    print(thread1_num_comments)
    print(thread1_url)

    # df = pd.DataFrame(data) #Return dataframe for analysis
    # df = pd.DataFrame({"ID": data_dict["id"], "Title" : data_dict["title"], "Date": data_dict["created_utc"], "Author": data_dict["author"], "Upvotes": data_dict["score"], "Upvote_Ratio": data_dict["upvote_ratio"], "Total_Comments": data_dict["num_comments"], "URL": data_dict["url"]})
    # return df
    
subreddit = "boxing"
q = "mills lane"
# after = int(dt.datetime(2012, 1, 1).timestamp())
# before = int(dt.datetime.now().timestamp())
# limit = None


In [111]:
from itertools import chain
list_of_dicts = [{"title": 'title1', "url": "url1", "redditor": "redditor1"}, {"title": 'title2', "url": "url2", "redditor": "redditor2"}, {"title": 'title3', "url": "url3", "redditor": "redditor3"}, {"title": 'title4', "url": "url4", "redditor": "redditor4"}]
list_of_dicts
# flattened = chain.from_iterable(list_of_dicts)
# print(list(flattened))

['title', 'url', 'redditor', 'title', 'url', 'redditor', 'title', 'url', 'redditor', 'title', 'url', 'redditor']


In [54]:
posts_df = fetch_threads(subreddit, q)

[INFO ][2022-10-26 13:09:41,711][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404799&before=1666814981&metadata=true&sort=desc
[INFO ][2022-10-26 13:09:42,109][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404799&before=1449516114&metadata=true&sort=desc


length of meta_list: 11
{'comment_limit': 2048, 'comment_sort': 'confidence', '_reddit': <praw.reddit.Reddit object at 0x7fe8440ba460>, 'approved_at_utc': None, 'subreddit': Subreddit(display_name='Boxing'), 'selftext': '[removed]', 'author_fullname': 't2_2uu21lbe', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Mills Lane', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/Boxing', 'hidden': False, 'pwls': 6, 'link_flair_css_class': None, 'downs': 0, 'thumbnail_height': None, 'top_awarded_type': None, 'hide_score': False, 'name': 't3_r1bi3x', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 1.0, 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 1, 'total_awards_received': 0, 'media_embed': {}, 'thumbnail_width': None, 'author_flair_template_id': None, 'is_original_content': False, 'user_reports': [], 'secure_media': None, 'is_reddit_media_domain': False, 'is_meta': False, 'category': None, 'secure_

In [87]:
posts_df

Unnamed: 0,comment_limit,comment_sort,_reddit,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,...,subreddit_subscribers,created_utc,num_crossposts,media,is_video,_fetched,_comments_by_id,url_overridden_by_dest,post_hint,preview
0,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,[removed],t2_2uu21lbe,False,,0,...,1233561,1637778000.0,0,,False,False,{},,,
1,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,I love Tyson and Holyfield. This is not a bias...,t2_6jb6tkki,False,,0,...,1233561,1591912000.0,0,,False,False,{},,,
2,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,,t2_46pxu8qa,False,,0,...,1233561,1582585000.0,0,"{'type': 'youtube.com', 'oembed': {'provider_u...",False,False,{},https://www.youtube.com/watch?v=xJZdnlTUlww,,
3,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,,t2_11it43,False,,0,...,1233561,1559350000.0,0,"{'type': 'gfycat.com', 'oembed': {'provider_ur...",False,False,{},https://gfycat.com/harmlessrewardingamericanbu...,rich:video,{'images': [{'source': {'url': 'https://extern...
4,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,,t2_sbhlh,False,,0,...,1233561,1535393000.0,0,{'reddit_video': {'fallback_url': 'https://v.r...,True,False,{},https://v.redd.it/sdmbf3pzcoi11,hosted:video,{'images': [{'source': {'url': 'https://extern...
5,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,"Some verge on the ridiculous, others insanity....",,False,,0,...,1233561,1528739000.0,0,,False,False,{},,,
6,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,The dirtier side of boxing\n\n---------------\...,,False,,0,...,1233561,1528655000.0,0,,False,False,{},,,
7,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,#5 - Tyson vs Holyfield II\n\nAlso known as th...,,False,,1,...,1233561,1527945000.0,0,,False,False,{},,,
8,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,[deleted],,False,,0,...,1233561,1487625000.0,0,,False,False,{},,,
9,2048,confidence,<praw.reddit.Reddit object at 0x7f4662d37520>,,Boxing,,t2_9y5ma,False,,0,...,1233561,1481136000.0,0,"{'type': 'gfycat.com', 'oembed': {'provider_ur...",False,False,{},http://gfycat.com/RemoteExcellentIraniangroundjay,rich:video,{'images': [{'source': {'url': 'https://extern...


In [42]:
posts_df.to_csv("./data/psawprawposts.csv", index=False)

In [225]:
def get_threads(reddit_group: str, subject: str, csv=None) -> pd.DataFrame:
    """
    Loops through the threads returned from a subreddit's search sub_datas to retrieve the titles, links, and other metadata.\n
    Outputs to a csv\n
    "reddit_group" is the subreddit to search through. ex: 'boxing'\n
    "subject" is the topic you want to search about. ex: 'ali'
    """

    data_dict = {"ID": [], "Title" : [], "Date": [], "Author": [], "Upvotes": [], "Upvote_Ratio": [], "Total_Comments": [], "URL": []}

    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)
    # subreddit = reddit.subreddit(reddit_group)

    id_df = fetch_ids(subreddit=reddit_group, query=subject)
    for row in id_df.itertuples():
        submission = reddit.submission(id=row.ID)
    # logger.info(f"Searching subreddit '{reddit_group}' for '{subject}'")
    # for submission in subreddit.search(subject, limit=None):
        submission_id = submission.id
        submission_title = submission.title
        date = datetime.fromtimestamp(submission.created_utc)
        author = submission.author
        upvotes = submission.score
        upvote_ratio = submission.upvote_ratio
        num_comments = submission.num_comments
        url = "https://www.reddit.com" + submission.permalink

        data_dict["ID"] += [submission_id]
        data_dict["Title"] += [submission_title]
        data_dict["Date"] += [date]
        data_dict["Author"] += [author]
        data_dict["Upvotes"] += [upvotes]
        data_dict["Upvote_Ratio"] += [upvote_ratio]
        data_dict["Total_Comments"] += [num_comments]
        data_dict["URL"] += [url]
    logger.info(f"Creating dataframe for {subject} threads")
    df = pd.DataFrame(data=data_dict)
    if csv:
        logger.info("Saving to dataframe.")
        df.to_csv(f"./data/{subject}_threads.csv", index=False)
    print(reddit)
    return df

In [226]:
ml_df = get_threads("boxing", "mills lane")
ml_df

[INFO ][2022-10-25 21:19:28,974][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1666757968&metadata=true&sort=desc
[INFO ][2022-10-25 21:19:29,680][PushshiftAPI:0185] : https://api.pushshift.io/reddit/submission/search?filter=id&filter=created_utc&subreddit=boxing&q=mills+lane&after=1325404800&before=1449516114&metadata=true&sort=desc
[INFO ][2022-10-25 21:19:33,864][3314228745:0036] : Creating dataframe for mills lane threads


<praw.reddit.Reddit object at 0x7f67379718e0>


Unnamed: 0,ID,Title,Date,Author,Upvotes,Upvote_Ratio,Total_Comments,URL
0,r1bi3x,Mills Lane,2021-11-24 10:18:03,enfamous03,1,1.0,1,https://www.reddit.com/r/Boxing/comments/r1bi3...
1,h77d4u,Holyfield wasn’t intentionally headbutting Tys...,2020-06-11 14:49:56,KingAsgoreJr,0,0.36,36,https://www.reddit.com/r/Boxing/comments/h77d4...
2,f8zvyi,"TIL Mills Lane was a lawyer, prosecutor, judge...",2020-02-24 14:54:39,NotCausedManatee,1,1.0,0,https://www.reddit.com/r/Boxing/comments/f8zvy...
3,bvf7ie,Remember when Mills Lane knocked Bernard Hopki...,2019-05-31 17:52:58,drpvn,126,0.96,31,https://www.reddit.com/r/Boxing/comments/bvf7i...
4,9ar7p2,"Larry Holmes vs Marvis Frazier, Holmes signals...",2018-08-27 10:58:20,Moveinslience,81,0.96,17,https://www.reddit.com/r/Boxing/comments/9ar7p...
5,8qbc8m,Top 5 - Craziest incidents,2018-06-11 10:39:53,,128,0.95,44,https://www.reddit.com/r/Boxing/comments/8qbc8...
6,8q2r9n,Top 5 - Disqualifications,2018-06-10 11:28:07,,130,0.96,72,https://www.reddit.com/r/Boxing/comments/8q2r9...
7,8o0g6u,Top 5 - Boxing controversies,2018-06-02 06:12:00,,618,0.98,120,https://www.reddit.com/r/Boxing/comments/8o0g6...
8,5v76cl,I'm reading Tyson's biography and I had some q...,2017-02-20 13:04:02,,13,0.89,28,https://www.reddit.com/r/Boxing/comments/5v76c...
9,5h1m8f,[GIF] “Mayweather’s backpedaling for eight rou...,2016-12-07 10:37:32,FuturisticChinchilla,160,0.94,39,https://www.reddit.com/r/Boxing/comments/5h1m8...


In [74]:
jl_df.to_csv("./data/deontay_wilder_threads_psaw.csv", index=False)

In [97]:
def comment_data(submission, csv=None) -> list:
    """
    Retrieve all of the reddit comments for a single submission(thread)
    Requires a reddit submission URL or ID
    """
    data_dict = {"Title": [], "Comment": [], "Author": [], "Upvotes": [], "Downvotes": []}

    reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_TOKEN, user_agent=headers)
    submission = reddit.submission(url=submission) if "https:" in str(submission) else reddit.submission(id=submission)
    submission.comments.replace_more(limit=None)

    if submission.selftext != "":
        submission.selftext = submission.selftext.replace("\n", "")
        submission.selftext = submission.selftext.replace("\r", "")
        data_dict["Title"] += [submission.title]
        data_dict["Comment"] += [submission.selftext]
        data_dict["Author"] += [submission.author]
        data_dict["Upvotes"] += [submission.score]
        data_dict["Downvotes"] += [submission.downs]

    for comment in submission.comments.list():
        comment.body = comment.body.replace("\n", "")
        comment.body = comment.body.replace("\r", "")
        data_dict["Title"] += [submission.title]
        data_dict["Comment"] += [comment.body]
        data_dict["Author"] += [comment.author]
        data_dict["Upvotes"] += [comment.score]
        data_dict["Downvotes"] += [comment.downs]

    df = pd.DataFrame(data=data_dict)
    if csv:
        logger.info("Saving to csv file")
        df.to_csv(f"./data/{submission.title}.csv", index=False)
    return df
    # return data_dict["Title"], data_dict["Comment"], data_dict["Author"], data_dict["Upvotes"], data_dict["Downvotes"]

In [134]:
for id in posts_df.itertuples():
    print(id.ID)

r1bi3x
h77d4u
f8zvyi
bvf7ie
9ar7p2
8qbc8m
8q2r9n
8o0g6u
5v76cl
5h1m8f
3vtrns


In [108]:
ids = posts_df["ID"]
comm_data = ids.apply(comment_data)
data_dict = {"Title": comm_data[0], "Comment": comm_data[1], "Author": comm_data[2], "Upvotes": comm_data[3], "Downvotes": comm_data[4]}
comm_data[1]
# df = pd.DataFrame(data=data_dict)
# df
# ids
# comment_data("h77d4u")

Unnamed: 0,Title,Comment,Author,Upvotes,Downvotes
0,Holyfield wasn’t intentionally headbutting Tys...,I love Tyson and Holyfield. This is not a bias...,KingAsgoreJr,0,0
1,Holyfield wasn’t intentionally headbutting Tys...,I would accept this as reasonable if holyfield...,darkman3535,24,0
2,Holyfield wasn’t intentionally headbutting Tys...,"Fields, is that you?",Nihlus11,22,0
3,Holyfield wasn’t intentionally headbutting Tys...,It was definitely on purpose it was a tactic o...,ZachariahTheMessiah,10,0
4,Holyfield wasn’t intentionally headbutting Tys...,"It doesn’t justify biting an ear, no. But a he...",Ty4ys78,9,0
5,Holyfield wasn’t intentionally headbutting Tys...,On Mike's podcast Holyfield says he's mad peop...,DivingDays,2,0
6,Holyfield wasn’t intentionally headbutting Tys...,Still sticking with my opinion. Watch the full...,KingAsgoreJr,1,0
7,Holyfield wasn’t intentionally headbutting Tys...,Foreman said holyfield was the dirtiest fighte...,belladoyle,1,0
8,Holyfield wasn’t intentionally headbutting Tys...,Wasnt tyson doing much better in round 3? He e...,,1,0
9,Holyfield wasn’t intentionally headbutting Tys...,Lol. The reason tyson bit Holyfields ear was b...,jesusatemybaby,1,0


In [124]:
data_dict = {"Title": comm_data[0], "Comment": comm_data[1], "Author": comm_data[2], "Upvotes": comm_data[3], "Downvotes": comm_data[4]}
len(comm_data)
comm_data[0:10]
# df = pd.DataFrame(data=comm_data[0:10])
# df
# ids
# comment_data("h77d4u")

0            Title                                 ...
1                                                  ...
2    Empty DataFrame
Columns: [Title, Comment, Auth...
3                                                  ...
4                                                  ...
5                             Title  \
0   Top 5 - ...
6                            Title  \
0   Top 5 - D...
7                                Title  \
0    Top ...
8                                                  ...
9                                                  ...
Name: ID, dtype: object