In [None]:
from io import StringIO

import pandas as pd
import praw
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from _secrets import user_agent, client_id, client_secret

In [None]:
# Create the PRAW client from the credentials imported from `_secrets`.
reddit_credentials = {
    'user_agent': user_agent,
    'client_id': client_id,
    'client_secret': client_secret,
}
reddit = praw.Reddit(**reddit_credentials)

In [None]:
# Download the rendered HTML of r/gaming's "list-sorted-by-subscribers" wiki page.
gaming_wiki_content = reddit.subreddit('gaming').wiki['list-sorted-by-subscribers'].content_html
# Name the parser explicitly: without it BeautifulSoup emits a warning and picks
# whichever parser happens to be installed, so results can differ between machines.
soup = BeautifulSoup(gaming_wiki_content, 'html.parser')

In [None]:
# Collect every <table> element of the wiki page and bind the first one to `table`.
tables = soup.find_all(name='table')
table = tables[0]

In [None]:
# Parse each <table> tag into a DataFrame; read_html returns a list of frames,
# of which the first is kept for every tag.
# The markup is wrapped in StringIO because passing literal HTML strings to
# read_html is deprecated in recent pandas releases.
list_of_tables = [
    pd.read_html(StringIO(str(table_tag)))[0]
    for table_tag in tables
]

Label tables for categorization later

In [None]:
# Tag every parsed frame with the position of its source table on the wiki page.
for table_index, wiki_table in enumerate(list_of_tables):
    wiki_table['table_number'] = table_index

In [None]:
# Stack the per-table frames into one table of subreddits.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source table contributes its own 0-based labels and the row labels repeat.
subreddits = pd.concat(list_of_tables, ignore_index=True)

In [None]:
# Eyeball nine randomly chosen rows of the combined table.
subreddits.sample(n=9)

In [None]:
# Split each link on '/' and keep the third piece as the subreddit's display name.
# NOTE(review): presumably links look like "/r/<name>" — confirm against the wiki table.
link_parts = subreddits['Link'].str.split('/', expand=True)
subreddits['display_name'] = link_parts[2]

In [None]:
# Turn the site-relative link into an absolute URL.
reddit_url_prefix = "https://reddit.com"
subreddits['https_Link'] = reddit_url_prefix + subreddits['Link']

In [None]:
# Rename the wiki's 'Subscribers' and 'Name' columns.
wiki_column_names = {'Subscribers': 'reported_Subscribers', 'Name': 'label'}
subreddits = subreddits.rename(columns=wiki_column_names)

In [None]:
# Spot-check nine random rows after the column changes.
subreddits.sample(n=9)

Build a table of fetched subreddit-level metadata (one API request per subreddit, so this is slow).

In [None]:
# Fetch subreddit-level metadata for every subreddit found in the wiki tables.
# Rows are gathered in a plain list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
subreddit_records = []
for display_name in subreddits['display_name']:
    subreddit = reddit.subreddit(display_name)
    try:
        subreddit._fetch()
    except Exception as e:
        # Best effort: report the subreddit that could not be fetched and move on.
        print(display_name, e)
    else:
        subreddit_records.append({
            'display_name': subreddit.display_name,
            'subscribers': subreddit.subscribers,
            'name': subreddit.name,
            'id': subreddit.id,
        })

# Explicit columns keep the schema stable even when nothing could be fetched,
# so a later merge on 'display_name' still finds its key column.
df = pd.DataFrame(subreddit_records, columns=['display_name', 'subscribers', 'name', 'id'])

In [None]:
# Attach the fetched metadata to the wiki rows. This is an inner join (merge's
# default), so rows whose display_name is missing from `df` are dropped.
subreddits = pd.merge(subreddits, df, how='inner', on='display_name')

In [None]:
# Show the first five rows of the merged table.
subreddits.head(n=5)

testing efficiency...

What other subreddit-level metadata can we fetch?

In [None]:
# Handle on a single subreddit used for exploring the API.
exploration_subreddit_name = 'hellointernet'
hi = reddit.subreddit(exploration_subreddit_name)

In [None]:
# List the attribute names currently exposed by the `hi` subreddit object.
dir(hi)

### Output subreddits table

In [None]:
# Persist the subreddit table; the row index is not written to the file.
subreddits.to_csv(path_or_buf="subreddits.csv", index=False)

# Build submissions database

What submission-level metadata is available?

In [None]:
# No cell above binds `submission`, so on a freshly restarted kernel a bare
# `dir(submission)` raises NameError. Fetch one concrete submission to inspect.
submission = next(iter(reddit.subreddit('leagueoflegends').top(limit=1)))
dir(submission)

In [None]:
# Listing of r/leagueoflegends' top submissions, using the listing's default arguments.
league_subreddit = reddit.subreddit('leagueoflegends')
lol_top = league_subreddit.top()

In [None]:
# Build one row per top submission for every subreddit in `subreddits`.
# Rows are gathered in a list and converted to a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
submission_records = []
for display_name in subreddits['display_name']:
    try:
        # Materialise the listing here so a subreddit that cannot be listed is
        # reported and skipped instead of aborting the whole loop.
        top_submissions = list(reddit.subreddit(display_name).top(limit=10))
    except Exception as e:
        print(display_name, e)
        continue

    for submission in top_submissions:
        # NOTE(review): PRAW reports `author` as None for deleted accounts — confirm.
        # Such submissions are kept with empty author fields rather than dropped.
        author = submission.author
        try:
            submission_records.append({
                'subreddit': submission.subreddit,
                'subreddit_id': submission.subreddit_id,
                'title': submission.title,
                'id': submission.id,
                'fullname': submission.fullname,
                'name': submission.name,
                'author_name': author.name if author is not None else None,
                'upvotes': submission.ups,
                'downvotes': submission.downs,
                'score': submission.score,
                'num_comments': submission.num_comments,
                'gilded': submission.gilded,
                'domain': submission.domain,
                'likes': submission.likes,
                'edited': submission.edited,
                'media': submission.media,
                'media_embed': submission.media_embed,
                'media_only': submission.media_only,
                'mod_note': submission.mod_note,
                'author_id': author.id if author is not None else None,
                'author_fullname': author.fullname if author is not None else None,
                'clicked': submission.clicked,
                'selftext': submission.selftext,
            })
        except Exception as e:
            # The dict is built completely before append runs, so a failure on
            # any field skips the whole row; report which submission was skipped.
            print(submission.fullname, e)

df = pd.DataFrame(submission_records)

In [None]:
# Plain name binding, not a copy: `submissions` and `df` refer to the same DataFrame.
submissions = df

In [None]:
# Eyeball nine randomly chosen submissions.
submissions.sample(n=9)

### Output submissions table

In [None]:
# Persist the submissions table; the row index is not written to the file.
submissions.to_csv(path_or_buf="submissions.csv", index=False)

In [None]:
# Preview a few fields of the nine top submissions of the `hi` subreddit.
[
    (post.title, post.author, post.score, post.id, post.name)
    for post in hi.top(limit=9)
]

In [None]:
# Print a few fields of the 25 hottest submissions of one subreddit.
display_name = 'leagueoflegends'
hot_listing = reddit.subreddit(display_name).hot(limit=25)
for submission in hot_listing:
    summary = (
        submission.title,
        submission.author,
        submission.score,
        submission.id,
        submission.name,
    )
    print(summary)
    

available submission-level metadata:

In [None]:
# List the attribute names of `submission`; when cells run in order this is the
# last item bound by the loop over the hot listing in the previous cell.
dir(submission)

In [None]:
# Append the title of each subreddit's top submission as new rows of `submissions`.
# The previous version assigned a longer Series back to the 'title' column; the
# assignment realigns to the frame's existing index, so the appended titles were
# silently discarded (and Series.append no longer exists in pandas 2.0).
new_titles = []
for display_name in subreddits['display_name']:
    subreddit = reddit.subreddit(display_name)
    try:
        # Build the list first so a failing listing contributes nothing.
        new_titles.extend([submission.title for submission in subreddit.top(limit=1)])
    except Exception as e:
        print(display_name, e)

if new_titles:
    # Only 'title' is known for these rows; every other column is left missing.
    submissions = pd.concat(
        [submissions, pd.DataFrame({'title': new_titles})],
        ignore_index=True,
    )

In [None]:
# Append the title of HelloInternet's top submission as a new row of `submissions`.
# Assigning a longer Series back to the 'title' column realigns to the frame's
# existing index and drops the extra entry (and Series.append was removed in
# pandas 2.0), so the row is added with pd.concat instead.
hello_internet_titles = [s.title for s in reddit.subreddit('HelloInternet').top(limit=1)]
if hello_internet_titles:
    # Only 'title' is known for the new row; every other column is left missing.
    submissions = pd.concat(
        [submissions, pd.DataFrame({'title': hello_internet_titles})],
        ignore_index=True,
    )

In [None]:
# Show the resulting submissions table.
submissions