In [None]:
from io import StringIO

import numpy as np
import pandas as pd
import praw
from bs4 import BeautifulSoup
from tqdm import tqdm

from _secrets import user_agent, client_id, client_secret

In [None]:
# Reddit API client configured with the app credentials imported from _secrets.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

What other subreddit-level metadata can we fetch?

In [None]:
# Handle for the 'gaming' subreddit, inspected in the next cell.
gaming = reddit.subreddit("gaming")

In [None]:
# List every attribute name exposed on the subreddit handle.
gaming_attribute_names = dir(gaming)
print(gaming_attribute_names)

# Collect list of gaming subreddits from /r/gaming wiki

In [None]:
# Rendered HTML of the 'list-sorted-by-subscribers' wiki page of /r/gaming.
gaming_wiki_content = reddit.subreddit('gaming').wiki['list-sorted-by-subscribers'].content_html

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parsed tree can differ between
# machines. 'html.parser' ships with the standard library.
soup = BeautifulSoup(gaming_wiki_content, 'html.parser')

Use BeautifulSoup to collect all of the tables on the gaming wiki page

In [None]:
# All <table> elements found on the wiki page, plus the first one on its own.
tables = soup.find_all(name='table')
table = tables[0]

In [None]:
# Parse each HTML <table> into a DataFrame (read_html returns a list; each
# input here holds exactly one table, so take element 0).
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases.
# The loop variable is named html_table so it does not read as the global `table`.
list_of_tables = [
    pd.read_html(StringIO(str(html_table)))[0]
    for html_table in tables
]

Label tables for categorization later

In [None]:
# Stamp every row with the position of the table it came from, so the
# source table can still be identified after the tables are concatenated.
for position, frame in enumerate(list_of_tables):
    frame['table_number'] = position

In [None]:
# Stack all wiki tables into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it every table keeps its own 0-based labels and
# the combined frame ends up with duplicate row labels.
subreddits = pd.concat(list_of_tables, ignore_index=True)

In [None]:
# Spot-check nine random rows of the combined table.
subreddits.sample(n=9)

In [None]:
# Split Link on '/' and keep the third piece as the subreddit name.
# NOTE(review): presumably Link looks like '/r/<name>' — confirm against the wiki data.
link_parts = subreddits['Link'].str.split('/', expand=True)
subreddits['display_name'] = link_parts[2]

In [None]:
# Absolute URL: prefix the site root onto the Link column.
subreddits['https_Link'] = "https://reddit.com" + subreddits['Link']

In [None]:
# Rename the wiki's column headers; 'Subscribers' becomes 'reported_Subscribers'
# and 'Name' becomes 'label'.
column_renames = {
    'Subscribers': 'reported_Subscribers',
    'Name': 'label',
}
subreddits = subreddits.rename(columns=column_renames)

In [None]:
# Spot-check five random rows after adding and renaming columns.
subreddits.sample(n=5)

Build the fetched subreddit-level metadata one subreddit at a time (inefficient).

In [None]:
# Example subreddit handle; note the following cell rebinds `subreddit`.
subreddit = reddit.subreddit("leagueoflegends")

In [None]:
# Probe one subreddit to see which metadata attributes can be read.
# `attrs` is defined in this cell so it runs on a fresh kernel; previously it
# was only assigned in a later cell, which raised NameError on Run All.
attrs = ('display_name','fullname','id','subscribers','title')
subreddit = reddit.subreddit('CivMulti')

series = {}
for a in attrs:
    try:
        series[a] = getattr(subreddit, a)
    except Exception:
        # Best effort: record a missing value when the attribute cannot be
        # read. `Exception` (not a bare except) lets Ctrl-C still interrupt.
        series[a] = np.nan
series

In [None]:
# Show the three name-like attributes side by side for comparison.
tuple(getattr(subreddit, field) for field in ('display_name', 'fullname', 'name'))

In [None]:
# Fetch a fixed set of metadata attributes for every subreddit listed in
# `subreddits` and collect them into `df`, one row per subreddit.
attrs = ('display_name','fullname','id','subscribers','title')

# Collect plain dicts and build the frame once at the end. DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
records = []

# Each distinct name is fetched once; a repeated display_name would otherwise
# be fetched again and multiply rows in the later merge on 'display_name'.
for display_name in tqdm(subreddits['display_name'].drop_duplicates()):
    subreddit = reddit.subreddit(display_name)
    record = {}

    # NOTE(review): 'display_name' is read first, as in the original attribute
    # order; presumably it echoes the name passed in, so it matches the merge
    # key — confirm against praw's behaviour.
    for a in attrs:
        try:
            record[a] = getattr(subreddit, a)
        except Exception:
            # Best effort: keep going with a missing value when an attribute
            # cannot be read. `Exception` still lets Ctrl-C stop the loop.
            record[a] = np.nan

    records.append(record)

# Passing `columns` keeps the expected columns even when `records` is empty.
df = pd.DataFrame(records, columns=list(attrs))

In [None]:
# Attach the fetched metadata to the wiki listing; inner join on the
# subreddit name, so rows without a match on either side are dropped.
subreddits = pd.merge(subreddits, df, how='inner', on='display_name')

In [None]:
# Spot-check five random rows of the merged table.
subreddits.sample(n=5)

Testing efficiency… (TODO: this section is empty — add the timing comparison here.)

### Output subreddits table

In [None]:
# Write the final table to subreddits.csv, leaving out the row index.
subreddits.to_csv(path_or_buf="subreddits.csv", index=False)