In [1]:
from io import StringIO

import pandas as pd
import praw
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from _secrets import user_agent, client_id, client_secret

In [2]:
# Reddit API client built from the credentials imported from _secrets.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

In [3]:
# Rendered HTML of the r/gaming wiki page 'list-sorted-by-subscribers'.
gaming_wiki_content = reddit.subreddit('gaming').wiki['list-sorted-by-subscribers'].content_html
# Name the parser explicitly: without it BeautifulSoup falls back to whichever
# parser happens to be installed (and emits a warning), so the resulting tree
# could differ from one machine to the next.
soup = BeautifulSoup(gaming_wiki_content, 'html.parser')

In [4]:
# All <table> elements found in the parsed wiki page.
tables = soup.find_all(name='table')
# The first table on its own; no later visible cell reads it before it is rebound.
table = tables[0]

In [5]:
# Turn each <table> element into a DataFrame. read_html returns a list of
# frames, of which the first is kept. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated since pandas 2.1;
# file-like input is accepted by older versions as well.
list_of_tables = [pd.read_html(StringIO(str(html_table)))[0] for html_table in tables]

Label each table with its index so rows can be categorized by source table later

In [6]:
# Stamp every frame with its position in list_of_tables so the source table
# can still be identified after the frames are combined.
for table_num, table in enumerate(list_of_tables, start=0):
    table['table_number'] = table_num

In [7]:
# Stack the per-table frames into one. Each frame keeps its own row labels,
# so index values repeat across tables.
subreddits = pd.concat(objs=list_of_tables)

In [8]:
# Spot-check nine random rows of the combined table.
subreddits.sample(n=9)

Unnamed: 0,Name,Link,Subscribers,table_number
365,Fortnite,/r/FORTnITE,551,0
239,LittleBigPlanet,/r/littlebigplanet,2419,0
12,Base-Building Games,/r/BaseBuildingGames,3994,3
19,Speedrunning,/r/speedrun,34925,2
457,Rising World,/r/RisingWorld,131,0
147,Gaming for Leisure,/r/gamingforleisure,223,2
512,Ascend: Hand of Kul,/r/Ascend,54,0
299,Marvel: Avengers Alliance,/r/MAA,1171,0
20,Game Collecting,/r/gamecollecting,30123,2


In [9]:
# Split each link on '/' into columns; for a link of the form '/r/<name>'
# column 2 holds the subreddit name.
link_parts = subreddits['Link'].str.split('/', expand=True)
subreddits['display_name'] = link_parts[2]

In [10]:
# Prefix the relative wiki link with the site root to get an absolute URL.
reddit_root = "https://reddit.com"
subreddits['https_Link'] = reddit_root + subreddits['Link']

In [11]:
# Rename the wiki-provided columns: 'Subscribers' is the count reported on the
# wiki page and 'Name' is the human-readable label.
subreddits = subreddits.rename(
    columns={
        'Subscribers': 'reported_Subscribers',
        'Name': 'label',
    }
)

In [12]:
# Spot-check five random rows after adding and renaming columns.
subreddits.sample(n=5)

Unnamed: 0,label,Link,reported_Subscribers,table_number,display_name,https_Link
333,Wolfenstein: Enemy Territory,/r/EnemyTerritory,825,0,EnemyTerritory,https://reddit.com/r/EnemyTerritory
519,Battle Forge,/r/BattleForge,43,0,BattleForge,https://reddit.com/r/BattleForge
108,Gaming Fashion,/r/gamingfashion,654,2,gamingfashion,https://reddit.com/r/gamingfashion
382,Beyond: Two Souls,/r/beyondtwosouls,416,0,beyondtwosouls,https://reddit.com/r/beyondtwosouls
26,Stealth Games,/r/stealthgames,1087,3,stealthgames,https://reddit.com/r/stealthgames
373,The Longest Journey,/r/thelongestjourney,511,0,thelongestjourney,https://reddit.com/r/thelongestjourney
332,Train Fever,/r/trainfever,836,0,trainfever,https://reddit.com/r/trainfever
79,Loading Screen Thoughts,/r/loadingscreenthoughts,2013,2,loadingscreenthoughts,https://reddit.com/r/loadingscreenthoughts
140,WTFGames,/r/wtfgames,259,2,wtfgames,https://reddit.com/r/wtfgames


Fetch subreddit-level metadata one subreddit at a time (inefficient: each subreddit is fetched separately).

In [13]:
# Fetch live metadata for every subreddit listed on the wiki, one fetch per
# subreddit. Rows are gathered in a plain list and turned into a DataFrame
# once at the end: DataFrame.append copied the whole frame on every iteration
# and was removed in pandas 2.0.
metadata_columns = ['display_name', 'id', 'name', 'subscribers']
records = []
for display_name in subreddits['display_name']:
    subreddit = reddit.subreddit(display_name)
    try:
        subreddit._fetch()
    except Exception as e:
        # Best effort: report the subreddit that could not be fetched and continue.
        print(display_name, e)
    else:
        records.append({
            'display_name': subreddit.display_name,
            'subscribers': subreddit.subscribers,
            'name': subreddit.name,
            'id': subreddit.id
        })

# Explicit columns keep the frame mergeable on 'display_name' even if every
# fetch failed and `records` is empty.
df = pd.DataFrame(records, columns=metadata_columns)

CivMulti received 403 HTTP response
eRepublik received 403 HTTP response
planets3 received 403 HTTP response
Chorilion_City_Crimes received 403 HTTP response
Ascend received 403 HTTP response
EverSky received 403 HTTP response
MarkLane received 403 HTTP response
minecraftxe received 403 HTTP response
mncpc received 403 HTTP response
TokyoJungle received 403 HTTP response
Gamingmemories received 403 HTTP response
needateam received 403 HTTP response
gamingartwork received 403 HTTP response
indiejunction received 403 HTTP response
VGracing received 403 HTTP response
pdox received 403 HTTP response
redditcasual received 403 HTTP response
Team_Awesome received 403 HTTP response
rdtclan received 403 HTTP response
wreckedrdt received 403 HTTP response
NAE3 received 403 HTTP response
Sega_Saturn received 403 HTTP response


In [14]:
# Attach the fetched metadata to the wiki rows. The join is an inner join on
# 'display_name', so rows without a matching entry in df are dropped.
subreddits = pd.merge(subreddits, df, on='display_name', how='inner')

In [16]:
# Spot-check five random rows of the merged table.
subreddits.sample(n=5)

Unnamed: 0,label,Link,reported_Subscribers,table_number,display_name,https_Link,id,name,subscribers
806,Platforming,/r/platforming,49,3,platforming,https://reddit.com/r/platforming,2xxvg,t5_2xxvg,114.0
865,SNES,/r/snes,11434,5,snes,https://reddit.com/r/snes,2rany,t5_2rany,67809.0
811,Mindcrack,/r/mindcrack,50497,4,mindcrack,https://reddit.com/r/mindcrack,2to85,t5_2to85,43030.0
462,Unreal Tournament 2004,/r/UT2004,108,0,UT2004,https://reddit.com/r/UT2004,2t04e,t5_2t04e,161.0
568,Munchkin,/r/Munchkin,3875,1,Munchkin,https://reddit.com/r/Munchkin,2s6ol,t5_2s6ol,6788.0


testing efficiency...

What other subreddit-level metadata can we fetch?

In [17]:
# A single subreddit object to explore which attributes are available.
example_name = 'hellointernet'
hi = reddit.subreddit(example_name)

In [18]:
# List the attribute names available on the subreddit object to see what else could be collected.
dir(hi)

['MESSAGE_PREFIX',
 'STR_FIELD',
 'VALID_TIME_FILTERS',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_create_or_update',
 '_fetch',
 '_fetch_data',
 '_fetch_info',
 '_fetched',
 '_kind',
 '_path',
 '_reddit',
 '_reset_attributes',
 '_safely_add_arguments',
 '_submission_class',
 '_submit_media',
 '_subreddit_collections_class',
 '_subreddit_list',
 '_upload_media',
 '_url_parts',
 '_validate_time_filter',
 'banned',
 'collections',
 'comments',
 'contributor',
 'controversial',
 'display_name',
 'emoji',
 'filters',
 'flair',
 'fullname',
 'gilded',
 'hot',
 'message',
 'mod',
 'moderator',
 'modmail',
 'muted',
 'new',
 'parse',
 'quaran',
 'random

### Output subreddits table

In [34]:
# Write the final table to disk; the row index is not written.
subreddits.to_csv(path_or_buf="subreddits.csv", index=False)