In [1]:
import praw
import pandas as pd
from bs4 import BeautifulSoup
import requests
from _secrets import user_agent, client_id, client_secret
from tqdm import tqdm_notebook as tqdm

In [2]:
reddit = praw.Reddit(user_agent=user_agent,
                     client_id=client_id,
                     client_secret=client_secret)

In [3]:
gaming_wiki_content = reddit.subreddit('gaming').wiki['list-sorted-by-subscribers'].content_html
soup = BeautifulSoup(gaming_wiki_content)

In [4]:
tables = soup.find_all('table')
table = tables[0]

In [5]:
list_of_tables = [pd.read_html(str(table))[0] for table in tables]

Label tables for categorization later

In [6]:
for table_num,table in enumerate(list_of_tables):
    table['table_number'] = table_num

In [7]:
subreddits = pd.concat(list_of_tables)

In [8]:
subreddits.sample(9)

Unnamed: 0,Name,Link,Subscribers,table_number
31,Amiga,/r/amiga,1639,5
170,Dead Space,/r/DeadSpace,5403,0
48,AskGames,/r/AskGames,6284,2
154,Flash Gamers,/r/flashgamers,173,2
2,Girl Gamers,/r/GirlGamers,33526,4
38,Commodore,/r/Commodore,620,5
10,ProjectAwesome,/r/ProjectAwesome,2023,4
104,Custom Controllers,/r/customcontrollers,800,2
368,Combat Arms,/r/combatarms,544,0


In [9]:
subreddits['display_name'] = subreddits['Link'].str.split('/', expand=True)[2]

In [10]:
subreddits['https_Link'] = "https://reddit.com"+subreddits['Link']

In [11]:
subreddits.rename(columns={'Subscribers':'reported_Subscribers', 'Name':'label'}, inplace=True)

In [12]:
subreddits.sample(9)

Unnamed: 0,label,Link,reported_Subscribers,table_number,display_name,https_Link
13,Team Fortress 2,/r/tf2,146551,0,tf2,https://reddit.com/r/tf2
157,Baldur's Gate,/r/baldursgate,5777,0,baldursgate,https://reddit.com/r/baldursgate
24,Creepy Things In Games/Horror Games,/r/creepygaming,22887,2,creepygaming,https://reddit.com/r/creepygaming
1,Boardgames,/r/boardgames,100633,1,boardgames,https://reddit.com/r/boardgames
8,Rockstar Games,/r/rockstar,2433,6,rockstar,https://reddit.com/r/rockstar
245,Blacklight: Retribution,/r/Blacklight,2187,0,Blacklight,https://reddit.com/r/Blacklight
39,Sega Genesis / Mega Drive,/r/Megadrive,590,5,Megadrive,https://reddit.com/r/Megadrive
23,Turn Based Strategy,/r/tbs,1216,3,tbs,https://reddit.com/r/tbs
333,Wolfenstein: Enemy Territory,/r/EnemyTerritory,825,0,EnemyTerritory,https://reddit.com/r/EnemyTerritory


Inefficient building of fetched subreddit-level metadata..

In [13]:
df = pd.DataFrame()
for display_name in subreddits['display_name']:
    subreddit = reddit.subreddit(display_name)
    try:
        subreddit._fetch()
    except Exception as e:
        print(display_name, e)
    else:
        df = df.append({
            'display_name':subreddit.display_name,
            'subscribers':subreddit.subscribers,
            'name':subreddit.name,
            'id':subreddit.id
        }, ignore_index=True)

CivMulti received 403 HTTP response
eRepublik received 403 HTTP response
planets3 received 403 HTTP response
Chorilion_City_Crimes received 403 HTTP response
Ascend received 403 HTTP response
EverSky received 403 HTTP response
MarkLane received 403 HTTP response
minecraftxe received 403 HTTP response
mncpc received 403 HTTP response
TokyoJungle received 403 HTTP response
Gamingmemories received 403 HTTP response
needateam received 403 HTTP response
gamingartwork received 403 HTTP response
indiejunction received 403 HTTP response
VGracing received 403 HTTP response
pdox received 403 HTTP response
redditcasual received 403 HTTP response
Team_Awesome received 403 HTTP response
rdtclan received 403 HTTP response
wreckedrdt received 403 HTTP response
NAE3 received 403 HTTP response
Sega_Saturn received 403 HTTP response


In [14]:
subreddits = subreddits.merge(df, on='display_name')

In [15]:
subreddits.head()

Unnamed: 0,label,Link,reported_Subscribers,table_number,display_name,https_Link,id,name,subscribers
0,League of Legends,/r/leagueoflegends,699793,0,leagueoflegends,https://reddit.com/r/leagueoflegends,2rfxx,t5_2rfxx,3250876.0
1,Pokémon,/r/pokemon,444468,0,pokemon,https://reddit.com/r/pokemon,2qmeb,t5_2qmeb,1721165.0
2,Minecraft,/r/Minecraft,440180,0,Minecraft,https://reddit.com/r/Minecraft,2r05i,t5_2r05i,1602534.0
3,The Elder Scrolls V: Skyrim,/r/skyrim,298865,0,skyrim,https://reddit.com/r/skyrim,2s837,t5_2s837,681930.0
4,Hearthstone: Heroes of Warcraft,/r/hearthstone,253622,0,hearthstone,https://reddit.com/r/hearthstone,2w31t,t5_2w31t,1071585.0


testing efficiency...

What other subreddit-level metadata can we fetch?

In [None]:
hi = reddit.subreddit('hellointernet')

In [None]:
dir(hi)

# Build submissions database

submission-level metadata?

In [18]:
lol_top = reddit.subreddit('leagueoflegends').top()

In [None]:
reddit.subreddit

In [30]:
df = pd.DataFrame()
for display_name in subreddits['display_name']:
    for submission in reddit.subreddit(display_name).top(limit=10):
        try:
            df = df.append({
                'subreddit': submission.subreddit,
                'subreddit_id': submission.subreddit_id,
                'title': submission.title,
                'id': submission.id,
                'fullname': submission.fullname,
                'name': submission.name,
                'author_name': submission.author.name, 
                'upvotes': submission.ups,
                'downvotes': submission.downs,
                'score': submission.score, 
                'num_comments': submission.num_comments, 
                'gilded': submission.gilded,
                'domain': submission.domain, 
                'likes': submission.likes,
                'edited': submission.edited,
                'media': submission.media,
                'media_embed': submission.media_embed,
                'media_only': submission.media_only,
                'mod_note': submission.mod_note,
                'author_id': submission.author.id,
                'author_fullname': submission.author.fullname,
                'clicked': submission.clicked,
                'selftext': submission.selftext,
            }, ignore_index=True )
        except Exception as e:
            print(submission.fullname, e)

t3_3uze6n 'Redditor' object has no attribute 'id'
t3_35abkc 'NoneType' object has no attribute 'name'
t3_5lb6zz 'NoneType' object has no attribute 'name'
t3_59u0x5 'NoneType' object has no attribute 'name'
t3_68lpga 'NoneType' object has no attribute 'name'
t3_1tvzp8 'Redditor' object has no attribute 'id'
t3_3by1c2 'NoneType' object has no attribute 'name'
t3_1shl7d 'NoneType' object has no attribute 'name'
t3_52ae7t 'NoneType' object has no attribute 'name'
t3_8lyaam 'NoneType' object has no attribute 'name'
t3_831iqv 'NoneType' object has no attribute 'name'
t3_29t834 'NoneType' object has no attribute 'name'
t3_249uo6 'NoneType' object has no attribute 'name'
t3_5hepn4 'NoneType' object has no attribute 'name'
t3_7wknpo 'NoneType' object has no attribute 'name'
t3_b9tf0r 'Redditor' object has no attribute 'id'
t3_3y4a2b 'NoneType' object has no attribute 'name'
t3_8lx42p 'NoneType' object has no attribute 'name'
t3_1zicl5 'NoneType' object has no attribute 'name'
t3_1l1ldi 'NoneTyp

In [31]:
submissions = df
submissions.head()

RequestException: error with request HTTPSConnectionPool(host='oauth.reddit.com', port=443): Max retries exceeded with url: /r/leagueoflegends/about/?raw_json=1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000238C9748EF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))

RequestException: error with request HTTPSConnectionPool(host='oauth.reddit.com', port=443): Max retries exceeded with url: /r/leagueoflegends/about/?raw_json=1 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x00000238C926FC50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))

In [412]:
dir(submission)

['STR_FIELD',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_chunk',
 '_comments_by_id',
 '_fetch',
 '_fetch_data',
 '_fetch_info',
 '_fetched',
 '_kind',
 '_reddit',
 '_reset_attributes',
 '_safely_add_arguments',
 '_url_parts',
 '_vote',
 'all_awardings',
 'allow_live_comments',
 'approved_at_utc',
 'approved_by',
 'archived',
 'author',
 'author_flair_background_color',
 'author_flair_css_class',
 'author_flair_richtext',
 'author_flair_template_id',
 'author_flair_text',
 'author_flair_text_color',
 'author_flair_type',
 'author_fullname',
 'author_patreon_flair',
 'banned_at_utc',
 'banned_by',
 'can_gild',
 'can_mod_post',
 'category',
 'clear_

In [402]:
[(s.title, s.author, s.score, s.id, s.name) for s in hi.top(limit=9)]

[('I like the idea of a 53 stars flag now.',
  Redditor(name='anitussi'),
  2951,
  '6xtapi',
  't3_6xtapi'),
 ('Grey_irl', Redditor(name='Dommkopf_Trip'), 2799, '78tn97', 't3_78tn97'),
 ('Too accurate', Redditor(name='KroniK907'), 2494, '6q2k13', 't3_6q2k13'),
 ('The reality of what Non-Americans understand when Americans tell us their state',
  Redditor(name='ninjomat'),
  2428,
  '9qo5qk',
  't3_9qo5qk'),
 ('CGP Grey. Upvote this so it comes up in a Google image search for CGP Grey.',
  Redditor(name='EarthlyAwakening'),
  2380,
  '6vx6x6',
  't3_6vx6x6'),
 ('the wholesome puppy caretaker',
  Redditor(name='PM_MOI_STEAM_KEYS'),
  1920,
  '7h33mv',
  't3_7h33mv'),
 ("Thought I'd post my Christmas card design mentioned in today's episode :)",
  Redditor(name='squiral-'),
  1848,
  'aclkpi',
  't3_aclkpi'),
 ('Hard as Nails, Cold as Ice',
  Redditor(name='gmalatete'),
  1832,
  'b9gzhv',
  't3_b9gzhv'),
 ('The true unofficial unofficial official Hello Internet street corner',
  Reddito

In [143]:
display_name = 'leagueoflegends'
for submission in reddit.subreddit(display_name).hot(limit=25):
    print((submission.title, submission.author, submission.score, submission.id, submission.name))
    

('Patch 9.15 Discussion and Bug Megathead', Redditor(name='untamedlazyeye'), 45, 'ck5ula', 't3_ck5ula')
('We have added an archive for LCS, LEC, and International Event live discussions', Redditor(name='untamedlazyeye'), 149, 'ckb95m', 't3_ckb95m')
("Twisted Treeline will be removed, but can't we keep Vilemaw?", Redditor(name='Eevree'), 2626, 'ckor77', 't3_ckor77')
('PROJECT: Warwick’s Bloodtrail is exactly the same as the base skin', Redditor(name='parnellyxlol'), 1538, 'ckna65', 't3_ckna65')
('SK Telecom T1 vs. Kingzone DragonX / LCK 2019 Summer - Week 8 / Post-Match Discussion', Redditor(name='adz0r'), 1682, 'ckm739', 't3_ckm739')
('I found an old pentakill edit I made in 2017, I was not on drugs', Redditor(name='hercoule'), 820, 'ckm4uf', 't3_ckm4uf')
("People really does exagerate when they say things like ''Riot is the greediest and scummiest company i've ever seen'' When in reality it's just kind of middle of the pack.", Redditor(name='Sachielkun'), 473, 'cknsi7', 't3_cknsi7')
(

available submission-level metadata:

In [None]:
dir(submission)

In [104]:
for display_name in subreddits['display_name']:
    subreddit = reddit.subreddit(display_name)
    try:
        new_submissions = [submission.title for submission in subreddit.top(limit=1)]
        submissions['title'] = submissions['title'].append(pd.Series(new_submissions), ignore_index=True)
    except Exception as e:
        print(display_name, e)

CivMulti received 403 HTTP response
eRepublik received 403 HTTP response
planets3 received 403 HTTP response
Chorilion_City_Crimes received 403 HTTP response
Ascend received 403 HTTP response
EverSky received 403 HTTP response
MarkLane received 403 HTTP response
minecraftxe received 403 HTTP response
mncpc received 403 HTTP response
TokyoJungle received 403 HTTP response


In [113]:
submissions['title'] = submissions['title'].append(pd.Series([s.title for s in reddit.subreddit('HelloInternet').top(limit=1)]), ignore_index=True)

In [114]:
submissions

Unnamed: 0,title
0,I like the idea of a 53 stars flag now.


't3_b7y7k2'