In [1]:
import asyncio
import json
import os
from datetime import datetime

import aiohttp
import nest_asyncio
import numpy as np
import requests
import requests.auth
from dotenv import load_dotenv

In [2]:
# load_dotenv() populates the process environment (typically from a .env file)
# so the credentials below never need to be written into the notebook itself.
load_dotenv()

# Each value is None when the corresponding variable is not set.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME")
REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD")

# User-Agent string sent with every request made to Reddit in this notebook.
USER_AGENT = f"PersonalScript/0.1 by {REDDIT_USERNAME}"

In [3]:
# async def fetch(url):
#     async with aiohttp.ClientSession() as session:
#         try:
#             async with session.get(url) as response:
#                 return await response.text()
#         except aiohttp.ClientError as e:
#             print(f"An error occurred: {e}")
#             return None


# async def main():
#     urls = [
#         "https://www.reddit.com/r/meme/top.json?t=week",
#     ]
#     tasks = [fetch(url) for url in urls]
#     responses = await asyncio.gather(*tasks)
#     for response in responses:
#         if response:
#             print(response)

# nest_asyncio.apply()

# loop = asyncio.get_event_loop()
# res = loop.run_until_complete(main())

In [4]:
def get_access_token() -> str:
    """Request an OAuth2 bearer token from Reddit using the password grant.

    The app is authenticated with HTTP basic auth (REDDIT_CLIENT_ID /
    REDDIT_CLIENT_SECRET) and the account with REDDIT_USERNAME /
    REDDIT_PASSWORD; all four are module-level constants.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within the timeout.
        RuntimeError: if the response body contains no ``access_token``.
    """
    client_auth = requests.auth.HTTPBasicAuth(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET)
    post_data = {
        "grant_type": "password",
        "username": REDDIT_USERNAME,
        "password": REDDIT_PASSWORD,
    }
    headers = {"User-Agent": USER_AGENT}
    response = requests.post(
        "https://www.reddit.com/api/v1/access_token",
        auth=client_auth,
        data=post_data,
        headers=headers,
        # requests has no default timeout; without one a stalled connection
        # would block the notebook forever.
        timeout=30,
    )
    response.raise_for_status()

    payload = response.json()
    token = payload.get("access_token")
    if not token:
        # Surface whatever the server sent instead of a bare KeyError.
        raise RuntimeError(
            f"Reddit did not return an access token: {payload.get('error', payload)}"
        )
    return token

In [5]:
# Fetch the OAuth bearer token once; the listing requests further down send it
# in their Authorization header.
access_token = get_access_token()

In [18]:
%%time
headers = {"Authorization": f"bearer {access_token}", "User-Agent": USER_AGENT}
params = {"t": "year", "limit": 100}
urls = []
afters = []

for i in range(4500):
    print(f"{i}", end=" - ")
    
    response = requests.get(
        "https://oauth.reddit.com/r/memes/top.json", params=params, headers=headers
    )
    response.raise_for_status()
    data = response.json()
    
    params["after"] = data["data"]["after"]
    afters.append(params["after"] + "\n")
    
    children = data["data"]["children"]
    for i in range(len(children)):
        urls.append(children[i]["data"]["url"])

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'

In [19]:
# Display the JSON of the most recent listing response fetched by the loop
# above (the full payload is large).
data

{'kind': 'Listing',
 'data': {'after': None,
  'dist': 80,
  'modhash': None,
  'geo_filter': '',
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'memes',
     'selftext': '',
     'author_fullname': 't2_2uy8hvxg',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': "It's true.",
     'link_flair_richtext': [],
     'subreddit_name_prefixed': 'r/memes',
     'hidden': False,
     'pwls': 6,
     'link_flair_css_class': None,
     'downs': 0,
     'thumbnail_height': 140,
     'top_awarded_type': None,
     'hide_score': False,
     'name': 't3_17c8qxb',
     'quarantine': False,
     'link_flair_text_color': 'dark',
     'upvote_ratio': 0.98,
     'author_flair_background_color': None,
     'subreddit_type': 'public',
     'ups': 19684,
     'total_awards_received': 0,
     'media_embed': {},
     'thumbnail_width': 140,
     'author_flair_template_id': None,
     'is_original_content': False,


In [14]:
# Take whatever follows the last "." in each URL; URLs without a real file
# extension therefore contribute a domain/path fragment instead.
unique_exts = [link.rsplit(".", 1)[-1] for link in urls]

# Sorted, de-duplicated view of those suffixes.
np.unique(unique_exts)

array(['com/r/memes/comments/139cwpl/removed_by_reddit/', 'gif', 'gifv',
       'it/0l6ael5o325c1', 'it/43kxb7ea4uya1', 'it/6ab5a7kkp82c1',
       'it/axmcjhe27ztb1', 'it/bmor0t7sncgc1', 'it/cn2bf6b8gq9c1',
       'it/dvzova1vrg3c1', 'it/ev91peq1gfta1', 'it/exjuhjxwygub1',
       'it/ge997torfjbc1', 'it/i0lkiui3jp3c1', 'it/ih5nxugyqlac1',
       'it/jdttafi5ht7c1', 'it/k1z07ptllvbc1', 'it/ojex74u7ssbb1',
       'it/q6zd3l7nsvgb1', 'it/qintkzri2c1c1', 'it/u54ryuq2rwzb1',
       'it/vjxo99kwow3c1', 'it/w47mqc1l8f7c1', 'it/zm6laamo0vxb1', 'jpeg',
       'jpg', 'png'], dtype='<U47')

In [15]:
def filter_url(url) -> bool:
    """Decide whether a post URL should be kept (i.e. is not a static image).

    A URL is rejected when it is empty, ends with "/" (a link to a page rather
    than to a file), or ends in a still-image extension (jpeg/jpg/png, compared
    case-insensitively). Everything else is kept, which covers .gif/.gifv
    files as well as extension-less links such as https://v.redd.it/<id>.

    Args:
        url: The post URL to classify.

    Returns:
        True if the URL should be kept, False otherwise.
    """
    # Guard against empty input; indexing url[-1] on "" raised IndexError.
    if not url or url.endswith("/"):
        return False

    # Lower-case so that ".JPG" / ".PNG" are recognised as still images too.
    extension = url.rsplit(".", 1)[-1].lower()
    return extension not in {"jpeg", "jpg", "png"}

In [16]:
# Keep only the URLs accepted by filter_url, each terminated with a newline.
filter_urls = [link + "\n" for link in filter(filter_url, urls)]

# Show how many URLs survived, followed by the URLs themselves.
(len(filter_urls), filter_urls)

(178,
 ['https://i.redd.it/n5y03g4jz00b1.gif\n',
  'https://i.redd.it/fgvnz70pugwa1.gif\n',
  'https://i.redd.it/lu3ttx9w1j7b1.gif\n',
  'https://i.imgur.com/ndJH7G6.gifv\n',
  'https://i.redd.it/91yuxp879n3b1.gif\n',
  'https://i.redd.it/3kdvrb6a9dza1.gif\n',
  'https://i.redd.it/b2t564amn68c1.gif\n',
  'https://i.imgur.com/IySwXoV.gif\n',
  'https://i.redd.it/4tup6s0974db1.gif\n',
  'https://i.redd.it/3mrazxvnnlya1.gif\n',
  'https://i.redd.it/n3ovn1qc8z8b1.gif\n',
  'https://i.redd.it/77m1vl51kk7b1.gif\n',
  'https://i.redd.it/2xg8d86y4t4b1.gif\n',
  'https://i.redd.it/xz4jhf65maab1.gif\n',
  'https://i.redd.it/nkff71wqr71b1.gif\n',
  'https://v.redd.it/jdttafi5ht7c1\n',
  'https://i.redd.it/ukvivcxblpsa1.gif\n',
  'https://i.redd.it/98z1r2ydz5cb1.gif\n',
  'https://i.redd.it/hohjgg7ebn0b1.gif\n',
  'https://i.redd.it/m2cgjjdl9q7b1.gif\n',
  'https://i.redd.it/21u9ljdfckta1.gif\n',
  'https://v.redd.it/6ab5a7kkp82c1\n',
  'https://i.redd.it/n1596od8d7eb1.gif\n',
  'https://i.redd.it

In [17]:
# for i, url in enumerate(filter_urls):
#     image = requests.get(url)
#     file_name = url.split("/")[-1]
#     with open(f"images/{file_name}", "wb") as f:
#         f.write(image.content)