### Reddit scraper example

In [24]:
import re

def obtain_data_from_reddit(start_timestamp: float, end_timestamp: float, secrets_file_location: str):
    """Collect r/memes submissions created inside a time window and dump them to JSON.

    Reads Reddit API credentials from ``secrets_file_location``, requests up to
    1500 of the newest submissions from the ``memes`` subreddit, filters them
    with ``get_memes_to_upload`` and writes the result to
    ``./reddit_<YYYYMMDDHH>.json`` in the current working directory.

    Args:
        start_timestamp: Inclusive lower bound of the window, compared against
            ``submission.created_utc`` by ``get_memes_to_upload``.
        end_timestamp: Exclusive upper bound of the window.
        secrets_file_location: Path to a JSON file providing the keys
            ``client_id``, ``client_secret``, ``username``, ``password`` and
            ``user_agent``.

    Returns:
        The list of meme dicts produced by ``get_memes_to_upload``.

    Raises:
        OSError: If the secrets file cannot be read or the output file cannot
            be written.
        KeyError: If the secrets file lacks one of the required keys.
    """
    import json
    import logging
    from datetime import datetime

    import praw

    with open(secrets_file_location) as json_file:
        login_info = json.load(json_file)

    reddit = praw.Reddit(client_id=login_info['client_id'],
                         client_secret=login_info['client_secret'],
                         username=login_info['username'],
                         password=login_info['password'],
                         user_agent=login_info['user_agent'])
    subreddit = reddit.subreddit('memes')
    new_memes = subreddit.new(limit=1500)

    memes_to_download, scanned_memes = get_memes_to_upload(new_memes, start_timestamp, end_timestamp)

    # Take a single clock reading so the filename stamp and the human-readable
    # stamp can never disagree across an hour boundary.
    now = datetime.now()
    date_hour_now = now.strftime("%Y%m%d%H")
    date_hour_nice = now.strftime('%Y %m %d, %H:%M')

    # The summary used to be assembled into a string that was never emitted;
    # route it through logging so it is visible when INFO logging is enabled.
    logger = logging.getLogger(__name__)
    logger.info('Downloading data from Reddit on %s.', date_hour_nice)
    logger.info('Scanned %d memes, found %d memes to download.',
                scanned_memes, len(memes_to_download))

    # Save json file
    json_filename = f'./reddit_{date_hour_now}.json'

    with open(json_filename, 'w') as f:
        json.dump(memes_to_download, f, indent=1)

    return memes_to_download

In [28]:
def _extension_from_url(url, default="png"):
    """Return the file extension of the last path segment of ``url``, or ``default``."""
    from urllib.parse import urlsplit

    try:
        # Look only at the URL path so query strings / fragments that contain
        # dots or slashes cannot be mistaken for the file name.
        path = urlsplit(url).path
    except ValueError:
        return default

    filename = path.rsplit('/', 1)[-1]
    # Use the text after the LAST dot so "my.meme.gif" yields "gif", not "meme".
    _, dot, suffix = filename.rpartition('.')
    match = re.match(r'\w+', suffix) if dot else None
    return match.group(0) if match else default


def get_memes_to_upload(new_memes, start_timestamp, end_timestamp):
    """Select submissions whose creation time falls in ``[start_timestamp, end_timestamp)``.

    Args:
        new_memes: Iterable of submission objects exposing ``created_utc``,
            ``title``, ``score``, ``upvote_ratio`` and ``url``. Presumably
            ordered newest-first (the caller passes ``subreddit.new(...)``);
            the early ``break`` below relies on that ordering.
        start_timestamp: Inclusive lower bound on ``created_utc``.
        end_timestamp: Exclusive upper bound on ``created_utc``.

    Returns:
        A tuple ``(memes_to_upload, scanned_memes)``: the list of meme dicts
        (``url``, ``extension``, ``id``, ``additional_data``) and the number of
        submissions inspected, including the one that triggered the early stop.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having put ``datetime`` into the global namespace.
    from datetime import datetime

    scanned_memes = 0
    memes_to_upload = []
    date_hour_now = datetime.now().strftime("%Y%m%d%H")

    for submission in new_memes:
        sub_creation_date = submission.created_utc
        scanned_memes += 1
        if start_timestamp <= sub_creation_date < end_timestamp:
            data = {
                'date': sub_creation_date,
                'title': str(submission.title),
                'upvotes': int(submission.score),
                'upvote_ratio': float(submission.upvote_ratio)
            }

            img_url = str(submission.url)

            meme = {
                'url': img_url,
                # Falls back to "png" when the URL carries no usable extension.
                'extension': _extension_from_url(img_url),
                # The id embeds the position of the submission in the scan.
                'id': 'reddit_' + str(date_hour_now) + '_' + str(scanned_memes),
                'additional_data': data
            }

            memes_to_upload.append(meme)
        elif start_timestamp > sub_creation_date:
            # Older than the window: stop scanning.
            break

    return memes_to_upload, scanned_memes

In [29]:
def main(shift: float = 2):
    """Fetch the memes created during a one-hour window ending ``shift`` hours ago.

    Args:
        shift: How many hours before the current time the window ends. The
            window starts one hour earlier than that. Defaults to 2.

    Returns:
        Whatever ``obtain_data_from_reddit`` returns for that window, reading
        credentials from ``./reddit.json``.
    """
    import time

    # time.time() is a true Unix epoch value. The previous expression,
    # ``datetime.datetime.utcnow()`` after ``from datetime import datetime``,
    # raised AttributeError, and a naive ``utcnow().timestamp()`` would be
    # interpreted as local time, shifting the window by the UTC offset.
    # A single clock reading also keeps the window exactly one hour wide.
    window_end = time.time() - 3600 * shift
    window_start = window_end - 3600
    data = obtain_data_from_reddit(start_timestamp=window_start,
                                   end_timestamp=window_end,
                                   secrets_file_location='./reddit.json')
    return data

In [30]:
from datetime import datetime, timezone

shift = 0
# Use a timezone-aware "now": a naive datetime.utcnow() is treated as LOCAL time
# by .timestamp(), which moves the window by the machine's UTC offset.
# Reading the clock once also keeps the window exactly one hour wide.
now_ts = datetime.now(timezone.utc).timestamp()
data = obtain_data_from_reddit(start_timestamp=now_ts - 3600 * (shift + 1),
                               end_timestamp=now_ts - 3600 * shift,
                               secrets_file_location='./reddit.json')

In [31]:
data

[{'url': 'https://i.redd.it/71wwgx8wcdx41.jpg',
  'extension': 'jpg',
  'id': 'reddit_2020050720_727',
  'additional_data': {'date': 1588868705.0,
   'title': "That's why you use dishwasher",
   'upvotes': 40,
   'upvote_ratio': 0.9}},
 {'url': 'https://i.redd.it/oc64170vcdx41.jpg',
  'extension': 'jpg',
  'id': 'reddit_2020050720_728',
  'additional_data': {'date': 1588868695.0,
   'title': '2021 is gonna be my year',
   'upvotes': 22,
   'upvote_ratio': 0.89}},
 {'url': 'https://i.redd.it/gpzqsxuucdx41.jpg',
  'extension': 'jpg',
  'id': 'reddit_2020050720_729',
  'additional_data': {'date': 1588868692.0,
   'title': 'Mom can I have Spotify?',
   'upvotes': 24,
   'upvote_ratio': 0.87}},
 {'url': 'https://i.redd.it/d4elxquscdx41.jpg',
  'extension': 'jpg',
  'id': 'reddit_2020050720_730',
  'additional_data': {'date': 1588868673.0,
   'title': 'Nobody is gonna use $10 to remove watermark only for a month.',
   'upvotes': 25,
   'upvote_ratio': 0.94}},
 {'url': 'https://i.redd.it/lcxx

In [13]:
# str.lstrip() removes any leading characters found in the given *set*, not a
# prefix, so an image id beginning with e.g. "d", "h" or "i" would be eaten too.
# Slice the prefix off explicitly instead.
url_prefix = 'https://i.redd.it/'
first_url = data[0]['url']
first_url[len(url_prefix):] if first_url.startswith(url_prefix) else first_url

'm3q2od8madx41.jpg'

In [19]:
image_url = data[0]['url']
try:
    # Take the text after the first dot of the last path segment, cut at the
    # first non-word character (e.g. a "?" that starts a query string).
    image_extension = re.split(r'[^\w]', image_url.split('/')[-1].split('.')[1])[0]
except IndexError:
    # A file name without a dot makes the [1] lookup fail with IndexError
    # (not KeyError, which list indexing never raises); default to png.
    image_extension = "png"

In [21]:
# NOTE: str hashes are salted per interpreter process (see PYTHONHASHSEED), so
# this value changes between kernel restarts and is not a stable identifier.
hash(image_url)

897248439231119785