In [1]:
import praw
from datetime import datetime, timezone

In [2]:
from configparser import ConfigParser

# Read the Reddit API credentials from the [reddit_api] section of the config file.
CONFIG_PATH = "../reddit.cfg"

parser = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the config file was not found.
loadedConfigFiles = parser.read(CONFIG_PATH)
if not loadedConfigFiles:
    raise FileNotFoundError(f"Could not read Reddit config file: {CONFIG_PATH}")

CLIENT_ID = parser.get("reddit_api", "CLIENT_ID")
CLIENT_SECRET = parser.get("reddit_api", "CLIENT_SECRET")
PASSWORD = parser.get("reddit_api", "PASSWORD")
USERNAME = parser.get("reddit_api", "USERNAME")

In [3]:
# Build the Reddit client from the credentials loaded from the config file.
# The values are already strings, so they are passed through directly.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    username=USERNAME,
    password=PASSWORD,
    user_agent=f"Post Extraction (by u/{USERNAME})",
)

# Shows whether the client is in read-only mode (False in the recorded output).
print(reddit.read_only)
# print(reddit.user.me())  # optional: show the logged-in account

False


In [300]:
from collections import namedtuple, OrderedDict

# Column name -> DynamoDB attribute type ("S" = string, "N" = number), listed in
# the order the fields appear in each collected row.
risingSchema = OrderedDict(
    [
        ("postId", "S"),
        ("subreddit", "S"),
        ("title", "S"),
        ("createdUTC", "S"),
        ("timeElapsedMin", "N"),
        ("score", "N"),
        ("numComments", "N"),
        ("upvoteRatio", "N"),
        ("numGildings", "N"),
        ("loadTimeUTC", "S"),
        ("loadDateUTC", "S"),
    ]
)


def getRising(reddit, subreddit, topN=25, risingSchema=risingSchema, verbose=False,
              maxAgeMin=120):
    """Collect a snapshot of the rising posts of one subreddit.

    Parameters
    ----------
    reddit
        Client exposing ``subreddit(name).rising(limit=...)``.
    subreddit : str
        Name of the subreddit to query; also stored in every row.
    topN : int
        Maximum number of rising posts to request.
    risingSchema : mapping
        Its keys become the field names of the returned ``Row`` namedtuples.
    verbose : bool
        When True, print each row as it is collected.
    maxAgeMin : int
        Posts at least this many minutes old are skipped (default 120).

    Returns
    -------
    list
        One ``Row`` namedtuple per post that passed the age filter. All rows
        from one call share the same ``loadTimeUTC`` / ``loadDateUTC``.
    """
    submissions = reddit.subreddit(subreddit).rising(limit=topN)  # rising caps out at 25
    # Naive UTC datetimes are kept so the stored strings carry no "+00:00" suffix.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    Row = namedtuple("Row", list(risingSchema.keys()))
    dataCollected = []
    for submission in submissions:
        createdUTC = datetime.fromtimestamp(submission.created_utc, timezone.utc).replace(tzinfo=None)
        # total_seconds() includes whole days; timedelta.seconds alone would make a
        # post that is 1 day + 10 minutes old look only 10 minutes old.
        timeElapsedMin = int((now - createdUTC).total_seconds() // 60)
        if timeElapsedMin >= maxAgeMin:
            continue
        row = Row(
            postId=submission.id,
            subreddit=subreddit,
            title=submission.title,
            createdUTC=str(createdUTC),
            timeElapsedMin=timeElapsedMin,
            score=submission.score,
            numComments=submission.num_comments,
            upvoteRatio=submission.upvote_ratio,
            numGildings=sum(submission.gildings.values()),
            loadTimeUTC=str(now),
            loadDateUTC=str(now.date()),
        )
        dataCollected.append(row)
        if verbose:
            print(row)
            print()
    return dataCollected


In [301]:
# Take one snapshot of the rising posts for a single subreddit.
subreddit = "pics"
risingData = getRising(reddit, subreddit, topN=25)

In [302]:
# Preview only: pandas is imported here just to render the collected rows as a
# table; it is not needed later in the notebook.
import pandas as pd
pd.DataFrame(risingData)

Unnamed: 0,postId,subreddit,title,createdUTC,timeElapsedMin,score,numComments,upvoteRatio,numGildings,loadTimeUTC,loadDateUTC
0,12du4w8,pics,Anyone else see this too,2023-04-06 18:57:42,85,185,20,0.83,0,2023-04-06 20:22:57.114709,2023-04-06
1,12dw16z,pics,Super Sharp Picture of the Moon,2023-04-06 19:59:46,23,21,9,1.0,0,2023-04-06 20:22:57.114709,2023-04-06
2,12dw9au,pics,Witness my Act and Deed - Frank Paton - 1883,2023-04-06 20:07:03,15,10,1,1.0,0,2023-04-06 20:22:57.114709,2023-04-06
3,12dutj6,pics,It took me 2 years to complete this. The Desec...,2023-04-06 19:20:12,62,25,7,0.88,0,2023-04-06 20:22:57.114709,2023-04-06
4,12dvdvi,pics,"A rainy day in Tokyo, Japan",2023-04-06 19:38:39,44,19,3,1.0,0,2023-04-06 20:22:57.114709,2023-04-06
5,12dubbw,pics,Sold my camera to shoot with my phone only.,2023-04-06 19:03:16,79,22,3,0.83,0,2023-04-06 20:22:57.114709,2023-04-06
6,12dtyla,pics,I work near the most important building in the...,2023-04-06 18:51:43,91,23,2,0.88,0,2023-04-06 20:22:57.114709,2023-04-06
7,12dumdy,pics,Kids with their new play set,2023-04-06 19:13:31,69,12,3,0.88,0,2023-04-06 20:22:57.114709,2023-04-06
8,12dv3wg,pics,Sunrise this morning has to be a sign of a Goo...,2023-04-06 19:29:42,53,11,2,0.87,0,2023-04-06 20:22:57.114709,2023-04-06
9,12dw5q8,pics,Pond Leaves in India,2023-04-06 20:03:42,19,7,0,1.0,0,2023-04-06 20:22:57.114709,2023-04-06


Steps

1. Set up [IAM Identity Center](https://docs.aws.amazon.com/singlesignon/latest/userguide/getting-started.html). Create a permission set and a user under this permission set. Set up MFA with this user.
2. Install [AWS CLI V2](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html)
3. [Install Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#install-boto3) and [set up the config file](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#aws-iam-identity-center) for IAM Identity Center. During setup you will be prompted to log in as the user you created, using the MFA method you set up earlier.

In [249]:
import boto3

# resource vs client: https://www.learnaws.org/2021/02/24/boto3-resource-client/
#   - resource: higher-level abstraction, recommended to use; fewer methods, but
#     creating a table returns a table object that you can run operations on, and
#     an existing table can be grabbed with Table('name').
#   - client: low-level, more explicit methods; creating a table returns a dictionary.
session = boto3.Session(
    profile_name="AdministratorAccess-629137478606",
    region_name="us-east-2",
)
dynamodb_resource = session.resource("dynamodb")
# dynamodb_client = session.client("dynamodb")

In [263]:
def getOrCreateTable(tableDefinition, dynamodb_resource):
    """Return the table named in ``tableDefinition``, creating it first if it is absent.

    Parameters
    ----------
    tableDefinition : dict
        Keyword arguments passed to ``dynamodb_resource.create_table``; must
        contain ``'TableName'``.
    dynamodb_resource
        Object exposing ``tables.all()``, ``create_table(**kwargs)`` and
        ``Table(name)``.

    Returns
    -------
    The table object: newly created (after waiting for it to exist) or looked
    up by name when it already exists.
    """
    # client method: dynamodb_client.list_tables()['TableNames']
    knownNames = {tbl.name for tbl in dynamodb_resource.tables.all()}
    tableName = tableDefinition['TableName']

    if tableName in knownNames:
        print(f"Table {tableName} exists, grabbing table...")
        table = dynamodb_resource.Table(tableName)
    else:
        print(f"Table {tableName} not found, creating table")
        # boto3: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/service-resource/create_table.html#DynamoDB.ServiceResource.create_table
        # dynamodb keyschemas and secondary indexes: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.CoreComponents.html
        table = dynamodb_resource.create_table(**tableDefinition)
        # Block until the new table exists before handing it back.
        table.wait_until_exists()

    # Print out some data about the table (this count only updates every 6 hours).
    print(f"Item count in table: {table.item_count}")
    return table

In [337]:
%%time
tableName = "risingStaging"  # table for collecting rising posts data

# Keyword arguments for create_table, written as a plain dict literal.
risingStagingTableDefinition = {
    # Only the attributes used in a key or sort position need to be defined.
    'AttributeDefinitions': [
        {'AttributeName': attr, 'AttributeType': risingSchema[attr]}
        for attr in ('postId', 'loadTimeUTC', 'subreddit')
    ],
    'TableName': tableName,
    # Primary key: one item per post per load time.
    'KeySchema': [
        {'AttributeName': 'postId', 'KeyType': 'HASH'},
        {'AttributeName': 'loadTimeUTC', 'KeyType': 'RANGE'},
    ],
    # I wanted to future proof other ways I might look at the table (by subreddit)
    'GlobalSecondaryIndexes': [
        {
            'IndexName': 'bySubreddit',
            'KeySchema': [
                {'AttributeName': 'subreddit', 'KeyType': 'HASH'},
                {'AttributeName': 'loadTimeUTC', 'KeyType': 'RANGE'},
            ],
            'Projection': {'ProjectionType': 'KEYS_ONLY'},
            # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html#GSI.ThroughputConsiderations
            'ProvisionedThroughput': {
                'ReadCapacityUnits': 1,   # 1 = 4KB/s I think
                'WriteCapacityUnits': 1,  # 1 = 1KB/s
            },
        },
    ],
    'BillingMode': 'PROVISIONED',  # recommended for consistent work
    # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#default-limits-throughput-capacity-modes
    'ProvisionedThroughput': {
        'ReadCapacityUnits': 4,
        'WriteCapacityUnits': 4,
    },
    'TableClass': 'STANDARD',
    'DeletionProtectionEnabled': False,
}
risingTable = getOrCreateTable(risingStagingTableDefinition, dynamodb_resource)

Table risingStaging exists, grabbing table...
Item count in table: 0
CPU times: user 33.6 ms, sys: 5.86 ms, total: 39.5 ms
Wall time: 209 ms


In [344]:
import json
from decimal import Decimal

# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/dynamodb.html#batch-writing
# I didn't bother handling duplicates because they shouldn't be a problem with this type of data
# there is no built-in way to get responses from batch_writer: https://peppydays.medium.com/getting-response-of-aws-dynamodb-batchwriter-request-2aa3f81019fa
def batchWriter(table, data, schema):
    """Write every row in ``data`` to ``table`` through its batch writer.

    Parameters
    ----------
    table
        Object exposing ``batch_writer()`` as a context manager whose value
        has ``put_item(Item=...)``.
    data : iterable
        Rows exposing each column of ``schema`` as an attribute (e.g.
        namedtuples). Any iterable works, including generators.
    schema : mapping
        Its keys are the column names copied from each row into the item.
    """
    columns = list(schema.keys())
    with table.batch_writer() as batch:
        for row in data:  # for each row obtained
            item = {column: getattr(row, column) for column in columns}
            # The JSON round trip re-parses every float as a Decimal
            # (helps with parsing float to Decimal); ints and strings pass through.
            batch.put_item(Item=json.loads(json.dumps(item), parse_float=Decimal))

In [345]:
# Persist the collected rising-post rows into the staging table.
batchWriter(table=risingTable, data=risingData, schema=risingSchema)

To do:

- write a function that cleans up data older than ~24 hours
- write a function that gathers the top data
- write a function that cleans the current dynamo data and joins it to targets (the top data)