In [1]:
from datetime import datetime
import os

import awswrangler as wr
import boto3
import pandas as pd
from sqlalchemy import exc, create_engine

In [2]:
def sql_connection(rds_schema: str):
    """
    Build a SQLAlchemy engine for the PostgreSQL database and check that it is reachable.

    Connection settings are read from the environment variables RDS_USER, RDS_PW,
    IP (the database host) and RDS_DB; the port is fixed at 5432.

    Args:
        rds_schema (str): The schema in the DB to connect to. It is passed as the
            connection's search_path option.
    Returns:
        The SQLAlchemy engine produced by create_engine for the given schema.
    Raises:
        ValueError: If any of the required environment variables is not set.
        sqlalchemy.exc.SQLAlchemyError: If the engine cannot be created or the
            database cannot be reached. The error is re-raised rather than
            returned, so a failure can never be mistaken for a usable engine.
    """
    env_names = ("RDS_USER", "RDS_PW", "IP", "RDS_DB")
    settings = {name: os.environ.get(name) for name in env_names}
    missing = [name for name in env_names if settings[name] is None]
    if missing:
        # Without this check the literal text "None" would be interpolated into the URL.
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    RDS_USER, RDS_PW, RDS_IP, RDS_DB = (settings[name] for name in env_names)
    try:
        connection = create_engine(
            f"postgresql+psycopg2://{RDS_USER}:{RDS_PW}@{RDS_IP}:5432/{RDS_DB}",
            connect_args={"options": f"-csearch_path={rds_schema}"},
            # defining schema to connect to
            echo=False,
        )
        # create_engine does not open a connection by itself. Open and immediately
        # release one so the success message is only printed for a reachable database.
        with connection.connect():
            pass
    except exc.SQLAlchemyError:
        print(f"SQL Connection to schema: {rds_schema} Failed")
        raise

    print(f"SQL Connection to schema: {rds_schema} Successful")
    return connection

# Engine bound to the 'nba_source' schema; the query cell below reads through it.
conn = sql_connection(rds_schema='nba_source')

SQL Connection to schema: nba_source Successful


In [36]:
# Load at most 1000 rows of the Reddit comment table into a DataFrame via `conn`.
df = pd.read_sql_query('select * from aws_reddit_comment_data_source limit 1000;', conn)

In [4]:
# Amazon Comprehend client shared by the helper functions below, pinned to us-east-1.
# No credentials are passed explicitly; presumably boto3 resolves them from the
# environment -- confirm before running this elsewhere.
client = boto3.client("comprehend", region_name='us-east-1')

In [22]:
# NOTE(review): this cell has an earlier execution count (22) than the load cell
# above (36), and the saved output of the sentiment cell below shows far more than
# 5 rows, so it looks like a leftover from a small trial run. When the notebook is
# run top to bottom it trims df to 5 rows before any Comprehend call is made.
df = df.head(5)

In [37]:
def get_sentiment(text):
    """
    Run Amazon Comprehend sentiment detection on one piece of English text.

    Args:
        text: The text to analyse.
    Returns:
        The response of client.detect_sentiment, passed through unchanged.
    """
    request = {"Text": text, "LanguageCode": "en"}
    return client.detect_sentiment(**request)

In [38]:
# One Comprehend call per comment; every response is a nested dict.
sentiment_responses = df['comment'].apply(lambda x: get_sentiment(str(x)))

# Flatten all responses in a single pass (the nested 'SentimentScore' dict becomes
# dotted column names) instead of re-normalising the same data once per column.
# Reusing df's index makes the assignments below line up row for row even when df
# does not carry a default 0..n-1 index.
sentiment_flat = pd.json_normalize(sentiment_responses.tolist())
sentiment_flat.index = df.index

df['sentiment_aws_type'] = sentiment_flat['Sentiment']
df['sentiment_aws_pos'] = sentiment_flat['SentimentScore.Positive']
df['sentiment_aws_neg'] = sentiment_flat['SentimentScore.Negative']
df['sentiment_aws_neu'] = sentiment_flat['SentimentScore.Neutral']
df['sentiment_aws_mix'] = sentiment_flat['SentimentScore.Mixed']
df

Unnamed: 0,comment,score,url,author,flair1,flair2,edited,scrape_date,scrape_ts,compound,neg,neu,pos,sentiment,sentiment_aws_type,sentiment_aws_pos,sentiment_aws_neg,sentiment_aws_neu,sentiment_aws_mix
0,Klay in 2019 too. Danny Green should have made...,38,https://www.reddit.com/r/nba/comments/rv9ain/w...,BEE_REAL_,Raptors2,:tor-2: Raptors,false,2022-01-03,2022-01-03 18:47:46.692418,0.0000,0.000,1.000,0.000,0,NEUTRAL,0.078710,0.097212,0.824001,0.000077
1,Ben shoulda had more DPOYs not less,14,https://www.reddit.com/r/nba/comments/rv9ain/w...,yodeadasss,Pistons2,:det-2: [DET] Ben Wallace,false,2022-01-03,2022-01-03 18:47:46.692418,0.0000,0.000,1.000,0.000,0,NEUTRAL,0.308084,0.330782,0.350957,0.010176
2,People tell me all the time there's no way Dun...,1,https://www.reddit.com/r/nba/comments/rv9ain/w...,priuschic,,,false,2022-01-03,2022-01-03 18:47:46.692418,-0.5245,0.132,0.784,0.084,0,NEGATIVE,0.110028,0.809508,0.078554,0.001910
3,> 1 or 2 of Ben Wallace’s DPOYs might fall int...,1,https://www.reddit.com/r/nba/comments/rv9ain/w...,faithfuljohn,Raptors4,Raptors,false,2022-01-03,2022-01-03 18:47:46.692418,0.9123,0.000,0.722,0.278,1,POSITIVE,0.716140,0.025100,0.220089,0.038672
4,“Drew Bledsoe” - Terry Rozier,40,https://www.reddit.com/r/nba/comments/rv9ain/w...,KtheMenace,,,false,2022-01-03,2022-01-03 18:47:46.692418,0.0000,0.000,1.000,0.000,0,NEUTRAL,0.002259,0.002292,0.995445,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,our sub was so furious because they wanted bou...,429,https://www.reddit.com/r/nba/comments/rupuok/j...,Strange1130,Thunder1,:okc-1: Thunder,false,2022-01-03,2022-01-03 18:47:46.692418,-0.4481,0.137,0.782,0.081,0,NEGATIVE,0.011188,0.714125,0.271776,0.002911
996,Scottie Barnes and Josh Giddey were both huge ...,79,https://www.reddit.com/r/nba/comments/rupuok/j...,AmazingBike,SuperSonics1,:sea-1: Supersonics,false,2022-01-03,2022-01-03 18:47:46.692418,0.8720,0.000,0.705,0.295,1,POSITIVE,0.823305,0.012191,0.164199,0.000304
997,I’m almost positive this is who we wanted when...,13,https://www.reddit.com/r/nba/comments/rupuok/j...,EfficientPlane,Grizzlies1,Grizzlies,false,2022-01-03,2022-01-03 18:47:46.692418,0.6815,0.000,0.772,0.228,1,POSITIVE,0.848764,0.031067,0.108083,0.012087
998,Giddeysexuals unite,162,https://www.reddit.com/r/nba/comments/rupuok/j...,dizZzy5,Suns2,Suns,false,2022-01-03,2022-01-03 18:47:46.692418,0.0000,0.000,1.000,0.000,0,NEUTRAL,0.004235,0.035950,0.944394,0.015421


In [35]:
# Work on a copy so df itself is not modified by this cell.
df2 = df.copy()

# The raw Comprehend responses are only available while df still carries the
# 'sentiment_aws' column. The sentiment cell above drops that column after
# expanding it, so on a top-to-bottom run there is nothing left to expand here
# and indexing the column unconditionally would raise KeyError.
if 'sentiment_aws' in df2.columns:
    # Flatten once and align on df2's index rather than normalising per column.
    sentiment_flat = pd.json_normalize(df2['sentiment_aws'].tolist())
    sentiment_flat.index = df2.index

    df2['sentiment_aws_type'] = sentiment_flat['Sentiment']
    df2['sentiment_aws_pos'] = sentiment_flat['SentimentScore.Positive']
    df2['sentiment_aws_neg'] = sentiment_flat['SentimentScore.Negative']
    df2['sentiment_aws_neu'] = sentiment_flat['SentimentScore.Neutral']
    df2['sentiment_aws_mix'] = sentiment_flat['SentimentScore.Mixed']
    df2 = df2.drop('sentiment_aws', axis = 1)
df2

Unnamed: 0,comment,score,url,author,flair1,flair2,edited,scrape_date,scrape_ts,compound,neg,neu,pos,sentiment,sentiment_aws_type,sentiment_aws_pos,sentiment_aws_neg,sentiment_aws_neu,sentiment_aws_mix
0,Knicks got their guy,1,https://www.reddit.com/r/nba/comments/rv559c/c...,xHodorx,Celtics3,:bos-3: Celtics,False,2022-01-03,2022-01-03 18:47:46.692418,0.0,0.0,1.0,0.0,0,NEUTRAL,0.083747,0.042949,0.870492,0.002812
1,Cavs looking pretty dang good.,1,https://www.reddit.com/r/nba/comments/rv559c/c...,RowBoatCop36,Bulls1,:chi-1: Bulls,False,2022-01-03,2022-01-03 18:47:46.692418,0.7269,0.0,0.33,0.67,1,POSITIVE,0.997193,0.000216,0.002383,0.000208
2,Poor Rando.,-19,https://www.reddit.com/r/nba/comments/rv559c/c...,Loterygods,Thunder3,:okc-3: [OKC] Josh Giddey,False,2022-01-03,2022-01-03 18:47:46.692418,-0.4767,0.756,0.244,0.0,0,NEGATIVE,0.000242,0.993229,0.006369,0.00016
3,Really got excited thinking Westbrook was head...,-10,https://www.reddit.com/r/nba/comments/rv559c/c...,Produceher,Warriors1,Warriors,False,2022-01-03,2022-01-03 18:47:46.692418,0.3976,0.0,0.749,0.251,1,POSITIVE,0.857657,0.017486,0.124425,0.000432
4,Trade southbrick,0,https://www.reddit.com/r/nba/comments/rv559c/c...,Form_Resident,,,False,2022-01-03,2022-01-03 18:47:46.692418,0.0,0.0,1.0,0.0,0,NEUTRAL,0.008178,0.026332,0.965448,4.3e-05


In [44]:
def get_sentiment_test(text):
    """
    Ask Amazon Comprehend for the sentiment of a single English string.

    Args:
        text: The text to analyse.
    Returns:
        Whatever client.detect_sentiment returns, unmodified.
    """
    return client.detect_sentiment(Text=text, LanguageCode='en')

def get_keyphrases_test(text):
    """
    Ask Amazon Comprehend for the key phrases of a single English string.

    Args:
        text: The text to analyse.
    Returns:
        Whatever client.detect_key_phrases returns, unmodified.
    """
    request = dict(Text=text, LanguageCode='en')
    return client.detect_key_phrases(**request)

def get_pii_test(text):
    """
    Ask Amazon Comprehend for the PII entities found in a single English string.

    Args:
        text: The text to scan.
    Returns:
        Whatever client.detect_pii_entities returns, unmodified.
    """
    detect = client.detect_pii_entities
    return detect(Text=text, LanguageCode='en')

In [20]:
# Informal sample with clashing signals (harsh wording next to "loool :)") used to
# see how Comprehend labels a comment of mixed tone.
test_string = 'this fucking sucks ass man cmon loool :)'

sentiment_scores = get_sentiment_test(test_string)

In [21]:
# Show the raw response: overall label, per-class scores and the HTTP metadata.
sentiment_scores

{'Sentiment': 'MIXED',
 'SentimentScore': {'Positive': 0.1332588791847229,
  'Negative': 0.2640407085418701,
  'Neutral': 0.038309112191200256,
  'Mixed': 0.5643913149833679},
 'ResponseMetadata': {'RequestId': '4a60c89d-4373-4ab0-8a62-074a55e362dd',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4a60c89d-4373-4ab0-8a62-074a55e362dd',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '158',
   'date': 'Wed, 15 Jun 2022 21:25:42 GMT'},
  'RetryAttempts': 0}}

In [51]:
# Sample inputs for the key-phrase and PII helpers.
# NOTE: test_string rebinds the name used by the sentiment sample cell above.
test_string = 'i really liked this product, but the smell is awful, and the price was too high'
test_string2 = 'Tim Johnson was awful in last nights performance'       # PII entity type reported: NAME
test_string3 = '182 W Lake Street, Chicago IL 60601 helooooooooo world' # PII entity type reported: ADDRESS
test_string4 = 'Her salary is $175,000 per Year' # no PII entities reported

# One Comprehend request per sample; the results are inspected in the cells below.
test_phrases = get_keyphrases_test(test_string)
pii_phrases = get_pii_test(test_string2)
pii_phrases_address = get_pii_test(test_string3)
pii_phrases_salary = get_pii_test(test_string4)

In [43]:
# Key phrases found in the product review, each with a score and character offsets.
test_phrases

{'KeyPhrases': [{'Score': 0.9999788999557495,
   'Text': 'this product',
   'BeginOffset': 15,
   'EndOffset': 27},
  {'Score': 0.9999860525131226,
   'Text': 'the smell',
   'BeginOffset': 33,
   'EndOffset': 42},
  {'Score': 0.9999830722808838,
   'Text': 'the price',
   'BeginOffset': 57,
   'EndOffset': 66}],
 'ResponseMetadata': {'RequestId': '2e8c3772-b1fa-4fcb-9855-037d94371c6c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2e8c3772-b1fa-4fcb-9855-037d94371c6c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '259',
   'date': 'Wed, 15 Jun 2022 21:35:07 GMT'},
  'RetryAttempts': 0}}

In [46]:
# PII result for the sentence containing a person's name.
pii_phrases

{'Entities': [{'Score': 0.9999963045120239,
   'Type': 'NAME',
   'BeginOffset': 0,
   'EndOffset': 11}],
 'ResponseMetadata': {'RequestId': '62f87350-b14f-4133-843c-92fe5e719e9c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '62f87350-b14f-4133-843c-92fe5e719e9c',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '88',
   'date': 'Wed, 15 Jun 2022 21:37:55 GMT'},
  'RetryAttempts': 0}}

In [48]:
# PII result for the sentence containing a street address.
pii_phrases_address

{'Entities': [{'Score': 0.9850000143051147,
   'Type': 'ADDRESS',
   'BeginOffset': 0,
   'EndOffset': 35}],
 'ResponseMetadata': {'RequestId': 'c1fd3270-ff9a-4236-8654-e9fdc3337c8b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c1fd3270-ff9a-4236-8654-e9fdc3337c8b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '91',
   'date': 'Wed, 15 Jun 2022 21:38:29 GMT'},
  'RetryAttempts': 0}}

In [52]:
# PII result for the salary sentence; the saved output shows an empty Entities list.
pii_phrases_salary

{'Entities': [],
 'ResponseMetadata': {'RequestId': '9f6262f6-51db-4256-a65c-397794408fc3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9f6262f6-51db-4256-a65c-397794408fc3',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '15',
   'date': 'Wed, 15 Jun 2022 21:39:19 GMT'},
  'RetryAttempts': 0}}