In [1]:
import os
import sys
from datetime import datetime, timezone, timedelta
from urllib.request import urlopen
from urllib.error import HTTPError
import logging
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import pymysql
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError
# import pyarrow
import awswrangler as wr
# from urllib.error import URLError, HTTPError
from utils import *
from praw.models import MoreComments
import nltk

# Send DEBUG-and-above log records to example.log with a timestamp prefix.
logging.basicConfig(
    filename='example.log',
    level=logging.DEBUG,
    format='%(asctime)s %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
)
logging.info('Starting Logging Function')

# Read the clock once and derive every date part from that single reading, so that
# `yesterday`, `day`, `month` and `year` always agree even if the cell runs at midnight.
today = datetime.now().date()
yesterday = today - timedelta(1)
day = yesterday.day
month = yesterday.month
year = yesterday.year
season_type = 'Regular Season'

In [None]:
# Corpora, taggers, tokenizer and lexicon fetched for the experiments in this notebook.
NLTK_RESOURCES = [
    "names", "stopwords", "state_union", "twitter_samples",
    "movie_reviews", "averaged_perceptron_tagger", "vader_lexicon", "punkt",
]
nltk.download(NLTK_RESOURCES)

In [None]:
# Keep only purely alphabetic tokens, which removes punctuation.
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]
stopwords = nltk.corpus.stopwords.words("english")
# A frozenset gives constant-time lookups; testing against the list would scan it once per token.
stopword_lookup = frozenset(stopwords)
words = [w for w in words if w.lower() not in stopword_lookup]  # remove stopwords

In [None]:
from pprint import pprint

# A short sample paragraph, tokenized directly without building a corpus first.
text = """
For some quick analysis, creating a corpus could be overkill.
If all you need is a word list,
there are simpler ways to achieve that goal."""
tokens = nltk.word_tokenize(text)
pprint(tokens, width=79, compact=True)

In [None]:
# Tokenize the sample paragraph and count how often each token occurs.
# NOTE: `text` must still be the sample string here; a later cell rebinds the
# name to an nltk.Text object, so re-running this cell out of order will not work.
words = nltk.word_tokenize(text)
fd = nltk.FreqDist(words)

In [None]:
# The three most frequent tokens together with their counts.
fd.most_common(3)

In [None]:
# The same top three entries, printed as a small table.
fd.tabulate(3)

In [None]:
# Case-insensitive frequency distribution.
# Iterating `fd` directly yields every distinct token exactly once, which would throw the
# counts away, so add each token's count onto its lowercase key instead.
lower_fd = nltk.FreqDist()
for token, count in fd.items():
    lower_fd[token.lower()] += count

In [None]:
# NOTE: this rebinds `text` (previously the sample string) to an nltk.Text
# wrapper around the State of the Union corpus.
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5) # print up to 5 occurrences of the word with their surrounding context


In [None]:
# concordance_list() hands the matches back as objects instead of printing them,
# so each entry's context line can be processed individually.
concordance_list = text.concordance_list("america", lines=2)
for entry in concordance_list:
    print(entry.line)

In [None]:
# Three lines of the Zen of Python serve as a tiny sample to tokenize.
zen_excerpt = """Beautiful is better than ugly.
    Explicit is better than implicit.
    Simple is better than complex."""
words = nltk.word_tokenize(zen_excerpt)
text = nltk.Text(words)
# vocab() yields the same distribution as nltk.FreqDist(words).
fd = text.vocab()
fd.tabulate(3)

Collocations can be made up of two or more words. NLTK provides classes to handle several types of collocations:

- **Bigrams**: frequent two-word combinations
- **Trigrams**: frequent three-word combinations
- **Quadgrams**: frequent four-word combinations


In [None]:
# Restrict to alphabetic tokens so punctuation cannot become part of an n-gram.
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)  # find trigrams
# Print the two most frequent trigrams. A bare most_common(2) placed mid-cell would have
# its result discarded, so tabulate() alone is what produces the visible output.
finder.ngram_fd.tabulate(2)

In [None]:
# isalpha() keeps only purely alphabetic tokens, dropping punctuation and numbers.
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]  # the words to analyze, as a list
finder = nltk.collocations.BigramCollocationFinder.from_words(words)  # find bigrams
# Print the two most frequent bigrams. A bare most_common(2) placed mid-cell would have
# its result discarded, so tabulate() alone is what produces the visible output.
finder.ngram_fd.tabulate(2)

In [14]:
# VADER = Valence Aware Dictionary and sEntiment Reasoner, a pretrained scorer
# backed by the "vader_lexicon" resource downloaded earlier.
# polarity_scores() returns neg / neu / pos shares (which add up to 1) plus a "compound" score.
# Author's note: works well on social-media text with slang, broken sentences, @ symbols, etc.
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [20]:
# Negation check: a "not that bad" phrasing should not come out as negative.
sia.polarity_scores("this product is really not that bad")

{'neg': 0.0, 'neu': 0.658, 'pos': 0.342, 'compound': 0.4791}

In [26]:
# Short phrase containing two positive adjectives.
sia.polarity_scores("this product good to great")

{'neg': 0.0, 'neu': 0.3, 'pos': 0.7, 'compound': 0.7906}

In [30]:
# Tweet-style input: an @mention plus a URL written without its colon.
sia.polarity_scores("@GAPonsonby Yes they would They've done it before  https//t.co/qISmXisGhJ")

{'neg': 0.0, 'neu': 0.748, 'pos': 0.252, 'compound': 0.4019}

In [27]:
# Load the sample tweets, collapsing "://" to "//" in each one.
tweets = []
for raw_tweet in nltk.corpus.twitter_samples.strings():
    tweets.append(raw_tweet.replace("://", "//"))


In [28]:
from random import shuffle


def is_positive(tweet: str) -> bool:
    """Report whether VADER gives the tweet a compound score above zero."""
    compound = sia.polarity_scores(tweet)["compound"]
    return compound > 0


# Shuffle in place, then show the verdict for the first ten tweets.
shuffle(tweets)
for sample in tweets[:10]:
    print(">", is_positive(sample), sample)

> True RT @EDPC7: #UKIP the common sense party who speak the truth and the truth hurts #Cameron #Miliband and #clegg 

#GE2015
> False @menilleyble ate menille i need youuuu :((((
> False @MSmithsonPB Last 20 polls Mean|Mode|Median ave:
GRN  5.0%|5%|5%8%%%
> False RT @jreynoldsMP: Missed the start. Did Cameron also bring a note reminding everyone Tories were pledged to match Lab's spending plans pre-f…
> False RT @hedkandikid: It's sad that people think Nicola Sturgeon represents Scotland. She doesn't. Well maybe 37% of it. #SNPout
> False @Kellipage17 I've only been a fan of 5SOS since early June last year :( And I really hate that… I get so upset over it.
Not that my parents
> False RT @standuptoUKIP: Farage's speech notes for #AskNigelFarage tonight? #GE2015 http//t.co/C1TldhbJoG
> False RT @WalesOnline: Will it come back to haunt him? Miliband rules out a deal with the SNP after May 7
http//t.co/bOs668mLNj http//t.co/lVje…
> True @GAPonsonby Yes they would They've done it before  h

In [31]:
# File ids of the labelled movie reviews, per category and combined.
movie_reviews = nltk.corpus.movie_reviews
positive_review_ids = movie_reviews.fileids(categories=["pos"])
negative_review_ids = movie_reviews.fileids(categories=["neg"])
all_review_ids = [*positive_review_ids, *negative_review_ids]

In [32]:
from statistics import mean


def is_positive(review_id: str) -> bool:
    """Classify a movie review as positive from its mean sentence score.

    Args:
        review_id: File id within the NLTK ``movie_reviews`` corpus.

    Returns:
        True when the mean VADER compound score over the review's sentences is
        above zero; False otherwise, including when the review yields no sentences.
    """
    text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(text)
    ]
    # mean() raises StatisticsError on an empty sequence; a review without any
    # sentences offers no positive evidence, so report it as not positive.
    if not scores:
        return False
    return mean(scores) > 0

In [33]:
shuffle(all_review_ids)

# Sets make each membership test constant-time; the id lists would be scanned
# from the start for every review.
positive_lookup = set(positive_review_ids)
negative_lookup = set(negative_review_ids)

correct = 0
for review_id in all_review_ids:
    # A prediction is correct when the review belongs to the category it was assigned.
    predicted_category = positive_lookup if is_positive(review_id) else negative_lookup
    if review_id in predicted_category:
        correct += 1


print(f"{correct / len(all_review_ids):.2%} correct")

64.00% correct


In [34]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])
# Frozen copy for constant-time lookups: the predicate below runs once per corpus token,
# and scanning the full list each time makes this cell needlessly slow.
_unwanted_lookup = frozenset(unwanted)


def skip_unwanted(pos_tuple):
    """Filter predicate over (word, tag) pairs; returns True for pairs to KEEP.

    A pair is rejected when the word is not purely alphabetic, when the word is
    in `unwanted` (stopwords plus lowercased names), or when its part-of-speech
    tag starts with "NN".
    """
    word, tag = pos_tuple
    if not word.isalpha() or word in _unwanted_lookup:
        return False
    if tag.startswith("NN"):
        return False
    return True


def _filtered_words(category):
    """Return the words of one movie-review category that pass skip_unwanted."""
    tagged = nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=[category]))
    return [word for word, tag in filter(skip_unwanted, tagged)]


positive_words = _filtered_words("pos")
negative_words = _filtered_words("neg")

In [35]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Remove every word that occurs in both distributions, leaving only
# words exclusive to one category.
common_set = set(positive_fd) & set(negative_fd)
for shared_word in common_set:
    del positive_fd[shared_word], negative_fd[shared_word]

# The 100 most frequent category-exclusive words, as sets.
top_100_positive = {pair[0] for pair in positive_fd.most_common(100)}
top_100_negative = {pair[0] for pair in negative_fd.most_common(100)}

In [36]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])
# Frozen copy for constant-time lookups; the list would be scanned once per corpus token.
_unwanted_for_bigrams = frozenset(unwanted)


def _bigram_finder(category):
    """Build a bigram finder over one review category's alphabetic words not in `unwanted`."""
    return nltk.collocations.BigramCollocationFinder.from_words([
        w for w in nltk.corpus.movie_reviews.words(categories=[category])
        if w.isalpha() and w not in _unwanted_for_bigrams
    ])


positive_bigram_finder = _bigram_finder("pos")
negative_bigram_finder = _bigram_finder("neg")

In [2]:
from utils import sql_connection

# Open a connection to the nba_source schema and load the full Reddit comment table.
conn = sql_connection('nba_source')
df = pd.read_sql(sql='SELECT * FROM aws_reddit_comment_data_source;', con=conn)

SQL Connection to schema: nba_source Successful


In [3]:
# Work on a copy so the frame loaded from SQL stays unmodified.
df2 = df.copy()

In [5]:
# VADER is a pretrained, ready-to-use sentiment scorer.
# Author's notes: "<3" and punctuation enhance sentiment; words after @ and # don't count
# because they are treated as social-media handles/tags.
from nltk.sentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Score every comment exactly once and spread the resulting dict across the four columns;
# calling polarity_scores separately per column would repeat the same work four times.
score_columns = ['compound', 'neg', 'neu', 'pos']
comment_scores = pd.DataFrame(
    [analyzer.polarity_scores(comment) for comment in df2['comment']],
    columns=score_columns,
)
for column in score_columns:
    # to_numpy() assigns by position (like a plain list), independent of df2's index.
    df2[column] = comment_scores[column].to_numpy()

# Binary label: 1 when the compound score is strictly positive, otherwise 0.
df2['sentiment'] = np.where(df2['compound'] > 0, 1, 0)

In [None]:
# Write the scored frame back under the same table name it was loaded from.
# if_exists='replace' drops the existing table first, so its previous contents are overwritten.
df2.to_sql(con = conn, name = 'aws_reddit_comment_data_source', if_exists = 'replace', index = False)

In [6]:
# Column layout of the scored Reddit comment table: scraped fields first, VADER fields after.
reddit_comment_cols = [
    'comment', 'score', 'url', 'scrape_date', 'scrape_ts',
    'compound', 'neg', 'neu', 'pos', 'sentiment',
]

Index(['comment', 'score', 'url', 'scrape_date', 'scrape_ts', 'compound',
       'neg', 'neu', 'pos', 'sentiment'],
      dtype='object')

In [7]:
# Expected columns of the scored Reddit comment table, one per line.
reddit_comment_cols = [
    'comment',
    'score',
    'url',
    'scrape_date',
    'scrape_ts',
    'compound',
    'neg',
    'neu',
    'pos',
    'sentiment',
]