In [40]:
import json
import pandas as pd

from random import shuffle
from scraper.reddit_scraper import Scraper
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
import nltk
import re
import datetime

In [55]:
# Load data from file
filename = "data/qo95nt_20211110093614503123.json"

df = pd.read_json(filename).T
print("shape:", df.shape)
df.tail()

shape: (2139, 4)


Unnamed: 0,author,body,score,time
hjmmzs1,co7nw,I can appreciate and recognize talent. Hate wh...,1,1636252094.0
hjmc5sj,3w83nxpr,"Well that last goal, kind of makes this convo ...",1,1636246832.0
hjmccym,1523af,Fair enough. Save yourself the absolute bewild...,1,1636246928.0
hjmbq65,zn07m,I usually order by what looks good on other pe...,1,1636246627.0
hjmcmih,zn07m,"We had the convo, which the hockey gods decide...",1,1636247055.0


In [56]:
#Analysis setup
sia = SentimentIntensityAnalyzer()

def sentiment(comment: str):
    return sia.polarity_scores(comment)['compound']

def is_positive(comment: str):
    return sentiment(comment) > 0;

In [97]:
# Data cleanup
stopwords = nltk.corpus.stopwords.words("english") + ["like", "get"]
snowball = nltk.stem.SnowballStemmer('english')

#words = [j.lower() for i in [w.split() for w in comments] for j in i]
df['words'] = [token.split() for token in df['body']]

g = []
for comment in df['words']:
    l = []
    for token in comment:
        if token.isalpha() and token not in stopwords:
            l.append(snowball.stem(token.lower()))
    g.append(l)

df['cleaned'] = g

words = [j for i in [w for w in df['cleaned'].tolist()] for j in i]

df.head()

# various algorithms to get the base of each
#snowball_words = [snowball.stem(w) for w in df['cleaned']]

Unnamed: 0,author,body,score,time,words,cleaned
hjmbq2n,2wtn1y1m,Auston Matthews scoring a powerplay goal on a ...,45,1636246625.0,"[Auston, Matthews, scoring, a, powerplay, goal...","[auston, matthew, score, powerplay, goal, marc..."
hjmd23t,41bt33yx,Yo. Jack Campbell keeps the record. Canes lost...,43,1636247261.0,"[Yo., Jack, Campbell, keeps, the, record., Can...","[jack, campbel, keep, cane, lost]"
hjmi3w6,5i88ndky,Beat the bruins and as soon as it cuts to the ...,40,1636249695.0,"[Beat, the, bruins, and, as, soon, as, it, cut...","[beat, bruin, soon, cut, hab, game, score, good]"
hjmi4ch,5aceky7e,Hell week results: Leafs 3-0.,36,1636249701.0,"[Hell, week, results:, Leafs, 3-0.]","[hell, week, leaf]"
hjliwp1,l8jzj,First game I’m going to in like 8 years. LFG GLG,34,1636233429.0,"[First, game, I’m, going, to, in, like, 8, yea...","[first, game, go, lfg, glg]"


In [112]:
#Analysis Time!

#number of positive vs negative comments
pos = 0
neg = 0

for c in df['cleaned']:
    if is_positive(" ".join(c)): pos += 1 
    else: neg += 1

print("\ntotal: %d negative: %d positive: %d"%(len(df['cleaned']), neg, pos)) 


total: 2139 negative: 1494 positive: 645


In [101]:
#Words used most frequently
print(nltk.FreqDist(words).tabulate(10))

   i fuck play game look good   go goal  the leaf 
 294  227  110  109   92   85   85   84   82   75 
None


In [103]:
#Word combinations
item = words

bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(item)
bigram_finder.ngram_fd.tabulate(5)

trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(item)
trigram_finder.ngram_fd.tabulate(4)

quadgram_finder = nltk.collocations.QuadgramCollocationFinder.from_words(item)
quadgram_finder.ngram_fd.tabulate(3)

  ('i', 'think')    ('i', 'love') ('holi', 'fuck')  ('love', 'see')    ('i', 'know') 
              28               25               15               14               13 
 ('grade', 'a', 'chanc') ('fuck', 'good', 'fuck') ('good', 'fuck', 'good')  ('call', 'ice', 'goal') 
                       4                        3                        3                        3 
   ('fuck', 'good', 'fuck', 'good') ('mcavoy', 'wear', 'littl', 'halo')   ('wear', 'littl', 'halo', 'head') 
                                  3                                   2                                   2 


In [110]:
#Words in context. Should be interesting to see the result of this for a variety of words
words_of_interest = ['refs', 'marner', 'matthews', 'tavares', 'nylander']

for word in words_of_interest:
    concordance = nltk.Text(words).concordance_list(snowball.stem(word), lines=None)
    print(word, ":", len(concordance), "total occurrences,", end=" ")
    pos = 0
    neg = 0
    for entry in concordance:
        if is_positive(entry.line): pos += 1
        else: neg += 1

    print(pos, "positive", neg, "negative")

refs : 50 total occurrences, 20 positive 30 negative
marner : 38 total occurrences, 22 positive 16 negative
matthews : 49 total occurrences, 27 positive 22 negative
tavares : 13 total occurrences, 4 positive 9 negative
nylander : 4 total occurrences, 3 positive 1 negative
