In [94]:
import json
from random import shuffle
from scraper.reddit_scraper import Scraper
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
import nltk
import re


In [153]:
# Notebook configuration: the credentials file to read and the id of the
# Reddit submission whose comments are analysed.
cred_file: str = "creds.json"
submission_id: str = "qo95nt"

In [154]:
# Load the credentials from the JSON file named by `cred_file`.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default encoding.
with open(cred_file, encoding="utf-8") as f:
    creds = json.load(f)

In [155]:
# Create the Scraper object for the configured submission and connect to Reddit.
# NOTE(review): Scraper is imported from the project-local scraper.reddit_scraper
# module; how connect() uses `creds` is not visible from this notebook.
scraper = Scraper(creds, submission_id)
scraper.connect()

In [156]:
# Get the top-level comments for the provided submission.
# NOTE(review): later cells call .split() on each item and pass each item to the
# sentiment analyzer, so every comment is presumably a plain string -- confirm
# against Scraper.get_comments().
comments = scraper.get_comments()

In [157]:
# Analysis setup
# Single shared SentimentIntensityAnalyzer (from nltk.sentiment); sentiment()
# reads its polarity scores.
sia = SentimentIntensityAnalyzer()

def sentiment(comment: str) -> float:
    """Return the 'compound' entry of the analyzer's polarity scores for ``comment``."""
    scores = sia.polarity_scores(comment)
    return scores['compound']

def is_positive(comment: str) -> bool:
    """Tell whether ``comment`` has a strictly positive compound sentiment score.

    A score of exactly 0 is not greater than 0, so it yields False.
    """
    score = sentiment(comment)
    return score > 0

class Word_Replacement(object):
    """Collapse repeated characters in a word until WordNet recognises it.

    Intended for elongated spellings: one character of a doubled pair is
    removed at a time, stopping as soon as ``wordnet.synsets`` returns a
    non-empty result for the current spelling or no doubled pair is left.
    """

    def __init__(self):
        # Matches a pair of identical adjacent word characters (group 2 and its
        # backreference), capturing the word characters on either side.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Rebuilds the match keeping only one character of the doubled pair.
        self.repl = r'\1\2\3'

    def replace_rep(self, word):
        """Return ``word`` with repeated characters reduced.

        Args:
            word: The token to normalise.

        Returns:
            The first spelling, starting from ``word`` itself, for which
            ``wordnet.synsets`` is non-empty; if none is found, the spelling
            left once the pattern can no longer shorten it.
        """
        # Iterative on purpose: each pass may remove only a single character,
        # so a recursive version needs one stack frame per removed character
        # and raises RecursionError on heavily elongated words.
        candidate = word
        while not wordnet.synsets(candidate):
            shortened = self.repeat_regexp.sub(self.repl, candidate)
            if shortened == candidate:
                # No doubled characters left to collapse.
                break
            candidate = shortened
        return candidate
        
        

In [165]:
# Data cleanup
# Stop words to drop: NLTK's English list plus two words added by hand.
stopwords = nltk.corpus.stopwords.words("english") + ["like", "get"]
# Set copy of the list so the per-token membership test below is a hash lookup
# rather than a scan of the whole list.
stopword_set = set(stopwords)
rep_word = Word_Replacement()

# Every whitespace-separated token of every comment, lower-cased, in one flat list.
words = [token.lower() for comment in comments for token in comment.split()]
# Keep purely alphabetic tokens that are not stop words, then collapse repeated
# characters. Tokens with punctuation attached fail isalpha() and are dropped.
filtered_words = [
    rep_word.replace_rep(w) for w in words if w.isalpha() and w not in stopword_set
]

# Various algorithms to reduce each word to a base form.
porter = nltk.stem.PorterStemmer()
porter_words = [porter.stem(w) for w in filtered_words]

lancaster = nltk.stem.LancasterStemmer()
lancaster_words = [lancaster.stem(w) for w in filtered_words]

snowball = nltk.stem.SnowballStemmer('english')
snowball_words = [snowball.stem(w) for w in filtered_words]

lemmatizer = nltk.stem.WordNetLemmatizer()
lemma_words = [lemmatizer.lemmatize(w) for w in filtered_words]

# Shallow copy of the raw comments; the sentiment tally works on this list.
cleaned = list(comments)

In [166]:
# Analysis time!

# Number of positive vs. negative comments.
# NOTE: is_positive() is False for a compound score of exactly 0, so the
# "negative" tally also contains neutral comments.
pos = sum(1 for c in cleaned if is_positive(c))
neg = len(cleaned) - pos

print("\ntotal: %d negative: %d positive: %d"%(len(cleaned), neg, pos))


total: 2162 negative: 1350 positive: 812


In [167]:
# Words used most frequently: one table per normalisation (unstemmed, Porter,
# Lancaster, Snowball, WordNet lemmas), in that order.
# FreqDist.tabulate() prints the table itself and returns None, so it is called
# directly -- wrapping it in print() emits a stray "None" line after each table.
for word_list in (filtered_words, porter_words, lancaster_words, snowball_words, lemma_words):
    nltk.FreqDist(word_list).tabulate(5)

   fuck fucking    game    good     lol 
    123      99      91      87      74 
None
fuck play game look good 
 228  111  109   93   87 
None
fuck play  gam look good 
 230  132  109   93   87 
None
fuck play game look good 
 228  111  109   93   87 
None
   fuck    game fucking    good    goal 
    124     109      99      87      85 
None


In [161]:
# Word combinations: most frequent 2-, 3- and 4-grams of the lemmatized words.
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(lemma_words)
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(lemma_words)
quadgram_finder = nltk.collocations.QuadgramCollocationFinder.from_words(lemma_words)

# Tabulate the top entries per n-gram size: 5 bigrams, 4 trigrams, 3 quadgrams.
for finder, top_n in ((bigram_finder, 5), (trigram_finder, 4), (quadgram_finder, 3)):
    finder.ngram_fd.tabulate(top_n)

  ('look', 'like')   ('holy', 'fuck')   ('feel', 'like') ('fuck', 'boston')    ('love', 'see') 
                18                 15                 13                 13                 12 
('fucking', 'good', 'fucking')    ('good', 'fucking', 'good')        ('call', 'ice', 'goal')      ('game', 'get', 'scored') 
                             3                              3                              3                              2 
('fucking', 'good', 'fucking', 'good')   ('mcavoy', 'wear', 'little', 'halo')     ('wear', 'little', 'halo', 'head') 
                                     3                                      2                                      2 


In [164]:
# Context: sentiment of every concordance line around the stem of "refs".
# NOTE(review): the concordance is built from the Lancaster-stemmed words, so the
# analyzer scores stemmed text here rather than the original comments.
refs_stem = lancaster.stem("refs")
refs_concordance = nltk.Text(lancaster_words).concordance_list(refs_stem, lines=None)
print(len(refs_concordance), "total occurrences,", end=" ")

# NOTE: this rebinds `pos` / `neg`, which the per-comment tally cell also assigns.
pos = sum(1 for entry in refs_concordance if is_positive(entry.line))
neg = len(refs_concordance) - pos

print(pos, "positive", neg, "negative")

51 total occurrences, 13 positive 38 negative
