In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
import math
import random
import pickle
from statistics import mode
import praw

In [2]:
#Loads the picled training data from the other module
documents_f = open("pickled_outputs/documents.pickle", "rb")
documents = pickle.load(documents_f)
documents_f.close()

#Loads the features from the data
vocab_features_f = open("pickled_outputs/vocab_features.pickle", "rb")
vocab_features = pickle.load(vocab_features_f)
vocab_features_f.close()

#Loads the fatureset
featuresets_f = open("pickled_outputs/featuresets.pickle", "rb")
featuresets = pickle.load(featuresets_f)
featuresets_f.close()

In [3]:
#Loads the naieve bayes
open_NB_file = open("pickled_outputs/naivebayes.pickle", "rb")
naive_bayes_classifier = pickle.load(open_NB_file)
open_NB_file.close()

#Loads the MNB
open_MNB_file = open("pickled_outputs/MNB_classifier.pickle", "rb")
MNB_classifier = pickle.load(open_MNB_file)
open_MNB_file.close()

#Loads the BNB
open_BNB_file = open("pickled_outputs/BNB_classifier.pickle", "rb")
BNB_classifier = pickle.load(open_BNB_file)
open_BNB_file.close()

#Loads the Logistic Regression
open_LR_file = open("pickled_outputs/LogisticRegression_classifier.pickle", "rb")
LogisticRegression_classifier = pickle.load(open_LR_file)
open_LR_file.close()

#Loads the SGDC
open_SGDC_file = open("pickled_outputs/SGDClassifier_classifier.pickle", "rb")
SGDClassifier_classifier = pickle.load(open_SGDC_file)
open_SGDC_file.close()

#Loads the SVC
open_SVC_file = open("pickled_outputs/SVC_classifier.pickle", "rb")
SVC_classifier = pickle.load(open_SVC_file)
open_SVC_file.close()


#Loads the Linear SVC
open_LSVC_file = open("pickled_outputs/LinearSVC_classifier.pickle", "rb")
LinearSVC_classifier = pickle.load(open_LSVC_file)
open_LSVC_file.close()

#NUSVC
open_NUSVC_file = open("pickled_outputs/NuSVC_classifier.pickle","rb")
NuSVC_classifier = pickle.load(open_NUSVC_file)
open_NUSVC_file.close()

In [4]:
#Find feature function from the previous notebook
def find_features(comment):
    '''
    Params:
        document: our comment and sentiment
    Return:
        the features, a dictionary of words in the comment mapped to sentiment
    '''
    words = comment
    features = {}

    for w in vocab_features:
        features[w] = (w in words)

    return features

In [5]:
print("Number of featuresets: ", len(featuresets))
cutoff = round(len(featuresets) * 0.8)
training_set = featuresets[:cutoff]
testing_set = featuresets[cutoff+1:]

Number of featuresets:  3674


In [6]:
#Voting Class from previous notebook
class VotingClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for classifier in self._classifiers:
            vote = classifier.classify(features)
            votes.append(vote)
        return mode(votes)

    def evaluate_confidence(self, features):
        votes = []
        for classifier in self._classifiers:
            vote = classifier.classify(features)
            votes.append(vote)
            
        choice_votes = votes.count(mode(votes))
        confidence = choice_votes / len(votes)
        return confidence

In [7]:
#Voting Classifier from previous notebook
voting_classifier = VotingClassifier(naive_bayes_classifier, MNB_classifier, BNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, SVC_classifier, LinearSVC_classifier, NuSVC_classifier)

In [8]:
def define_sentiment(comment):
    comment_features = find_features(comment)
    return voting_classifier.classify(comment_features), voting_classifier.evaluate_confidence(comment_features)

In [9]:
define_sentiment("The market is great great today, I am super happy, I love money")


(1, 0.5)

In [10]:
test = define_sentiment("I hate the down market its fucking bullshit, it is so negative I lost everything goodbye")
print(type(test[0]))

<class 'int'>


In [11]:
#Connect to a reddit account
#Reddit account username: group_20_final_proj
#Password: sharedaccount
#Client ID: hMwEYlt_j8npjE5uqz1mcw
#Seceret: mMt6cgv2tjF1VYZTT7c_BkmdvXOqVA

#Connect to Reddit
reddit = praw.Reddit(
    client_id="hMwEYlt_j8npjE5uqz1mcw",
    client_secret="mMt6cgv2tjF1VYZTT7c_BkmdvXOqVA",
    user_agent="Group_20",
    username="group_20_final_proj",
    password="sharedaccount",
)
print(reddit.read_only)
# Output: False is correct

False


In [12]:
#Connect to a subreddit
wallstreetbets = reddit.subreddit("wallstreetbets")

print(wallstreetbets.display_name)
#Tests the connection
print(wallstreetbets.title)

wallstreetbets
wallstreetbets


In [13]:
#Cleans the file so it is empty
wsb_output_text = open("wsb_output_text.txt", "w+")
wsb_output_text.close()
wsb_output = open("wsb_output.txt", "w+")
wsb_output.close()

In [14]:
#Streaming the reddit api

#subreddit.stream.comments(skip_existing=True) will start when the stream starts
for comment in wallstreetbets.stream.comments():
    #Opens the documents
    wsb_output = open("wsb_output.txt", "a")
    wsb_output_text = open("wsb_output_text.txt", "a", encoding="utf-8")
    #gets the sentiment value
    sentiment_value, confidence_value = define_sentiment(comment.body)
    
    #Writes the comment and sentiment to the output_text file
    wsb_output_text.write(comment.body + ' ' + ':' + ' ' + str(sentiment_value))
    wsb_output_text.write('\n')

    #Writes the sentiment to the output file
    wsb_output.write(str(sentiment_value))
    wsb_output.write('\n')
    wsb_output.close



KeyboardInterrupt: 