## Import Packages

In [13]:
import pandas as pd
from IPython.core.display import display
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from analysis_util.mining import TickersMining
from analysis_util.sentiment import VaderSentiment

## Add Functions for Use

In [14]:
def combine_subreddit_texts_by_date(subreddits=('investing', 'stocks', 'wallstreetbets'), date='20211111'):
    """Load one day's text CSV for each subreddit and stack them into one frame.

    Each file is read from
    ``input_data/naive_bayes_input/<subreddit>/text_data/<subreddit>_text_data_<date>.csv``
    relative to the current working directory.

    Args:
        subreddits: Iterable of subreddit names to load.
        date: Date stamp used in the file names (e.g. ``'20211111'``).

    Returns:
        A DataFrame holding the rows of every file, in ``subreddits`` order, under
        a fresh 0..n-1 index. An empty DataFrame when ``subreddits`` is empty.
    """
    frames = [
        pd.read_csv('input_data/naive_bayes_input/' + subreddit + '/text_data/'
                    + subreddit + '_text_data_' + date + '.csv')
        for subreddit in subreddits
    ]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated rows once per subreddit.
    return pd.concat(frames, ignore_index=True)


def add_bearish_or_bullish_class(df, stock_data):
    """Label each ticker row 'bull' or 'bear' from the matching stock row's net change.

    ``df`` is inner-joined to ``stock_data`` on ``df['tickers'] == stock_data['symbol']``,
    so tickers without a matching symbol are dropped. A row is 'bull' when its
    ``netchange`` is strictly greater than zero and 'bear' otherwise.

    Args:
        df: Frame with a ``tickers`` column.
        stock_data: Frame containing ``symbol``, ``netchange`` and the other stock
            columns listed below; all of the listed columns must be present.

    Returns:
        The merged frame with a new ``class`` column and the listed stock columns
        removed. Any other ``stock_data`` columns are kept.
    """
    stock_only_columns = ['symbol', 'lastsale', 'netchange', 'volume', 'marketCap', 'ipoyear', 'industry',
                          'sector', 'url', 'country', 'name', 'pctchange']

    def label_for(net_change):
        if net_change > 0:
            return 'bull'
        return 'bear'

    labelled = df.merge(stock_data, how='inner', left_on='tickers', right_on='symbol')
    labelled['class'] = labelled['netchange'].apply(label_for)
    return labelled.drop(columns=stock_only_columns)


def clean_tickers(sentiment_data):
    """Expand '|'-separated ticker strings into one output row per ticker.

    Every input row whose ``tickers`` value is a string is split on ``'|'``; each
    resulting ticker that is not in the blocklist yields a copy of the row with
    ``tickers`` set to that single ticker. Rows whose ``tickers`` value is not a
    string (e.g. NaN) are skipped.

    Args:
        sentiment_data: Frame with a ``tickers`` column plus the per-text columns
            to carry over.

    Returns:
        A new DataFrame with a 0..n-1 index. Columns are ``id, type, text,
        negative, neutral, positive, compound, tickers`` followed by any other
        columns of ``sentiment_data``; listed columns absent from the input are
        filled with NaN.
    """
    # Symbols excluded from the output.
    to_remove = {"A", "DD", "SE", "TA", "SC"}
    base_columns = ['id', 'type', 'text', 'negative', 'neutral', 'positive', 'compound', 'tickers']
    columns = base_columns + [col for col in sentiment_data.columns if col not in base_columns]

    # Collect plain dicts and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the whole frame on every added row.
    records = []
    for row in sentiment_data.to_dict('records'):
        raw_tickers = row["tickers"]
        if not isinstance(raw_tickers, str):
            continue
        for ticker in raw_tickers.split("|"):
            # Filtering per token drops *every* blocklisted occurrence;
            # list.remove() only removed the first one.
            if ticker in to_remove:
                continue
            records.append({**row, "tickers": ticker})
    return pd.DataFrame(records, columns=columns)

## Initialize Empty DataFrame

In [15]:
# Start from an empty frame; the "Build Main DataFrame" cell below populates it.
main_frame = pd.DataFrame()

## Set the Dates of Files to Construct the Naive Bayes Model
## Feel Free to Change Dates Here for Using Existing Data Files

In [16]:
# Positionally paired date stamps: reddit_dates[i] selects the Reddit text files and
# nasdaq_dates[i] the NASDAQ stock file used with them, so both lists must stay the
# same length. In the lists below every NASDAQ date is the calendar day after its
# Reddit date.
reddit_dates = ['20211111', '20211114', '20211115', '20211116', '20211117', '20211118', '20211121', '20211123',
                '20211124', '20211125', '20211128', '20211129', '20211130']
nasdaq_dates = ['20211112', '20211115', '20211116', '20211117', '20211118', '20211119', '20211122', '20211124',
                '20211125', '20211126', '20211129', '20211130', '20211201']

## Build Main DataFrame

In [17]:
# The two date lists are consumed pairwise; fail loudly on a mismatch instead of
# silently ignoring trailing entries or hitting an IndexError mid-run.
if len(reddit_dates) != len(nasdaq_dates):
    raise ValueError("reddit_dates and nasdaq_dates must have the same length (they are paired by position)")

daily_frames = []
for reddit_date, nasdaq_date in zip(reddit_dates, nasdaq_dates):
    nasdaq_csv = 'input_data/nasdaq/nasdaq_stock_data_' + nasdaq_date + '.csv'

    # Score every Reddit text for the day, then tag each text with the tickers
    # mined from it.
    all_texts = combine_subreddit_texts_by_date(date=reddit_date)
    vader = VaderSentiment()
    sentiment_data = vader.get_sentiment(dataframe=all_texts, show_text=True)
    miner = TickersMining(nasdaq_csv)

    mined_tickers = miner.get_tickers(dataframe=sentiment_data)
    sentiment_data['tickers'] = mined_tickers['tickers']

    # One row per (text, ticker); rows with any empty or missing field are dropped.
    single_tickers = clean_tickers(sentiment_data)
    single_tickers = single_tickers.replace("", float("NaN")).dropna()

    # Average the scores per ticker. numeric_only=True is required on pandas >= 2.0,
    # where mean() raises on the text columns instead of silently dropping them.
    grouped = single_tickers.groupby('tickers').mean(numeric_only=True)
    grouped['tickers'] = grouped.index
    grouped.index.name = None

    with_class = add_bearish_or_bullish_class(grouped, pd.read_csv(nasdaq_csv))
    daily_frames.append(with_class)

# Build the frame once from all days (DataFrame.append was removed in pandas 2.0).
# Assigning rather than extending also makes re-running this cell idempotent.
main_frame = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

## Set up X (features) and Y (Class) from Main DataFrame
## X Contains Each Ticker's Daily Average Negative, Neutral, Positive, and Compound Sentiment Scores from VADER
## X Example: [0.04300,0.18200,0.21150,0.01800]


In [18]:
# Columns of main_frame that are not model inputs, and columns that are not the target.
non_feature_columns = ['tickers', 'class', 'update_dt']
non_label_columns = ['negative', 'neutral', 'positive', 'compound', 'tickers', 'update_dt']

# Feature matrix: every column left after removing the identifiers and the label.
x = main_frame.drop(columns=non_feature_columns).to_numpy()
# Label array; it stays 2-D because it comes from a DataFrame, so the fit cell
# flattens it with ravel().
y = main_frame.drop(columns=non_label_columns).to_numpy()

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
gnb = GaussianNB()

## Test Model

In [19]:
# Fit on the training split, then label the held-out rows.
model = gnb.fit(x_train, y_train.ravel())
y_pred = model.predict(x_test)

# Each y_test row is a one-element array, so compare against its first entry.
counter = sum(1 for predicted, actual in zip(y_pred, y_test) if predicted != actual[0])
total_points = x_test.shape[0]

display("Number of mislabeled points out of a total %d points : %d" % (total_points, counter))
display("Accuracy of predicting testing set: " + str(1-(counter / total_points)))

'Number of mislabeled points out of a total 894 points : 401'

'Accuracy of predicting testing set: 0.5514541387024608'

Traceback (most recent call last):
  File "_pydevd_bundle\pydevd_cython_win32_39_64.pyx", line 1035, in _pydevd_bundle.pydevd_cython_win32_39_64.PyDBFrame.trace_dispatch
  File "C:\Users\nlisichenok\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\213.5744.248\plugins\python\helpers-pro\jupyter_debug\pydev_jupyter_plugin.py", line 144, in cmd_step_over
    if _is_inside_jupyter_cell(frame, pydb):
  File "C:\Users\nlisichenok\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\213.5744.248\plugins\python\helpers-pro\jupyter_debug\pydev_jupyter_plugin.py", line 209, in _is_inside_jupyter_cell
    if is_cell_filename(filename):
  File "C:\Users\nlisichenok\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\213.5744.248\plugins\python\helpers-pro\jupyter_debug\pydev_jupyter_plugin.py", line 220, in is_cell_filename
    ipython_shell = get_ipython()
NameError: name 'get_ipython' is not defined


## Predict Y from X via User Input Using Generated Naive Bayes Model. Example Included.
## List Input is [Negative, Neutral, Positive, Compound]

In [24]:
# Classify one hand-entered feature row with the fitted model (a 2-D list: one row, four values).
model.predict([[0.04400,0.01400,0.02100,0.18350]])

array(['bear'], dtype='<U4')

In [None]:
# Display the assembled frame as the cell output.
main_frame

# Visualization Interface
# Enter the Date of the Latest Available File Below

In [None]:
import datetime as dt
# Date of the most recent data file available; the next cell derives "yesterday" from it.
latest_date_file_available = dt.date(2021, 12, 1)

In [None]:
from IPython.display import Image
from ipywidgets import interact
import datetime

# Day before the latest available file date.
yesterday = latest_date_file_available-datetime.timedelta(days=1)

# Rows of main_frame whose update_dt equals that day.
main_frame_yesterday = main_frame.loc[main_frame['update_dt']==str(yesterday)]

# Distinct dates present in main_frame, in first-seen order, as (label, value)
# pairs for the date dropdown.
dates = main_frame['update_dt'].tolist()
dates = list(dict.fromkeys(dates))
list_of_dates = [(date, date) for date in dates]


def select_date(date):
    """Show a ticker dropdown for ``date`` and display the position for the chosen ticker.

    Args:
        date: A value from ``main_frame['update_dt']``; rows are matched on ``str(date)``.
    """
    main_frame_date = main_frame.loc[main_frame['update_dt']==str(date)]

    # Distinct tickers recorded for the selected date, in first-seen order.
    tickers = list(dict.fromkeys(main_frame_date['tickers'].tolist()))
    list_of_tickers = [(ticker, ticker) for ticker in tickers]

    def show_prediction(ticker):
        # Carry the selected date along so the row shown belongs to that date and
        # not simply to the ticker's most recent entry.
        _show_position(ticker, date)

    interact(show_prediction, ticker=list_of_tickers)


def _show_position(ticker, date=None):
    """Print and illustrate the model's position on ``ticker``, optionally for one date."""
    rows = main_frame.loc[main_frame['tickers']==ticker]
    if date is not None:
        rows = rows.loc[rows['update_dt']==str(date)]
    if rows.empty:
        # Nothing to show; avoid the IndexError iloc[-1] would raise.
        print("No data available for {}".format(ticker))
        return

    line = rows.iloc[-1]
    date_ = line["update_dt"]

    # Ask the fitted model for its label. The stored 'class' column is the label
    # derived from the stock's actual net change, not a prediction. The feature
    # columns mirror how the training matrix was built: everything except the
    # identifier, date and label columns.
    feature_columns = [col for col in main_frame.columns if col not in ('tickers', 'class', 'update_dt')]
    features = line[feature_columns].to_numpy(dtype=float).reshape(1, -1)
    recommended_position = model.predict(features)[0]

    if recommended_position=="bull":
        color = '\033[92m' # GREEN
        position = 'BULLISH'
        file = 'input_data/naive_bayes_input/images/bullish.png'
    else:
        color = '\033[91m' # RED
        position = 'BEARISH'
        file = 'input_data/naive_bayes_input/images/bearish.png'

    print("Our model has a {}\033[1m\033[4m{}\033[0m position on \033[1m{}\033[0m for {}".format(color, position, ticker, date_))
    display(Image(filename=file))


def get_prediction(ticker):
    """Print the model's position on ``ticker`` using its most recent row in ``main_frame``.

    Args:
        ticker: Ticker symbol to look up in ``main_frame['tickers']``.
    """
    _show_position(ticker)

# The Cell Below Shows Predictions for the Most Recent Data Available from the Reddit Web Scraper

In [None]:
# Interactive dropdown of the available dates; each selection calls select_date with the chosen date.
interact(select_date, date=list_of_dates)