## Imports

In [1]:
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys

from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re

### Access and combine datafiles saved in ./data folder into a master dataframe

In [2]:
filenames = ['stockmarket', 'stocks']

masterframe = []
for filename in filenames:
    frame = []
    masterframe.append(frame)
    for filenumber in range(1,21):
        filepath = f'../data/{filename}_{filenumber}.csv'
        file = pd.read_csv(filepath)
        dataframe = file[['subreddit', 'title', 'selftext', 'score', 'created_utc']]
        frame.append(dataframe)
            
assert len(masterframe) == len(filenames)

reddit = []
for i in masterframe:
    data = pd.concat(i)
    reddit.append(data)
reddit_df = pd.concat(reddit)
print(reddit_df['subreddit'].value_counts())

StockMarket    20000
stocks         20000
Name: subreddit, dtype: int64


In [3]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subreddit    40000 non-null  object
 1   title        40000 non-null  object
 2   selftext     38495 non-null  object
 3   score        40000 non-null  int64 
 4   created_utc  40000 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.8+ MB


In [4]:
print(f'Null values = {reddit_df.isna().sum()}')
reddit_df = reddit_df.dropna()

Null values = subreddit         0
title             0
selftext       1505
score             0
created_utc       0
dtype: int64


In [5]:
reddit_df['subreddit'].value_counts()

stocks         19538
StockMarket    18957
Name: subreddit, dtype: int64

## Create datetime column by converting utc to datetime

In [6]:
reddit_df['datetime'] = pd.to_datetime(reddit_df['created_utc'], unit='s')

In [7]:
reddit_df['datetime'].head(1)

0   2021-01-02 00:38:37
Name: datetime, dtype: datetime64[ns]

In [8]:
reddit_df['datetime'].tail(1)

999   2019-01-29 19:42:16
Name: datetime, dtype: datetime64[ns]

## Filter out [removed] and [deleted] posts

In [9]:
reddit_df['selftext'].value_counts()[0:2]

[removed]    11885
[deleted]      162
Name: selftext, dtype: int64

In [10]:
reddit_df = reddit_df[(reddit_df['selftext']!= '[removed]') & (reddit_df['selftext']!= '[deleted]') ] 

In [11]:
reddit_df.shape

(26448, 6)

In [12]:
reddit_df['subreddit'].value_counts()

stocks         15984
StockMarket    10464
Name: subreddit, dtype: int64

### Create a label column by setting subreddit stocks to 0 and subreddit stockmarket to 1

In [13]:
# Create label column
reddit_df['subreddit'] = reddit_df['subreddit'].map({"StockMarket":1 , 'stocks':0})

In [14]:
reddit_df['subreddit'].value_counts()

0    15984
1    10464
Name: subreddit, dtype: int64

#### Select reddits with score of greater than 0. These will be more representative of good posts.

In [15]:
reddit_df = reddit_df[reddit_df['score'] >0]

In [16]:
reddit_df['all_text'] = reddit_df['selftext'] + reddit_df['title']

#### apply regex on all_text

In [17]:
reddit_df['all_text'] = reddit_df['all_text'].map(lambda x: re.sub(r'\W+', ' ', x))

### Remove the oldest Stocks records to make the sets equal. There are many more posts on the sotck_market page. 2019 is the 2nd best feature for Stocks. Reducing that.

In [18]:
stocks = reddit_df[reddit_df['subreddit'] ==0]
stock_market = reddit_df[reddit_df['subreddit'] ==1]

In [19]:
print(len(stocks))
print(len(stock_market))

13709
10018


In [20]:
stocks = reddit_df[reddit_df['subreddit'] ==0]
stock_market = reddit_df[reddit_df['subreddit'] ==1]
stocks.reset_index(inplace=True)
stock_market.reset_index(inplace=True)
stocks.drop(stocks.tail(3691).index,inplace=True)
assert len(stocks) == len(stock_market)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
print(stocks['datetime'].head(1))
print(stocks['datetime'].tail(1))
print(stock_market['datetime'].head(1))
print(stock_market['datetime'].tail(1))

0   2020-02-17 16:44:21
Name: datetime, dtype: datetime64[ns]
10017   2019-06-05 16:51:20
Name: datetime, dtype: datetime64[ns]
0   2021-01-02 00:14:04
Name: datetime, dtype: datetime64[ns]
10017   2020-01-15 05:09:44
Name: datetime, dtype: datetime64[ns]


In [22]:
all_dfs = [stocks, stock_market]
model_df = pd.concat(all_dfs).reset_index(drop=True)

#### Combine Title and Selftext

In [23]:
model_df['all_text'] = model_df['title'] + model_df['selftext']

### Sentiment Analysis as a feature

#### Words added to sentiment vocabulary from expert knowledge and logistic regression coefficient importance. These words were added to customize our sentiment analysis  

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

new_words = {
    'buy': 4.0,
    'vaccine': 4.0,
    'stimulus': 1.0,
    'bull': 4.0,
    'bullish': 4.0,
    'bear': -4.0,
    'bearish': -4.0,
    'sell': -4.0,
    'unstable': -4.0,
    'covid': -4.0,
    'pandemic': -4.0,
    'Wuhan':-10,
    'ignorant': -4,
    'lockdown': -4,
    'covid19': -4
}

SIA = SentimentIntensityAnalyzer()

SIA.lexicon.update(new_words)

In [25]:
# instantiate the sentiment analyzer

# Write a function to get the compound sentiment scores for a post
def get_compound_sentiment(post):
    return SIA.polarity_scores(post)['compound']

In [26]:
model_df['sentiment'] = model_df['all_text'].apply(get_compound_sentiment)

#### Remove some popular words that add no meaning or won't be applicable in the future

In [27]:
stop_word = ['https', 'com', '2019', 'amp']

for word in stop_word:
    model_df['all_text'] = model_df['all_text'].str.replace(word, '')

In [28]:
model_df['all_text'] = model_df['all_text'].str.replace('stocks', 'stock')

### Write model file to csv

In [29]:
model_df.head(3)

Unnamed: 0,index,subreddit,title,selftext,score,created_utc,datetime,all_text,sentiment
0,0,0,Taxes if I don't have any income?,So I'm a college student and I'm fooling aroun...,1,1581957861,2020-02-17 16:44:21,Taxes if I don't have any ine?So I'm a college...,-0.168
1,1,0,"Those of you that keep fairly long watchlists,...",Now that I'm learning to use screeners and che...,1,1581957764,2020-02-17 16:42:44,"Those of you that keep fairly long watchlists,...",0.8816
2,2,0,Covered Calls (Vanguard),I'm looking for some feedback from people that...,1,1581956404,2020-02-17 16:20:04,Covered Calls (Vanguard)I'm looking for some f...,0.765


In [30]:
model_df.to_csv('../data/model_df.csv', index = False)