### PreProcessing

##### Importing Libraries and Data

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import reticker
import string

In [4]:
df = pd.read_csv('./datasets/data.csv').drop(columns='Unnamed: 0')
df.head(3)

Unnamed: 0,subreddit,title,selftext,created_utc,all_text,post_length
0,wallstreetbets,"Made 45k on QQQ puts in 3 days, will finally p...",,1642196072,"Made 45k on QQQ puts in 3 days, will finally p...",16
1,wallstreetbets,There is a mining program giving unprecedented...,,1642195904,There is a mining program giving unprecedented...,26
2,wallstreetbets,TUESDAY = $ B B I G 🚀🚀🚀LFG 🚀🚀🚀,[removed],1642195841,TUESDAY = $ B B I G 🚀🚀🚀LFG 🚀🚀🚀,9


In [3]:
sample_text = df.all_text[8]
sample_text

"What will JPM close at on Jan 17? JPM was the #6 talked about company on WSB on Jan 14th\n\nSkipping #4 / #5 because they've already had predictions\n\n[View Poll](https://www.reddit.com/poll/s4300r)"

### Preprocessing Text
##### Text Data Anomalies: url's, digits, newline characters, extraneous symbols, stock tickers


#### Getting Stock Tickers 
Using the Reticker Library found here (https://pypi.org/project/reticker/).

In [5]:
ticker = reticker.TickerExtractor()

In [6]:
ticker.extract(sample_text)

['JPM']

In [7]:
' '.join(ticker.extract(sample_text))

'JPM'

In [8]:
df['ticker_text'] = [' '.join(ticker.extract(text)) for text in df.all_text]


In [11]:
df['ticker_counts'] = [len(s.split()) for s in df['ticker_text']]

In [13]:
tickers_df = df.drop(columns=['title', 'selftext', 'created_utc', 'all_text', 'post_length'])
tickers_df.head(3)

Unnamed: 0,subreddit,ticker_text,ticker_counts
0,wallstreetbets,QQQ,1
1,wallstreetbets,,0
2,wallstreetbets,LFG,1


In [14]:
text_df = df.drop(columns=['ticker_text', 'ticker_counts', 'created_utc', 'post_length'])
text_df.head(3)

Unnamed: 0,subreddit,title,selftext,all_text
0,wallstreetbets,"Made 45k on QQQ puts in 3 days, will finally p...",,"Made 45k on QQQ puts in 3 days, will finally p..."
1,wallstreetbets,There is a mining program giving unprecedented...,,There is a mining program giving unprecedented...
2,wallstreetbets,TUESDAY = $ B B I G 🚀🚀🚀LFG 🚀🚀🚀,[removed],TUESDAY = $ B B I G 🚀🚀🚀LFG 🚀🚀🚀


In [97]:
tickers_df.loc[tickers_df.subreddit == 'stocks']

Unnamed: 0,subreddit,ticker_text,ticker_counts
9394,stocks,,0
9395,stocks,TQQ,1
9396,stocks,,0
9397,stocks,DTE ET,2
9398,stocks,,0
...,...,...,...
18786,stocks,,0
18787,stocks,ZYXI,1
18788,stocks,,0
18789,stocks,,0


In [23]:
#tickers_df.loc[tickers_df['ticker_counts']!=0]

In [24]:
#tickers_df.subreddit.value_counts()

#### Removing urls from all_text:

In [37]:
def url_remover(text):
    text = re.sub(r'http\S+', '', text)
    return text

In [38]:
sample_text

"What will JPM close at on Jan 17? JPM was the #6 talked about company on WSB on Jan 14th\n\nSkipping #4 / #5 because they've already had predictions\n\n[View Poll](https://www.reddit.com/poll/s4300r)"

In [39]:
url_remover(sample_text)

"What will JPM close at on Jan 17? JPM was the #6 talked about company on WSB on Jan 14th\n\nSkipping #4 / #5 because they've already had predictions\n\n[View Poll]("

In [41]:
text_df.all_text = [url_remover(each) for each in text_df.all_text]

#### Removing newline characters from text (ex: \n\n, \n\n*****\n\n)

In [55]:
def newline_remover(text):
    text = text.replace("\n", "")
    return text

In [56]:
text_df.all_text = [newline_remover(each) for each in text_df.all_text]

#### Removing symbols and punctuation

In [58]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [66]:
def punct_remover(text):
    for each in string.punctuation:
        text=text.replace(each," ")
    return text

In [70]:
sample = text_df.all_text[8]
sample

"What will JPM close at on Jan 17? JPM was the #6 talked about company on WSB on Jan 14thSkipping #4 / #5 because they've already had predictions[View Poll]("

In [71]:
punct_remover(sample)

'What will JPM close at on Jan 17  JPM was the  6 talked about company on WSB on Jan 14thSkipping  4    5 because they ve already had predictions View Poll  '

In [77]:
text_df.all_text = [punct_remover(each) for each in text_df.all_text]

#### Removing Digits and --th's and --k's

In [78]:
def digit_remover(text):
    text = re.sub(r'\d+th', '', text)
    text = re.sub(r'\d+k)
    text = re.sub(r'\d+', '', text)
    return text


In [81]:
text_df.all_text = [digit_remover(each) for each in text_df.all_text]

#### Removing Extra WhiteSpace

In [83]:
def no_extra_spaces(text):
    text = text.split()
    text = ' '.join(text)
    return text

In [84]:
sample = text_df.all_text[8]
sample

'What will JPM close at on Jan   JPM was the   talked about company on WSB on Jan Skipping       because they ve already had predictions View Poll  '

In [85]:
no_extra_spaces(sample)

'What will JPM close at on Jan JPM was the talked about company on WSB on Jan Skipping because they ve already had predictions View Poll'

In [87]:
text_df.all_text = [no_extra_spaces(each) for each in text_df.all_text]

In [88]:
text_df.head()

Unnamed: 0,subreddit,title,selftext,all_text
0,wallstreetbets,"Made 45k on QQQ puts in 3 days, will finally p...",,Made k on QQQ puts in days will finally pay of...
1,wallstreetbets,There is a mining program giving unprecedented...,,There is a mining program giving unprecedented...
2,wallstreetbets,TUESDAY = $ B B I G 🚀🚀🚀LFG 🚀🚀🚀,[removed],TUESDAY B B I G 🚀🚀🚀LFG 🚀🚀🚀
3,wallstreetbets,PTN almost at the double Botham 🥳🍾💵💵💵💵💵💵,,PTN almost at the double Botham 🥳🍾💵💵💵💵💵💵
4,wallstreetbets,I love weeklies. Fuck AMC,,I love weeklies Fuck AMC


In [90]:
tickers_df.to_csv('./datasets/tickers.csv')

In [91]:
text_df.to_csv('./datasets/text.csv')

In [94]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18791 entries, 0 to 18790
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   subreddit  18791 non-null  object
 1   title      18791 non-null  object
 2   selftext   13268 non-null  object
 3   all_text   18791 non-null  object
dtypes: object(4)
memory usage: 587.3+ KB
