In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the directory three levels above the working directory to the import path.
# NOTE(review): presumably this is the project root so that `config`, `util`
# and `nlp` resolve — confirm; the relative path depends on where the kernel starts.
sys.path.append('../../../')

In [57]:
import os
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from config import STOCKTWITS_TICKER_LIST
from util.file_util import StockTwitsFileReader
from nlp.twokenize import normalizeTextForTagger, tokenize
from nlp.text_processor import (
    token_is_cash_tag, token_is_punct, token_matches_ticker, twit_tokenize
)

In [8]:
# Seed NumPy's legacy global RNG so the random sample of twits drawn
# further down is repeatable from a fresh kernel.
np.random.seed(seed=828)

In [4]:
# Project file reader; in this notebook it is only used for its `root_dir`
# attribute, which anchors the data directory path.
stock_twits_reader = StockTwitsFileReader()

In [5]:
# Directory holding the processed text-analysis files, located under the
# reader's root directory.
data_dir = os.path.join(
    stock_twits_reader.root_dir, 'processed/text_analysis'
)

In [6]:
# Load the pickled training twits into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code — only load files
# produced by this project's own pipeline.
train_pickle_path = os.path.join(data_dir, 'train_twits.pkl')
twit_train_df = pd.read_pickle(train_pickle_path)

In [7]:
# Sanity check: (rows, columns) of the loaded training frame.
twit_train_df.shape

(314276, 7)

In [22]:
# Extract the 'body' column (the twit text) as an array; this is the
# corpus that the sample below is drawn from.
corpus = twit_train_df['body'].values

In [23]:
# Draw 10 twits at random from the corpus using the seeded global RNG.
# np.random.choice samples with replacement by default, so repeats are possible.
sample_twits = np.random.choice(corpus, size=10)

In [24]:
# Show the raw sampled twits (note the HTML entities such as &#39; and &amp;).
sample_twits

array(['$TSLA Biden is going down for teaming up with Chanos. Reelection Secured.',
       '$TSLA I&#39;m somewhat disappointed this happened so quickly because I was going to load up on calls.  At least Long position  is still intact.',
       '$TSLA $SWIR Li-Ion battery &amp; Internet of Things revolution will change the world. Fundamentally.',
       '$AEO #Swinghomework $STUDY critical resistance here we either break by EOD or follow that yellow squiggly arrow I drew. Called @$24.5💰💵',
       '$BYND chipotle sells for price.sales of 4.  Bynd price.sales of 69.  It tells you the stock price is not sustainable.',
       '$BYND you tree-hugging idiots need to sell asap and save your money. be realistic for god&#39;s sake.',
       '$TSLA As we have said...the REAL selling has not begun.',
       '$MSFT huge money— Friday $125 calls',
       '$BABA China deal rumor? $AAPL $SPY $MSFT $FB', '$BYND DT are out'],
      dtype=object)

## Try Twokenize

In [25]:
# Pass every sampled twit through twokenize's normalizer and display the
# result; in the output below the HTML entities come back as plain characters.
normalized_twits = list(map(normalizeTextForTagger, sample_twits))
normalized_twits

['$TSLA Biden is going down for teaming up with Chanos. Reelection Secured.',
 "$TSLA I'm somewhat disappointed this happened so quickly because I was going to load up on calls.  At least Long position  is still intact.",
 '$TSLA $SWIR Li-Ion battery & Internet of Things revolution will change the world. Fundamentally.',
 '$AEO #Swinghomework $STUDY critical resistance here we either break by EOD or follow that yellow squiggly arrow I drew. Called @$24.5💰💵',
 '$BYND chipotle sells for price.sales of 4.  Bynd price.sales of 69.  It tells you the stock price is not sustainable.',
 "$BYND you tree-hugging idiots need to sell asap and save your money. be realistic for god's sake.",
 '$TSLA As we have said...the REAL selling has not begun.',
 '$MSFT huge money— Friday $125 calls',
 '$BABA China deal rumor? $AAPL $SPY $MSFT $FB',
 '$BYND DT are out']

In [31]:
# Take the last token of the fourth sampled twit (an emoji run in the
# displayed sample) to inspect how it round-trips through encode/decode.
# The twit is tokenized on the spot rather than read from `tokenized_twits`,
# because that list is only assigned in a later cell and would raise
# NameError on a fresh top-to-bottom run.
temp = tokenize(normalized_twits[3])[-1]

In [41]:
# Encode the token to bytes; UTF-8 is str.encode's default, spelled out here.
a = temp.encode('utf-8')

In [42]:
# Decode the bytes back to text to confirm the emoji survive the round trip.
str(a, encoding='utf-8')

'💰💵'

In [44]:
# Tokenize each normalized twit with twokenize and display the token lists.
tokenized_twits = list(map(tokenize, normalized_twits))
tokenized_twits

[['$TSLA',
  'Biden',
  'is',
  'going',
  'down',
  'for',
  'teaming',
  'up',
  'with',
  'Chanos',
  '.',
  'Reelection',
  'Secured',
  '.'],
 ['$TSLA',
  "I'm",
  'somewhat',
  'disappointed',
  'this',
  'happened',
  'so',
  'quickly',
  'because',
  'I',
  'was',
  'going',
  'to',
  'load',
  'up',
  'on',
  'calls',
  '.',
  'At',
  'least',
  'Long',
  'position',
  'is',
  'still',
  'intact',
  '.'],
 ['$TSLA',
  '$SWIR',
  'Li-Ion',
  'battery',
  '&',
  'Internet',
  'of',
  'Things',
  'revolution',
  'will',
  'change',
  'the',
  'world',
  '.',
  'Fundamentally',
  '.'],
 ['$AEO',
  '#Swinghomework',
  '$STUDY',
  'critical',
  'resistance',
  'here',
  'we',
  'either',
  'break',
  'by',
  'EOD',
  'or',
  'follow',
  'that',
  'yellow',
  'squiggly',
  'arrow',
  'I',
  'drew',
  '.',
  'Called',
  '@',
  '$24.5',
  '💰💵'],
 ['$BYND',
  'chipotle',
  'sells',
  'for',
  'price',
  '.',
  'sales',
  'of',
  '4',
  '.',
  'Bynd',
  'price',
  '.',
  'sales',
  'of',

In [51]:
# Build one record per token across all tokenized twits, tagging each token
# with the results of the punctuation and cash-tag predicates.
token_infos = [
    {
        'token': token,
        'is_punct': token_is_punct(token),
        'is_cash_tag': token_is_cash_tag(token),
    }
    for twit_tokens in tokenized_twits
    for token in twit_tokens
]

In [53]:
# Tabulate the per-token records: one row per token, with columns
# 'token', 'is_punct' and 'is_cash_tag'.
temp_df = pd.DataFrame(token_infos)

In [68]:
# Print each raw sample twit followed by its twit_tokenize output (no ticker
# filter, normalization on), with blank lines separating the entries.
for raw_twit in sample_twits:
    print(raw_twit)
    tokenized_twit = twit_tokenize(raw_twit, ticker=None, normalize=True)
    # Same stdout as printing the tokens and then a lone '\n'.
    print(tokenized_twit, '\n', sep='\n')
    

$TSLA Biden is going down for teaming up with Chanos. Reelection Secured.
['biden', 'going', 'down', 'teaming', 'up', 'chanos', 'reelection', 'secured']


$TSLA I&#39;m somewhat disappointed this happened so quickly because I was going to load up on calls.  At least Long position  is still intact.
["i'm", 'somewhat', 'disappointed', 'happened', 'quickly', 'going', 'load', 'up', 'on', 'calls', 'least', 'long', 'position', 'still', 'intact']


$TSLA $SWIR Li-Ion battery &amp; Internet of Things revolution will change the world. Fundamentally.
['li-ion', 'battery', 'internet', 'things', 'revolution', 'change', 'world', 'fundamentally']


$AEO #Swinghomework $STUDY critical resistance here we either break by EOD or follow that yellow squiggly arrow I drew. Called @$24.5💰💵
['#swinghomework', 'critical', 'resistance', 'either', 'break', 'eod', 'follow', 'yellow', 'squiggly', 'arrow', 'drew', 'called', '$24.5', '💰💵']


$BYND chipotle sells for price.sales of 4.  Bynd price.sales of 69.  It te

# Tokenizer - what is `&#39;s`? (It is the HTML-escaped apostrophe, i.e. `'s`.)

### TODO: Remove Binaries