# Imports

In [1]:
import pickle

from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import pipeline
import torch
import pandas as pd
import emoji
import re
from os.path import exists

from torch.utils.data import DataLoader,SequentialSampler,TensorDataset
import time
import numpy as np

import spacy
import nltk
from nltk.corpus import stopwords
from sentence_transformers import *

# Fetch the NLTK stop-word corpus (the captured output shows it is skipped when already up to date).
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the imported nltk.corpus reader to the English stop-word collection.
stopwords = nltk.corpus.stopwords.words('english')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from scipy.cluster import hierarchy as sch

from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

[nltk_data] Downloading package stopwords to /home/andrei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load tweets

In [2]:
# Input CSV locations, relative to the notebook's working directory.
# Tweet-level data (read by init_datasets; provides tweet_id, post_date, body, ...).
tweets_path = "Datasets/kaggle/Tweet.csv"
# Tweet-id -> ticker mapping (read by init_datasets; provides tweet_id, ticker_symbol).
id_to_ticker_path = "Datasets/kaggle/Company_Tweet.csv"
# Ticker -> company name table (presumably; its contents are not used in the visible cells).
ticker_to_name_path = "Datasets/kaggle/Company.csv"

In [3]:
def init_datasets(tweets_csv=None, tickers_csv=None):
    """Load the raw tweet data and derive the ids of tweets that mention one ticker.

    Parameters
    ----------
    tweets_csv : str or path-like, optional
        CSV with the tweets; defaults to the module-level ``tweets_path``.
    tickers_csv : str or path-like, optional
        CSV mapping ``tweet_id`` to ticker; defaults to the module-level
        ``id_to_ticker_path``.

    Returns
    -------
    tuple
        ``(tweets_df, single_stock, tickers_df)`` where ``tweets_df`` has
        ``post_date`` converted from Unix seconds to datetimes, ``single_stock``
        is a Series of the ``tweet_id`` values that occur exactly once in
        ``tickers_df``, and ``tickers_df`` is the unmodified mapping table.

    Notes
    -----
    The company-name CSV (``ticker_to_name_path``) is intentionally not read:
    its contents were never used or returned, so loading it only cost I/O and
    made the function fail when that file was missing.
    """
    # Resolve defaults at call time so later changes to the path globals are honoured.
    tweets_csv = tweets_path if tweets_csv is None else tweets_csv
    tickers_csv = id_to_ticker_path if tickers_csv is None else tickers_csv

    tweets_df = pd.read_csv(tweets_csv)
    tickers_df = pd.read_csv(tickers_csv)

    # post_date is stored as seconds since the Unix epoch.
    tweets_df['post_date'] = pd.to_datetime(tweets_df['post_date'], unit='s', origin='unix')

    # A tweet id that appears once in the mapping is tagged with exactly one ticker.
    ticker_counts = tickers_df['tweet_id'].value_counts()
    single_stock = pd.Series(ticker_counts[ticker_counts == 1].index)

    return tweets_df, single_stock, tickers_df

In [4]:
# Load the raw datasets into memory. This runs unconditionally, before the
# cache check in the following cell.
tweets_df, single_stock, tickers_df = init_datasets()

In [5]:
# Restore cached inputs when available; otherwise build them once and cache them.
if exists("Datasets/results/preprocessing/final.pkl"):
    # The fully preprocessed tweets are cached, so only the ticker mapping is restored here.
    tickers_df = pd.read_pickle("Datasets/results/preprocessing/tickers_df.pkl")
elif not (exists("Datasets/results/preprocessing/tweets_df.pkl") and exists("Datasets/results/preprocessing/single_stock.pkl")):
    # No usable cache: load from the CSVs and persist all three objects.
    tweets_df, single_stock, tickers_df = init_datasets()
    tweets_df.to_pickle("Datasets/results/preprocessing/tweets_df.pkl")
    single_stock.to_pickle("Datasets/results/preprocessing/single_stock.pkl")
    tickers_df.to_pickle("Datasets/results/preprocessing/tickers_df.pkl")
else:
    # Intermediate cache: reuse the parsed tweets and the single-ticker tweet ids.
    tweets_df = pd.read_pickle("Datasets/results/preprocessing/tweets_df.pkl")
    single_stock = pd.read_pickle("Datasets/results/preprocessing/single_stock.pkl")

# Preprocessing

### Remove tweets with more than 1 stock

In [6]:
if exists("Datasets/results/preprocessing/final.pkl") or exists("Datasets/results/preprocessing/preprocessed.pkl"):
    # Later stages are cached, so the filter is skipped; this figure is the
    # hard-coded result of an earlier full run.
    print("421101 tweets were removed - 11.33 %")
else:
    # Keep only tweets whose id maps to exactly one ticker.
    clean_tweets = tweets_df[tweets_df.tweet_id.isin(single_stock)].reset_index(drop=True)
    # Adjacent f-strings instead of a backslash continuation: the continuation
    # used to embed the next line's indentation in the printed message.
    print(
        f"{tweets_df.shape[0] - clean_tweets.shape[0]} tweets were removed"
        f" - {((1 - (clean_tweets.shape[0] / tweets_df.shape[0])) * 100):.2f} %"
    )

421101 tweets were removed - 11.33 %


### Remove intraday duplicates by complete matches

In [7]:
if exists("Datasets/results/preprocessing/final.pkl") or exists("Datasets/results/preprocessing/preprocessed.pkl"):
    # Later stages are cached, so the step is skipped; this figure is the
    # hard-coded result of an earlier full run.
    print("160160 tweets were removed - 4.86 %")
else:
    a = clean_tweets.shape[0]  # rows before de-duplication
    # Drop tweets whose body is identical to another tweet posted on the same calendar day.
    clean_tweets = (
        clean_tweets
        .assign(day=clean_tweets['post_date'].dt.date)
        .drop_duplicates(subset=(['day', 'body']))
        .drop('day', axis=1)
        .reset_index(drop=True)
    )
    b = clean_tweets.shape[0]  # rows after de-duplication
    # Share removed is relative to the starting count `a`; the previous
    # `1 - a/b` produced a negative percentage because a >= b.
    print(f"{a-b} tweets were removed - {((a - b) / a * 100 if a else 0.0):.2f} %")

160160 tweets were removed - 4.86 %


### Text cleaning

In [8]:
def clean_text(x):
    """Normalise a raw tweet body into lower-case ASCII text.

    Steps, in order: strip URLs and bare domains, cashtags and @mentions;
    lower-case; drop non-ASCII characters; blank out digits and selected
    punctuation; collapse whitespace; remove isolated punctuation symbols.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Explicit http(s) URLs go first so the bare-domain rule below does not see them.
    x = re.sub(r'http[s]?://\S+', ' ', x)
    # Bare domains such as "example.com/path". The original character class
    # [?<=^|\s] required one preceding character, so a domain at the very start
    # of the text was never matched; `^` is now accepted as an alternative.
    x = re.sub(r'(?:^|[?<=^|\s])(\w*-?\w+\.[a-z]{2,}\S*)', ' ', x)
    x = re.sub(r'\$[A-Za-z]+', ' ', x)  # remove tickers
    # Mentions: handles may contain digits and underscores, which the previous
    # letters-only pattern left behind as stray fragments.
    x = re.sub(r'@[A-Za-z0-9_]+', ' ', x)
    x = x.lower()  # lowercase everything
    x = x.encode('ascii', 'ignore').decode()  # remove non-ASCII characters
    x = re.sub(r'\d', ' ', x)
    # NOTE: `+-/` inside the class is a range, so ',', '-' and '.' are removed too.
    x = re.sub(r'["“”\*+-/:;<=>\\^_`{|}~]', ' ', x)  # remove useless punctuation
    x = re.sub(r'\s{2,}', ' ', x)
    # Drop a lone punctuation symbol surrounded by whitespace. The trailing
    # whitespace is kept via a lookahead so the neighbouring words stay apart
    # (replacing the whole " % " with '' used to glue them together).
    x = re.sub(r'\s[^\w\s](?=\s)', '', x)
    return x.strip()

In [9]:
# Run the text cleaner only when no later-stage cache is available.
if not (exists("Datasets/results/preprocessing/final.pkl") or exists("Datasets/results/preprocessing/preprocessed.pkl")):
    clean_tweets['preprocessed'] = clean_tweets['body'].apply(clean_text)

### Remove intraday duplicates after preprocessing

In [10]:
if not (exists("Datasets/results/preprocessing/final.pkl") or exists("Datasets/results/preprocessing/preprocessed.pkl")):
    a = clean_tweets.shape[0]  # rows before de-duplication
    # Same-day tweets that become identical after cleaning are treated as duplicates.
    with_day = clean_tweets.assign(day=clean_tweets['post_date'].dt.date)
    deduplicated = with_day.drop_duplicates(subset=(['day', 'preprocessed']))
    clean_tweets = deduplicated.drop('day', axis=1).reset_index(drop=True)
    b = clean_tweets.shape[0]  # rows after de-duplication
    print(f"{a-b} tweets were removed")
else:
    # Cached run: report the figure recorded from an earlier full run.
    print("359478 tweets were removed")

359478 tweets were removed


### Combine tweet interactions into 1 feature

In [11]:
if not (exists("Datasets/results/preprocessing/final.pkl") or exists("Datasets/results/preprocessing/preprocessed.pkl")):
    # One engagement figure per tweet: comments + retweets + likes.
    clean_tweets['interactions'] = (
        clean_tweets['comment_num'] + clean_tweets['retweet_num'] + clean_tweets['like_num']
    )
    # Drop the raw counters and the author column; names that are absent are skipped.
    clean_tweets = clean_tweets.drop(
        columns=["comment_num", "retweet_num", "like_num", "writer"],
        errors="ignore",
    )
    clean_tweets.to_pickle("Datasets/results/preprocessing/preprocessed.pkl")

# Sentiment analysis

In [13]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint; both are the default arguments of batch_checkSenti.
tokenizer_sentiment = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
model_sentiment = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')

In [14]:
def batch_checkSenti(texts, model=model_sentiment, tokenizer=tokenizer_sentiment,
                     return_logits=False, batch_size=128, max_length=64):
    """Classify a collection of texts with a sequence-classification model.

    Parameters
    ----------
    texts : iterable of str
        Texts to classify; iterated once, in order.
    model : optional
        Classification model; defaults to ``model_sentiment``. It is moved to
        ``device`` and switched to evaluation mode.
    tokenizer : optional
        Tokenizer matching ``model``; defaults to ``tokenizer_sentiment``.
    return_logits : bool, default False
        When True, also return the raw logits.
    batch_size : int, default 128
        Number of texts per forward pass.
    max_length : int, default 64
        Token length every text is truncated/padded to.

    Returns
    -------
    numpy.ndarray or tuple
        The argmax class index per text, or ``(index, logits)`` when
        ``return_logits`` is True. Empty input yields empty arrays instead of
        raising inside ``torch.cat``.
    """
    start = time.time()

    # Materialise once so emptiness can be checked for any iterable.
    texts = list(texts)
    if not texts:
        num_labels = getattr(getattr(model, "config", None), "num_labels", 0)
        index = np.empty(0, dtype=np.int64)
        predictions = np.empty((0, num_labels), dtype=np.float32)
        return (index, predictions) if return_logits else index

    input_ids = []
    attention_masks = []
    for text in texts:
        # Every text is padded/truncated to the same length so the tensors can be concatenated.
        encoded_data = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_data['input_ids'])
        attention_masks.append(encoded_data['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    model.to(device)
    # Inference only: make sure dropout and similar layers are disabled even if
    # the caller passed a model that was left in training mode.
    model.eval()

    print(f'It takes {time.time() - start}s to tokenize')
    checkpointtime = time.time()

    testset = TensorDataset(input_ids, attention_masks)
    # Sequential sampling keeps predictions aligned with the input order.
    test_dl = DataLoader(testset, sampler=SequentialSampler(testset), batch_size=batch_size)

    predictions = []
    for batch in test_dl:
        batch_ids, batch_mask = (b.to(device) for b in batch)

        with torch.no_grad():
            outputs = model(input_ids=batch_ids, attention_mask=batch_mask)

        # The first element of the model output holds the logits.
        predictions.append(outputs[0].detach().cpu().numpy())

    predictions = np.concatenate(predictions, axis=0)
    index = predictions.argmax(axis=1)
    print(f'It takes {time.time() - checkpointtime}s to do predictions')

    return (index, predictions) if return_logits else index

In [15]:
# Nothing to do when the final dataset is cached; it is loaded by a later cell.
if not exists("Datasets/results/preprocessing/final.pkl"):
    if exists("Datasets/results/preprocessing/sentiment_finetuned.pkl"):
        # Reuse previously computed sentiment scores.
        clean_tweets = pd.read_pickle("Datasets/results/preprocessing/sentiment_finetuned.pkl")
    else:
        # Score the preprocessed tweets and attach the result by row index.
        clean_tweets = pd.read_pickle("Datasets/results/preprocessing/preprocessed.pkl")
        clean_tweets_sentiment = batch_checkSenti(clean_tweets['preprocessed'])
        sentiment_column = pd.Series(clean_tweets_sentiment, name='sentiment_score')
        clean_tweets = clean_tweets.join(sentiment_column)
        clean_tweets.to_pickle("Datasets/results/preprocessing/sentiment_finetuned.pkl")

### Compare the fine-tuned sentiment model with FinBERT

In [16]:
def comparison():
    """Return tweets labelled 'Positive' earlier but scored 0 by the current model.

    Reads the earlier results from ``tweets_total.pkl`` (presumably the FinBERT
    run, per the section heading) and left-joins the current
    ``sentiment_score`` from the global ``clean_tweets`` on ``tweet_id``.

    Returns
    -------
    pandas.DataFrame
        Rows where ``label == 'Positive'`` and ``sentiment_score == 0``.
    """
    previous_results = pd.read_pickle("Datasets/results/preprocessing/tweets_total.pkl")
    current_scores = clean_tweets[['tweet_id', 'sentiment_score']]
    merged = previous_results.merge(current_scores, on='tweet_id', how='left')

    disagreement = (merged['label'] == 'Positive') & (merged['sentiment_score'] == 0)
    return merged[disagreement]

In [None]:
# diff = comparison()
# diff[100:120]

In [None]:
# diff.loc[6284].body

# Lemmatization

In [17]:
def lemmatize_pipe(doc):
    """Return the lower-cased lemmas of a document's alphabetic, non-stop-word tokens.

    ``doc`` is any iterable of tokens exposing ``is_alpha``, ``text`` and
    ``lemma_``. Stop words are looked up in the global ``stopwords``.
    """
    lemmas = []
    for token in doc:
        # Skip numbers, punctuation and anything else that is not purely alphabetic.
        if not token.is_alpha:
            continue
        if token.text.lower() in stopwords:
            continue
        lemmas.append(str(token.lemma_).lower())
    return lemmas

def preprocess_pipe(texts):
    """Lemmatize every text and return one lemma list per input text.

    Texts are streamed through the global ``nlp`` pipeline in batches of 512
    with ``n_process=-1``.
    """
    docs = nlp.pipe(texts, batch_size=512, n_process=-1)
    return list(map(lemmatize_pipe, docs))

In [18]:
if exists("Datasets/results/preprocessing/final.pkl"):
    # Reuse the cached, lemmatized dataset.
    clean_tweets = pd.read_pickle("Datasets/results/preprocessing/final.pkl")
else:
    # Remove every character that is not a space, a word character or '+'.
    # regex=True is required: without it recent pandas versions treat the
    # pattern as a literal string and nothing is removed. The raw string avoids
    # the invalid "\w" escape.
    clean_tweets['preprocessed'] = clean_tweets['preprocessed'].str.replace(r'[^ \w+]', '', regex=True)
    nlp = spacy.load("en_core_web_sm")
    clean_tweets['lemmatized'] = preprocess_pipe(clean_tweets['preprocessed'])
    clean_tweets.to_pickle("Datasets/results/preprocessing/final.pkl")

In [19]:
# Join each lemma list back into one space-separated string.
clean_tweets['cleaned'] = clean_tweets['lemmatized'].map(' '.join)
# Keep the first occurrence of each distinct cleaned text and renumber the rows.
clean_tweets = clean_tweets.drop_duplicates(subset='cleaned').reset_index(drop=True)

### bert-large-nli-mean-tokens

In [20]:
# model_sentence = SentenceTransformer('bert-large-nli-mean-tokens')

In [21]:
# if exists("Datasets/results/preprocessing/embeddings.pkl"):
#     with open('Datasets/results/preprocessing/embeddings.pkl', "rb") as file:
#         embedding = pickle.load(file)
# else:
#     embeddings = model_sentence.encode(clean_tweets['cleaned'])
    
#     with open('Datasets/results/preprocessing/embeddings.pkl', "wb") as file:
#         pickle.dump(embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

#     with open('Datasets/results/preprocessing/embeddings_raw.pkl', "wb") as file:
#         pickle.dump(clean_tweets['cleaned'], file, protocol=pickle.HIGHEST_PROTOCOL)

### MiniLM-L6-v2

In [22]:
# Sentence-embedding model used to encode the cleaned tweets in the next cell.
model_sentence_2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [23]:
if not exists("Datasets/results/preprocessing/embeddings_2.pkl"):
    # No cache yet: encode the cleaned texts, then persist both the embeddings
    # and the texts they were computed from.
    embeddings = model_sentence_2.encode(clean_tweets['cleaned'])

    with open('Datasets/results/preprocessing/embeddings_2.pkl', "wb") as file:
        pickle.dump(embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

    with open('Datasets/results/preprocessing/embeddings_raw.pkl', "wb") as file:
        pickle.dump(clean_tweets['cleaned'], file, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # Cached embeddings are reused as-is; they are not checked against the current clean_tweets.
    with open('Datasets/results/preprocessing/embeddings_2.pkl', "rb") as file:
        embeddings = pickle.load(file)

# Clustering

In [24]:
if not exists("Datasets/results/preprocessing/appl_tweets.pkl"):
    # Build and cache the subset of tweets tagged with the AAPL ticker.
    ids = tickers_df.loc[tickers_df['ticker_symbol'] == 'AAPL', 'tweet_id']
    tweets = clean_tweets.loc[clean_tweets['tweet_id'].isin(ids)]
    tweets.to_pickle("Datasets/results/preprocessing/appl_tweets.pkl")
else:
    # The subset is read from the local cache.
    tweets = pd.read_pickle("Datasets/results/preprocessing/appl_tweets.pkl")
    print("Downloaded")

Downloaded


In [25]:
if not exists("Datasets/results/preprocessing/msft_tweets.pkl"):
    # Build and cache the subset of tweets tagged with the MSFT ticker.
    ids = tickers_df.loc[tickers_df['ticker_symbol'] == 'MSFT', 'tweet_id']
    tweets = clean_tweets.loc[clean_tweets['tweet_id'].isin(ids)]
    tweets.to_pickle("Datasets/results/preprocessing/msft_tweets.pkl")
else:
    # The subset is read from the local cache.
    tweets = pd.read_pickle("Datasets/results/preprocessing/msft_tweets.pkl")
    print("Downloaded")

Downloaded


In [26]:
if not exists("Datasets/results/preprocessing/tsla_tweets.pkl"):
    # Build and cache the subset of tweets tagged with the TSLA ticker.
    ids = tickers_df.loc[tickers_df['ticker_symbol'] == 'TSLA', 'tweet_id']
    tweets = clean_tweets.loc[clean_tweets['tweet_id'].isin(ids)]
    tweets.to_pickle("Datasets/results/preprocessing/tsla_tweets.pkl")
else:
    # The subset is read from the local cache.
    tweets = pd.read_pickle("Datasets/results/preprocessing/tsla_tweets.pkl")
    print("Downloaded")

Downloaded


In [27]:
if not exists("Datasets/results/preprocessing/amzn_tweets.pkl"):
    # Build and cache the subset of tweets tagged with the AMZN ticker.
    ids = tickers_df.loc[tickers_df['ticker_symbol'] == 'AMZN', 'tweet_id']
    tweets = clean_tweets.loc[clean_tweets['tweet_id'].isin(ids)]
    tweets.to_pickle("Datasets/results/preprocessing/amzn_tweets.pkl")
else:
    # The subset is read from the local cache.
    tweets = pd.read_pickle("Datasets/results/preprocessing/amzn_tweets.pkl")
    print("Downloaded")

Downloaded


In [28]:
if not exists("Datasets/results/preprocessing/google_tweets.pkl"):
    # Both ticker symbols are treated as the same company.
    ids = tickers_df.loc[tickers_df['ticker_symbol'].isin(['GOOG', 'GOOGL']), 'tweet_id']
    tweets = clean_tweets.loc[clean_tweets['tweet_id'].isin(ids)]
    tweets.to_pickle("Datasets/results/preprocessing/google_tweets.pkl")
else:
    # The subset is read from the local cache.
    tweets = pd.read_pickle("Datasets/results/preprocessing/google_tweets.pkl")
    print("Downloaded")

Downloaded


In [29]:
# Row count of `tweets`, i.e. the subset assigned by the most recently executed selection cell.
len(tweets)

264094

In [30]:
# Seed words that steer topic formation; one inner list per intended theme.
seed_topic_list = [
    ["buy", "sell", "long", "short", "trade", "chart"],
    ["report", "quarter", "dividend", "annual", "revenue"],
    ["read", "via", "article"],
    ["management", "sale", "goal"]
    ]

# hdbscan_model = HDBSCAN(min_cluster_size=900, metric='euclidean', cluster_selection_method='eom')
# topic_model = BERTopic(hdbscan_model=hdbscan_model, language='English', verbose=True, seed_topic_list=seed_topic_list)

# umap_model = UMAP(n_neighbors=4, n_components=5, metric='cosine', low_memory=False)
# UMAP is stochastic: a fixed random_state makes the reduced embeddings, and
# therefore the resulting topics, repeatable across runs.
umap_model = UMAP(n_neighbors=4, n_components=5, metric='cosine', low_memory=True, random_state=42)
# Clusters smaller than 1000 documents are not formed.
hdbscan_model = HDBSCAN(min_cluster_size=1000, metric='euclidean', prediction_data=False)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    seed_topic_list=seed_topic_list,
    language="english",
    top_n_words=10,
    # min_topic_size=500,
    low_memory=True,
    calculate_probabilities=False,
    nr_topics="auto",
    verbose=True,
    # n_gram_range=(1, 2)
    )

In [31]:
# Reload the sentence embeddings and the texts they were computed from; both
# files are written by the embedding cell when its cache is missing.
# NOTE(review): pickle.load executes arbitrary code from the file — only load trusted files.
with open('Datasets/results/preprocessing/embeddings_2.pkl', "rb") as file:
    embeddings = pickle.load(file)

with open('Datasets/results/preprocessing/embeddings_raw.pkl', "rb") as file:
    embeddings_raw = pickle.load(file)

In [32]:
# Pick the texts and embeddings that belong to the current `tweets` subset.
# The subset's index labels are used as positions here — this assumes they
# match the row order the embeddings were computed in; TODO confirm.
raw = embeddings_raw.iloc[tweets.index.tolist()].reset_index(drop=True)
emb = embeddings[tweets.index.tolist()]

In [33]:
# Fit the topic model on the subset's texts, passing the precomputed embeddings.
topics, probs = topic_model.fit_transform(raw, emb)

2022-12-24 19:54:06,280 - BERTopic - Reduced dimensionality
2022-12-24 19:55:08,039 - BERTopic - Clustered reduced embeddings
2022-12-24 19:55:13,066 - BERTopic - Reduced number of topics from 11 to 11


In [3]:
# Load a previously saved topic model from disk (the save step is not shown in this notebook).
topic_model_google = BERTopic.load("topics/google_model")

In [4]:
# Summary table with one row per topic (Topic id, Count, Name); topic -1 is listed first.
topic_model_google.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,126222,-1_read_google_inc_stock
1,0,88743,0_google_buy_read_not
2,1,11385,1_stock_corp_invest_inc
3,2,6155,2_alphabet_earning_revenue_parent
4,3,4878,3_inc_alphabet_llc_management
5,4,4876,4_ip_patent_research_tech
6,5,4300,5_car_self_drive_waymo
7,6,2835,6_vr_daydream_glass_headset
8,7,2832,7_pixel_xl_read_phone
9,8,2449,8_twitter_tweet_twtr_buy


In [10]:
# Render the model's topic overview visualization.
topic_model_google.visualize_topics()

In [6]:
# Render the per-topic bar chart visualization.
topic_model_google.visualize_barchart()

In [7]:
# (word, score) pairs describing topic 0.
topic_model_google.get_topic(0)

[('google', 0.02331680256738344),
 ('buy', 0.019610177242018592),
 ('read', 0.01863683575339828),
 ('not', 0.017912427321849538),
 ('go', 0.017610024910324082),
 ('earning', 0.01733713497151893),
 ('call', 0.017010998708562243),
 ('trade', 0.016760428089490506),
 ('today', 0.0166949119262012),
 ('ad', 0.01636145962582802)]

In [8]:
# Example documents for topic 3.
topic_model_google.get_representative_docs(3)

['alphabet inc stake raise blbb advisor llc',
 'rnc capital management llc hasposition alphabet inc',
 'barton investment management hold holding alphabet inc']

In [9]:
# Render the hierarchical view of the topics.
topic_model_google.visualize_hierarchy()

In [None]:
# topic_model.reduce_topics(embeddings_raw.iloc[tweets.index.tolist()].reset_index(drop=True), nr_topics=30)

In [None]:
# topic_model.visualize_hierarchy()


In [None]:
# topic_model.get_topic_info()