SETUP

In [None]:
##MAKE SURE ALL EXTERNAL LIBRARIES ARE INSTALLED

# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install wordcloud
# !pip install torch
# !pip install transformers
# !pip install regex
# !pip install datasets
# !pip install pyLDAvis
# !pip install scikit-learn
# !pip install plotly
# !pip install tqdm
# !pip install yfinance
# !pip install networkx

In [None]:
import os
from operator import itemgetter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import regex as re
import torch
import yfinance as yf
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

PREPROCESSING

In [None]:
#LOAD DATASET FROM HUGGING FACE

# Fetch the 'mjw/stock_market_tweets' dataset, take its 'train' split and
# convert that split to a pandas DataFrame used by the following cells.
dataset = load_dataset('mjw/stock_market_tweets')
train_split = dataset['train']
df = train_split.to_pandas()
# Last expression of the cell: show the first rows.
df.head()


In [None]:
#BASIC INFORMATION ABOUT THE DATASET

# Shape, column names and the min/max of 'post_date'.
print(f"Dataset shape: {df.shape}")
column_names = list(df.columns)
print(f"Column names: {column_names}")
post_dates = df['post_date']
print(f"The starting date of the dataset: {post_dates.min()} and the ending date of the dataset: {post_dates.max()}")

#NA CHECK

# Number of null entries in every column.
print("The numbers of missing value for each columns:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
#INITIAL CLEANUP

# Drop the 'Unnamed: 0' column (presumably a leftover CSV index - TODO confirm).
# errors='ignore' keeps the cell idempotent: re-running it, or running it on a
# copy of the data that lacks the column, no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
# Remove every row that has at least one missing value.
df = df.dropna()
# Confirm that no missing values remain.
print(df.isnull().sum())

In [None]:
#TYPECASTING

# Cast the three count columns to int, one column at a time.
for count_col in ('comment_num', 'retweet_num', 'like_num'):
    df[count_col] = df[count_col].astype(int)



In [None]:
#TWEET PREPROCESSING

def preprocess_text(text):
    """Return `text` with URLs and punctuation removed, converted to lowercase.

    Steps, in order:
      1. delete every run that starts with 'http' and extends to the next whitespace;
      2. delete every character that is neither a word character nor whitespace;
      3. lowercase what is left.

    Whitespace is not collapsed, so gaps left by removed tokens are kept.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_punctuation = re.sub(r'[^\w\s]', '', without_urls)
    return without_punctuation.lower()


EDA

In [None]:
#RANGE OF EACH COLUMN

# Print the smallest and largest value of each count column.
for count_col in ('comment_num', 'retweet_num', 'like_num'):
    lowest = df[count_col].min()
    highest = df[count_col].max()
    print(f"Range of {count_col}: {lowest} - {highest}")

In [None]:
#EXPLORE THE DISTRIBUTION OF NUMERIC COLUMNS

# plotly.express is already imported as `px` in the notebook's import cell,
# so it is not imported a second time here.
for col in ['comment_num', 'retweet_num', 'like_num']:
    # One 30-bin histogram per count column; the title names the column so
    # each figure can be read on its own.
    fig = px.histogram(df, x=col, nbins=30, title=f'Distribution of {col}')
    fig.show()


In [None]:
#EXPLORE THE DISTRIBUTION OF THE READILY AVAILABLE TICKER SYMBOLS
# Histogram over the 'ticker_symbol' column: one bar per distinct symbol.
fig = px.histogram(data_frame=df, x='ticker_symbol')
fig.show()


APPLY PRETRAINED MODEL TO OBTAIN SENTIMENT LABELS

In [None]:
## LEVERAGE UPON THE GPU FACILITY OF GOOGLE COLAB TO APPLY THE PRETRAINED MODEL

# import torch
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# from tqdm import tqdm

##CHECK IF GPU IS AVAILABLE
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## MODEL AND TOKENIZER LOADING
# model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

##LABELS MAPPING
# labels = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}

# def sentiment_analysis(texts):
#     #INPUT PROCESSING
#     inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
#     inputs = inputs.to(device) 
#     #EXTRACT PREDICTIONS
#     with torch.no_grad():  
#         predictions = model(**inputs)
#     _, predicted = torch.max(predictions.logits, 1)
#     return [labels[f'LABEL_{p.item()}'] for p in predicted]




In [None]:
# #APPLY THE MODEL TO THE DATASET
# batch_size = 16 
# sentiments = []
# for i in tqdm(range(0, len(df), batch_size)):
#     batch = df['body'].iloc[i:i+batch_size].tolist()
#     batch_sentiments = sentiment_analysis(batch)
#     sentiments.extend(batch_sentiments)
# #BUILD DF
# df['sentiment'] = sentiments

In [None]:
##DUMP THE NEW DATASET WITH CLASSIFIED LABELS INTO A CSV
#df.to_csv('cached_labels.csv', index=False)

In [None]:
## THIS CODE SNIPPET WAS NECESSARY EARLIER, WHEN USING GOOGLE COLAB
# from google.colab import drive
# drive.mount('/content/drive')
# import pandas as pd
# df = pd.read_csv('/content/drive/My Drive/Quantallia Analyst Notes/cached_labels.csv')  


In [None]:
##FOR REPRODUCIBILITY, SET THE CACHED_LABELS_PATH ENVIRONMENT VARIABLE (OR EDIT THE FALLBACK PATH BELOW)
# The location of the cached CSV can be overridden without editing the
# notebook; when CACHED_LABELS_PATH is unset the original path is used.
path = os.environ.get("CACHED_LABELS_PATH", "C:\\Users\\hajiw\\Downloads\\cached_labels.csv")
# Replaces `df` with the contents of the cached CSV.
df = pd.read_csv(path)

In [None]:
#SENTIMENT DISTRIBUTION ACROSS TICKER
# Number of rows for every (ticker_symbol, sentiment) pair.
df_sentiment = df.groupby(['ticker_symbol', 'sentiment']).size().reset_index(name='counts')
# Fallback colors for any sentiment value that is missing from the map below.
color_sequence = ['yellow', 'green', 'red']
# Pin each sentiment value to a fixed color so the coloring does not depend on
# the order in which slices happen to be drawn.
# NOTE(review): assumes the cached labels are -1 / 0 / 1 (negative / neutral /
# positive) as in the labelling cell's mapping - confirm against the CSV.
sentiment_colors = {-1: 'red', 0: 'yellow', 1: 'green'}

fig = px.pie(df_sentiment, names='sentiment', values='counts', facet_col='ticker_symbol',
             title='Sentiment Distribution per Ticker Symbol',
             labels={'counts':'Count', 'sentiment':'Sentiment', 'ticker_symbol':'Ticker Symbol'},
             color='sentiment',
             color_discrete_map=sentiment_colors,
             color_discrete_sequence=color_sequence)

fig.show()


In [None]:
##PLOTS OF DAILY, AVERAGE SENTIMENT SCORES FOR EACH TICKER SYMBOL OVER SAMPLED PERIOD

# Parse 'post_date' so it can be used for time-based resampling.
df['post_date'] = pd.to_datetime(df['post_date'])
grouped_df = df.groupby('ticker_symbol')

# One line chart per ticker: mean sentiment of each calendar day.
for ticker, ticker_tweets in grouped_df:
    daily_sentiment = (
        ticker_tweets
        .sort_values('post_date')
        .resample('D', on='post_date')['sentiment']
        .mean()
    )

    # Build the trace first, then wrap it in a figure.
    trace = go.Scatter(x=daily_sentiment.index, y=daily_sentiment.values)
    fig = go.Figure(data=trace)

    fig.update_layout(
        title=f'Average Daily Sentiment Score over Time - {ticker}',
        xaxis={'title': 'Date'},
        yaxis={'title': 'Average Daily Sentiment Score'},
    )
    fig.show()


TRADING STRATEGY BACKTESTING

In [None]:
##PREPARE THE CLOSE PRICE FOR BACKTESTING

#INPUT TICKERS TO MAKE REQUESTS TO YFINANCE
tickers = df['ticker_symbol'].dropna().unique().tolist()
# Build the download window from whole calendar days. yfinance treats `end` as
# exclusive, so one day is added to keep the last day of tweets in the window;
# normalizing strips the time-of-day so the first day is not cut short either.
post_dates = pd.to_datetime(df['post_date'])
price_start = post_dates.min().normalize()
price_end = post_dates.max().normalize() + pd.Timedelta(days=1)
#EXTRACT CLOSE PRICE USING .DOWNLOAD()
close_prices = yf.download(tickers, start=price_start, end=price_end)['Close']







In [None]:
#CREATING A NEW DATAFRAME FOR AVERAGE DAILY SENTIMENT SCORES

# Mean sentiment per (ticker, calendar day), then reshaped so that rows are
# days and columns are tickers - ready to merge with close_prices.
daily_bins = pd.Grouper(key='post_date', freq='D')
sentiment_df = (
    df.groupby(['ticker_symbol', daily_bins])['sentiment']
    .mean()
    .reset_index()
    .pivot(index='post_date', columns='ticker_symbol', values='sentiment')
)

In [None]:
### RERUNNING THIS SCRIPT CAN BE EXTREMELY TIME-CONSUMING.



### PREPARE AUGMENTED DF FOR BACKTESTING

# augmented_df = pd.concat([close_prices, sentiment_df], keys=['Close', 'Sentiment'], axis=1).sort_index(axis=1)
# augmented_df = augmented_df.sort_index()  
# #Calculate sentiment score difference over 3-day period
# for ticker in tickers:
#     if ticker in augmented_df.columns.get_level_values(1):
#         augmented_df[('Signal', ticker)] = augmented_df[('Sentiment', ticker)].diff(3)
# # Create a DataFrame to store positions
# positions_df = pd.DataFrame(index=augmented_df.index)

# # If the daily sentiment score is higher than it was three days earlier, we buy (long position = 1)
# # If the daily sentiment score is lower than it was three days earlier, we sell (short position = -1)
# # If the score is unchanged, or the comparison is unavailable, we do nothing (position = 0)
# for ticker in tickers:
#     if ticker in augmented_df.columns.get_level_values(1):
#         positions_df[ticker] = np.where(augmented_df[('Signal', ticker)] > 0, 1, 0)
#         positions_df[ticker] = np.where(augmented_df[('Signal', ticker)] < 0, -1, positions_df[ticker])

# #Forward fill positions to simulate holding positions until next trade
# positions_df.ffill(inplace=True)


In [None]:
##1/ CALCULATE DAILY LOG RETURNS OF THE CLOSE PRICES

# for ticker in tickers:
#     if ticker in augmented_df.columns.get_level_values(1):
#         augmented_df[('Log Returns', ticker)] = np.log(augmented_df[('Close', ticker)] / augmented_df[('Close', ticker)].shift(1))

##2/ PREPARE LOG RETURNS DATAFRAME

# log_returns_df = augmented_df.xs('Log Returns', axis=1, level=0, drop_level=False)
# log_returns_df.columns = log_returns_df.columns.droplevel(0)

##3/ CALCULATE DAILY STRATEGY RETURNS

# strategy_returns_df = positions_df.shift().reindex(log_returns_df.index) * log_returns_df

##4/ CALCULATE CUMULATIVE STRATEGY RETURNS

# cumulative_strategy_returns_df = strategy_returns_df.cumsum()

##5/ RETURN THE CUMULATIVE RETURNS OF THE STRATEGY

# print(cumulative_strategy_returns_df.tail())


In [None]:
## CONVERTING LOG RETURNS TO SIMPLE RETURNS MANUALLY SINCE I FORGOT TO SAVE THE AUGMENTED_DF (RESULTS ARE SCREENSHOTTED)

# ticker_cumreturns = {'AAPL': 0.75183, 'AMZN': 2.107555, 'GOOG': -0.263453, 'MSFT': -1.082538, 'TSLA': 20.471324, 'SP500':0.450892}
# initial_investment = 1000

# for ticker, log_return in ticker_cumreturns.items():
#     simple_return = np.exp(log_return) - 1
#     final_value = initial_investment * (1 + simple_return)
#     print(f"{ticker}: {final_value}")


In [None]:
## CALCULATE THE EXPECTED PERFORMANCE OF AN EQUALLY-WEIGHTED PORTFOLIO

# initial_investment_per_asset = 1000 / len(ticker_cumreturns)
# total_value = 0
# for ticker, log_return in ticker_cumreturns.items():
#     simple_return = np.exp(log_return) - 1
#     final_value = initial_investment_per_asset * (1 + simple_return)
#     total_value += final_value
#     print(f"Final value of {ticker}: ${final_value:.2f}")

# total_value = int(total_value)
# print(f"Total value of the portfolio: ${total_value:,}")

In [None]:
## RETURN THE BACKTEST STATISTICS

# print(f'Backtest Timeframe: {augmented_df.index[0].date()} to {augmented_df.index[-1].date()}')

##  Calculate number of trades entered
# num_trades = np.count_nonzero(positions_df.diff().abs())

## Calculate number of winning trades
# num_winning_trades = np.count_nonzero((strategy_returns_df > 0).sum(axis=1))

## Calculate win rate
# win_rate = num_winning_trades / num_trades

# print(f'Number of trades entered: {num_trades}')
# print(f'Number of winning trades: {num_winning_trades}')
# print(f'Win rate: {win_rate:.2f}')


In [None]:
##CALCULATE S&P 500 LOG RETURNS OVER THE SAMPLED PERIOD
# data = yf.download('^GSPC', start='2015-01-01', end='2020-01-01')
# data['Log Returns'] = np.log(data['Close'] / data['Close'].shift(1))
# data['Cumulative Log Returns of SP500'] = data['Log Returns'].cumsum()
# print(data['Cumulative Log Returns of SP500'].tail())


In [None]:
##CALCULATE THE CASH TERMS FOR THE STRATEGY'S RETURNS.

# initial_investment = 1000
# ticker_cumreturns = {'AAPL': 0.75183, 'AMZN': 2.107555, 'GOOG': -0.263453, 'MSFT': -1.082538, 'TSLA': 20.471324}
# final_value = {}
# for ticker, cum_return in ticker_cumreturns.items():
#     final_value[ticker] = initial_investment * (1 + cum_return)
# print("Final value of the investment for each stock, given an initial investment of $1000:")
# for ticker, value in final_value.items():
#     print(f"{ticker}: ${value:.2f}")


In [None]:
## IDENTIFICATION OF LONGEST PERIOD OF POSITIVE/NEGATIVE SENTIMENT SCORES FOR EACH TICKER SYMBOL


def _sentiment_runs(daily_sentiment, mask):
    """Return one (first_day, last_day) tuple per unbroken run of days where `mask` is True.

    Parameters:
        daily_sentiment: Series of daily mean sentiment, indexed by day.
        mask: boolean Series on the same index selecting the days of interest.

    Returns:
        Series of (start, end) index tuples, one per run; empty when no day
        satisfies the mask.
    """
    if not mask.any():
        return pd.Series([], dtype=object)
    # A new run id starts wherever the mask value differs from the previous day's.
    run_ids = (mask != mask.shift()).cumsum()
    return daily_sentiment[mask].groupby(run_ids).apply(lambda run: (run.index[0], run.index[-1]))


def _print_longest_periods(periods_by_ticker, label):
    """Print, for every ticker, the run with the largest end-minus-start span.

    `label` is the word inserted in the message ('positive' or 'negative').
    A ticker without any run is reported as 'none' instead of raising.
    """
    for ticker, periods in periods_by_ticker.items():
        print(f"Ticker: {ticker}")
        if len(periods) == 0:
            print(f"Longest {label} period: none")
            continue
        start, end = max(periods, key=lambda span: span[1] - span[0])
        print(f"Longest {label} period: {start.date()} to {end.date()}")


# Make sure 'post_date' is a datetime object
df['post_date'] = pd.to_datetime(df['post_date'])
# Group the DataFrame by 'ticker_symbol'
grouped_df = df.groupby('ticker_symbol')
# Per-ticker Series of (start, end) tuples for positive and negative runs
positive_periods = {}
negative_periods = {}

# Construct time series of average daily sentiment scores for each ticker
for ticker, group in grouped_df:
    group = group.sort_values('post_date')
    daily_sentiment = group.resample('D', on='post_date')['sentiment'].mean()

    # Days without tweets have a NaN mean, fail both comparisons, and therefore
    # break a run.
    positive_periods[ticker] = _sentiment_runs(daily_sentiment, daily_sentiment > 0)
    negative_periods[ticker] = _sentiment_runs(daily_sentiment, daily_sentiment < 0)

_print_longest_periods(positive_periods, 'positive')
_print_longest_periods(negative_periods, 'negative')
