In [None]:
# Data processing
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras import preprocessing as kprocessing
from tensorflow.keras import models, layers, optimizers

import transformers

import gensim.models.keyedvectors as word2vec
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM
from keras import layers 

# Visualization
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Text processing
import re
import nltk
import gensim.downloader as api
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources this notebook needs: WordNet (for WordNetLemmatizer)
# and the stopword lists. The bare `nltk.download` attribute reference that used to
# precede these calls did nothing and has been dropped.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
import contractions
import string 

# Various
from datetime import datetime

In [None]:
import snscrape.modules.twitter as sntwitter

# Upper bound on the enumerate index before the scrape stops.
MAX_TWEETS = 500000000
# Print a progress line only every PROGRESS_EVERY tweets instead of once per tweet,
# which would flood the cell output.
PROGRESS_EVERY = 10000

# Creating list to append tweet data to
tweets_list = []

# Using TwitterSearchScraper to scrape data and append tweets to list
scraper = sntwitter.TwitterSearchScraper('XAU/USD since:2020-03-29 until:2022-08-24')
for i, tweet in enumerate(scraper.get_items()):
    if i > MAX_TWEETS:
        break
    tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username])
    if i % PROGRESS_EVERY == 0:
        print(i)

# Creating a dataframe from the tweets list above
df_tweets = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])


In [None]:
import pandas as pd
# Load tweets from the 'Twitter_past_6months' CSV. This rebinds df_tweets, replacing
# whatever an earlier cell assigned to it. No parse_dates is passed, so 'Datetime' is
# presumably loaded as text rather than as datetimes — confirm before calling
# datetime methods on it.
df_tweets = pd.read_csv('Twitter_past_6months', lineterminator='\n')

In [None]:
# Keep only tweets strictly inside the study window, then renumber the rows from 0.
# (The previous bare `df_tweets.reindex()` discarded its result and had no effect.)
window_start = '2022-03-30 21:30'
window_end = '2022-08-23 08:30'
in_window = (df_tweets['Datetime'] > window_start) & (df_tweets['Datetime'] < window_end)
df_tweets = df_tweets[in_window].reset_index(drop=True)
df_tweets

In [None]:
# Load the '30 min' sheet of the intraday price workbook.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
df_30 = pd.read_excel('/Users/karinnathea/Downloads/Intraday_Price_Data.xlsx', sheet_name='30 min')

# Bar-over-bar percentage change of the close; the first bar has no predecessor,
# so its NaN return is removed by dropna() (which also drops any other row containing NaN).
df_30['Return'] = df_30['Close'].pct_change()
df_30 = df_30.dropna()

# Label each bar 1 when its return is positive, else 0. This is computed inline because
# calculate_binary is only defined in a later cell, so calling it here raised NameError
# on a fresh kernel.
df_30['Binary'] = (df_30['Return'] > 0).astype(int)
df_30

In [None]:
def calculate_binary(data):
    """Add a 0/1 'Binary' column flagging rows with a positive 'Return'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric 'Return' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'Binary' set to 1 where Return > 0 and 0
        otherwise (zero, negative and NaN returns all map to 0).
    """
    # Vectorised comparison instead of a per-row loop with chained `.iloc` assignment,
    # which triggers SettingWithCopyWarning and leaves an object-dtype column.
    data['Binary'] = (data['Return'] > 0).astype(int)
    return data

# Add the 0/1 'Binary' return label to the 30-minute price frame.
df_30 = calculate_binary(df_30)

In [None]:
# pre-processing data

# remove URLs (raw-string pattern: the old non-raw literal relied on invalid escape
# sequences such as '\/' and '\S', which newer Python versions warn about)
df_tweets['Converted_Text'] = df_tweets['Text'].str.replace(r'(http[s]?)://\S+', '', regex=True)

# remove hashtags (each run of '#' becomes a single space; the tag word itself is kept)
df_tweets['Converted_Text'] = df_tweets['Converted_Text'].str.replace(r'#+', ' ', regex=True)

# convert to lower case
df_tweets['Converted_Text'] = df_tweets['Converted_Text'].str.lower()

# remove punctuations
def remove_punct(text):
    """Return `text` with ASCII punctuation characters and digit runs removed.

    Every character listed in ``string.punctuation`` is deleted first, then each
    run of the digits 0-9 is deleted. Whitespace is left untouched.
    """
    punctuation_free = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', punctuation_free)

# apply contractions
# This must run BEFORE punctuation is stripped: remove_punct deletes apostrophes, after
# which forms such as "we're" / "he'll" collapse into ordinary words ("were", "hell")
# and can no longer be expanded.
df_tweets['Converted_Text'] = df_tweets['Converted_Text'].apply(contractions.fix)

# strip punctuation and digits
df_tweets['Converted_Text'] = df_tweets['Converted_Text'].apply(remove_punct)


## Tokenisation
def tokenisation(text):
    """Split `text` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The tokens in order. Empty strings, which ``re.split`` yields when the
        text starts or ends with a separator (or is empty), are dropped.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', text) if token]

# Build the 'Token' column: lower-case each cleaned tweet and split it into word tokens.
df_tweets['Token'] = df_tweets['Converted_Text'].map(lambda cleaned: tokenisation(cleaned.lower()))

def stemming(text):
    """Stem every token in `text` with NLTK's Porter stemmer.

    Parameters
    ----------
    text : list of str
        Tokens to stem.

    Returns
    -------
    list of str
        The stemmed tokens, in the same order.
    """
    # Build the stemmer once per call instead of once per word.
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(word) for word in text]

# Stem every token list in place of the raw tokens.
df_tweets['Token'] = df_tweets['Token'].apply(stemming)

lm = WordNetLemmatizer()
def lemmatizer_on_text(data):
    """Lemmatise every token in `data` with the module-level lemmatiser `lm`.

    Parameters
    ----------
    data : list of str
        Tokens to lemmatise.

    Returns
    -------
    list of str
        The lemmatised tokens, in the same order.
    """
    # Return the lemmatised list; previously the untouched input was returned,
    # so the lemmatisation result was silently thrown away.
    return [lm.lemmatize(word) for word in data]
# Pass each token list through lemmatizer_on_text, then display the resulting frame.
df_tweets['Token'] = df_tweets['Token'].map(lemmatizer_on_text)

df_tweets

In [None]:
# add the time of year, month, day and minute to each dataframe
def extract_year(datetime):
    return datetime.date().year

def extract_month(datetime):
    return datetime.date().month

def extract_day(datetime):
    return datetime.date().day

def extract_hour(datetime):
    return datetime.hour

def extract_minute(datetime):
    return datetime.minute


# Break each tweet's 'Datetime' into separate key columns. The raw minute goes into
# 'min' (not 'minute').
for column_name, extractor in (
    ('year', extract_year),
    ('month', extract_month),
    ('day', extract_day),
    ('hour', extract_hour),
    ('min', extract_minute),
):
    df_tweets[column_name] = df_tweets['Datetime'].apply(extractor)

In [None]:
# Break each price bar's 'Date' into year/month/day/hour/minute key columns.
price_time_parts = {
    'year': extract_year,
    'month': extract_month,
    'day': extract_day,
    'hour': extract_hour,
    'minute': extract_minute,
}
for part_name, part_extractor in price_time_parts.items():
    df_30[part_name] = df_30['Date'].apply(part_extractor)
df_30

In [None]:
def nearest_30min(minute):
    """Map a minute-of-hour onto the minute value of a 30-minute bar.

    Parameters
    ----------
    minute : int
        Minute within the hour.

    Returns
    -------
    int or None
        30 for minutes 0-30 (inclusive), 0 for minutes 31-60, and None for any
        other value.

    Notes
    -----
    NOTE(review): minutes 31-59 map to minute 0 but the hour is not advanced by this
    function or, as far as visible, by its caller — so such a tweet is keyed to
    HH:00 of the same hour rather than the following hour. Confirm this is intended.
    """
    # Membership tests on range objects avoid building a fresh list on every call;
    # the second range starts at 31 because 30 is already claimed by the first.
    if minute in range(0, 31):
        return 30
    if minute in range(31, 61):
        return 0
    return None
    
# Bucket each tweet's raw minute into the 'minute' column via nearest_30min.
df_tweets['minute'] = df_tweets['min'].map(nearest_30min)

In [None]:
# Attach each price bar's return and label to the tweets sharing its time key.
merge_keys = ['year', 'month', 'day', 'hour', 'minute']
df_30_merge = df_30[["Return", "Binary"] + merge_keys]
new_merge = pd.merge(df_tweets, df_30_merge, on=merge_keys, how='outer')

# Drop the final row and forward-fill the gaps the outer join leaves.
# Fixes: the slice previously referenced the undefined name `new_merge_30`, and the
# fill used `fillna(method='ffill', inplace=True)` on a slice of another frame.
new_df = new_merge[:-1].ffill()
new_df.to_csv('cleanedData', index = False)

In [None]:
# Positive return
# Word cloud built from the raw text of every row labelled Binary == 1.
is_positive = new_df['Binary'] == 1
data_pos = new_df.loc[is_positive, 'Text']
positive_corpus = " ".join(data_pos)
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False,
).generate(positive_corpus)
plt.imshow(wc)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Negative return
# Select rows labelled Binary == 0 (non-positive return). The filter previously used
# == 1, which reproduced the positive-return cloud.
data_neg = new_df['Text'][new_df['Binary'] == 0]
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1600 , height = 800,
               collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)

In [9]:
import pandas as pd

# Reload the merged dataset written to 'cleanedData' by the merge cell.
# NOTE(review): columns that held Python lists when saved (e.g. 'Token') presumably
# come back as their string representation after the CSV round trip — confirm.
new_df = pd.read_csv('cleanedData', lineterminator='\n')

In [None]:
## Modelling processes

In [None]:
# Row position separating the two splits: rows before it form the test set, the rest the training set.
split_row = 48906

# test data from 24th of July 2022 onwards
test = new_df[:split_row]
# train data from 30th of March 2022 to 22nd of July 2022
train = new_df[split_row:]

x_test, y_test = test['Token'], test['Binary']
x_train, y_train = train['Token'], train['Binary']

In [None]:
# TF-IDF + Unigram + Bigram Vectorisation
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) extracts unigrams and bigrams; max_features caps the vocabulary size.
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features = 5000000)
# Fit on the training split only, so the test split does not influence the vocabulary.
vectoriser.fit(x_train)

In [None]:
# Replace both splits with their TF-IDF representation from the fitted vectoriser.
# NOTE(review): x_train / x_test are rebound here, so running this cell a second time
# would pass the already-transformed data back into transform().
x_train = vectoriser.transform(x_train)
x_test  = vectoriser.transform(x_test)

In [None]:
def model_Evaluate(model):
    """Print a classification report and plot the confusion matrix for `model`.

    The model is evaluated on the notebook-level `x_test` / `y_test` variables.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    """
    # Imported locally because the notebook only imports these names in a later cell,
    # so calling this function on a fresh kernel raised NameError.
    from sklearn.metrics import classification_report, confusion_matrix

    # Predict values for Test dataset
    y_pred = model.predict(x_test)
    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))
    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)
    categories = ['0','1']
    group_names = ['True Neg','False Pos', 'False Neg','True Pos']
    # Share of all test samples falling in each cell, in row-major order.
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
    # Name and percentage on separate lines (the separator used to be a literal 'n').
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    # NOTE(review): `sns` (seaborn) is not imported in any visible cell of this
    # notebook; it must be imported before this function is called.
    sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
    xticklabels = categories, yticklabels = categories)
    plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
    plt.ylabel("Actual values" , fontdict = {'size':14}, labelpad = 10)
    plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

In [None]:
# LinearSVC is not imported anywhere else in the notebook, so bring it in here.
from sklearn.svm import LinearSVC

# Linear support-vector classifier on the TF-IDF features.
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)
model_Evaluate(SVCmodel)
# Keep the test-set predictions for later use.
y_pred1 = SVCmodel.predict(x_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features; random_state fixed for reproducibility.
RFModel = RandomForestClassifier(n_estimators=200, random_state=0)
RFModel.fit(x_train, y_train)
model_Evaluate(RFModel)
# Predict with the model trained in this cell (previously the undefined `LRmodel`).
y_pred2 = RFModel.predict(x_test)

In [10]:
## LSTM
# Inputs are the cleaned tweet strings; labels are cast to str here and to float64 below.
X_tf = new_df['Converted_Text']
y_tf_class = new_df['Binary'].astype(str)

# Split into training and test data
X_tf_test, X_tf_train = X_tf[:48906], X_tf[48906:]
y_tf_test, y_tf_train = y_tf_class[:48906], y_tf_class[48906:]

y_tf_train = np.asarray(y_tf_train).astype("float64")
y_tf_test = np.asarray(y_tf_test).astype("float64")

# Fit the Keras tokenizer on the training text only.
corpus = X_tf_train
max_words = 125000
tokenizer = kprocessing.text.Tokenizer(
    num_words=max_words,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    oov_token="<pad>",
)
tokenizer.fit_on_texts(corpus)
voc = tokenizer.word_index
# Inverse lookup: tokenizer index -> word.
reverse_voc = {index: word for word, index in voc.items()}

max_len = 200
# Convert both vectorised train and test data to sequence
sequences = tokenizer.texts_to_sequences(X_tf_train)
test_sequences = tokenizer.texts_to_sequences(X_tf_test)
X_tf_train_seq = kprocessing.sequence.pad_sequences(sequences, maxlen=max_len)
X_tf_test_seq = kprocessing.sequence.pad_sequences(test_sequences, maxlen=max_len)


In [14]:
import gensim.downloader as api

In [15]:
# This may take several minutes
w2v = api.load("word2vec-google-news-300")

# Row k holds the pretrained 300-d vector of the word whose tokenizer index is k.
# Row 0 and the rows of words missing from word2vec stay all-zero.
emb_matrix = np.zeros((max_words+1, 300))
# Iterate over the words the tokenizer actually saw instead of indexing
# reverse_voc[1..max_words], which raised KeyError whenever the corpus vocabulary
# was smaller than max_words.
for w, word_index in voc.items():
    if word_index <= max_words and w in w2v:
        emb_matrix[word_index,:] = w2v[w]
emb_size = emb_matrix.shape[1]



In [26]:
# create model
# max_words, emb_matrix and emb_size are reused from the embedding-matrix cell above
# rather than being rebuilt here with an identical copy of that loop.

lstm_out = 125
model = Sequential()
# Frozen embedding layer initialised from the pretrained word2vec matrix.
model.add(layers.Embedding(max_words+1,emb_size,weights=[emb_matrix],trainable=False, name='embedding'))
model.add(layers.Bidirectional(layers.LSTM(lstm_out)))
model.add(layers.Dropout(0.2, name='dropout'))
model.add(Dense(32, activation='relu', name='dense'))
# Single sigmoid unit: the output is a score in [0, 1] for the Binary == 1 class.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself; wrapping it in print() only added a stray "None".
model.summary()


#fit model
batch_size = 32
model.fit(X_tf_train_seq, y_tf_train, epochs = 10, verbose=1, batch_size=batch_size)

#analyze the results
score, acc = model.evaluate(X_tf_test_seq, y_tf_test, verbose = 2, batch_size=batch_size)
# Sigmoid scores for the test set; thresholded later for the confusion matrix.
y_pred3 = model.predict(X_tf_test_seq)

In [31]:
from sklearn.metrics import confusion_matrix, roc_curve,  roc_auc_score, classification_report
#ROC AUC curve
# Computed from the raw sigmoid scores in y_pred3 (not thresholded labels).
rocAuc = roc_auc_score(y_tf_test, y_pred3)

falsePositiveRate, truePositiveRate, _ = roc_curve(y_tf_test, y_pred3)

plt.figure()

plt.plot(falsePositiveRate, truePositiveRate, color='green',
         lw=3, label='ROC curve (area = %0.2f)' % rocAuc)
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of Sentiment Analysis Model')
plt.legend(loc="lower right")
plt.show()


#Other accuracy metrics
# Threshold into a separate variable so y_pred3 keeps the raw scores; overwriting it
# made a second run of this cell compute the ROC curve from booleans.
y_pred3_labels = (y_pred3 > 0.5)

#confusion matrix
cm = confusion_matrix(y_tf_test, y_pred3_labels)
print(cm)

#F1 Score, Recall and Precision
print(classification_report(y_tf_test, y_pred3_labels, target_names=['Negative', 'Positive']))

# Plotting confusion matrix
categories = ['0','1']
group_names = ['True Neg','False Pos', 'False Neg','True Pos']
group_percentages = ['{0:.2%}'.format(value) for value in cm.flatten() / np.sum(cm)]
# Name and percentage on separate lines (the separator used to be a literal 'n').
labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
# NOTE(review): `sns` (seaborn) is not imported in any visible cell of this notebook.
sns.heatmap(cm, annot = labels, cmap = 'Blues',fmt = '', xticklabels = categories, yticklabels = categories)

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve,  roc_auc_score, classification_report
#ROC AUC curve
# NOTE(review): y_pred4 is not assigned in any visible cell of this notebook; it must
# hold the predicted scores of the model being evaluated here before this cell runs.
rocAuc = roc_auc_score(y_tf_test, y_pred4)

falsePositiveRate, truePositiveRate, _ = roc_curve(y_tf_test, y_pred4)

plt.figure()

plt.plot(falsePositiveRate, truePositiveRate, color='green',
         lw=3, label='ROC curve (area = %0.2f)' % rocAuc)
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of Sentiment Analysis Model')
plt.legend(loc="lower right")
plt.show()


#Other accuracy metrics
# Use the same predictions as the ROC curve above. The confusion matrix and report
# previously used y_pred3, i.e. a different model's output than the ROC plot.
y_pred4_labels = (y_pred4 > 0.5)

#confusion matrix
cm = confusion_matrix(y_tf_test, y_pred4_labels)
print(cm)

#F1 Score, Recall and Precision
print(classification_report(y_tf_test, y_pred4_labels, target_names=['Negative', 'Positive']))

# Plotting confusion matrix
categories = ['0','1']
group_names = ['True Neg','False Pos', 'False Neg','True Pos']
group_percentages = ['{0:.2%}'.format(value) for value in cm.flatten() / np.sum(cm)]
# Name and percentage on separate lines (the separator used to be a literal 'n').
labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
# NOTE(review): `sns` (seaborn) is not imported in any visible cell of this notebook.
sns.heatmap(cm, annot = labels, cmap = 'Blues',fmt = '', xticklabels = categories, yticklabels = categories)
plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
plt.ylabel("Actual values" , fontdict = {'size':14}, labelpad = 10)
plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)