In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Filename prefix of the cleaned data export to load; the same prefix is reused for
# every file this notebook writes. Replace by most recent version.
filestamp = '1711835043'

# Functions for later use

In [4]:
# Filter the datapoints to be published after the knowledge cut-off of GPT-4
def knowledge_cut_off(data, cutoff='01-04-2023'):
  """Return the rows of `data` whose 'timestamp' is on or after `cutoff`.

  Args:
    data: DataFrame with a 'timestamp' column. It is not modified.
    cutoff: Inclusive lower bound compared against the 'timestamp' column.
      The default is the value that used to be hard-coded in this function.

  Returns:
    A new DataFrame holding only the matching rows, re-indexed from 0.
  """
  # NOTE(review): pandas reads '01-04-2023' month-first by default, i.e. 4 January 2023.
  # If 1 April 2023 was intended, pass an unambiguous value such as '2023-04-01' — confirm.
  on_or_after_cutoff = data['timestamp'] >= cutoff
  # Chain reset_index rather than mutating the filtered frame with inplace=True,
  # so the caller always receives a fresh frame.
  return data[on_or_after_cutoff].reset_index(drop=True)

In [5]:
# Sample datapoints to get a balanced training, validation and test set. Validation and test hold val_size and test_size samples (200 by default); training gets the remainder of sample_size.

def sampling_balanced_data(data, val_size=200, test_size=200, sample_size=3000, label_col='result_1', random_state=42):
  """Draw a class-balanced train/validation/test split from the head of `data`.

  Only the first 2*sample_size rows of `data` are considered. From those,
  sample_size/2 'UP' rows and sample_size/2 'DOWN' rows are drawn at random and
  each class contributes half of every split.

  Args:
    data: DataFrame whose label column contains 'UP' / 'DOWN'.
    val_size: Size of the validation set (even values split exactly over both classes).
    test_size: Size of the test set (even values split exactly over both classes).
    sample_size: Number of rows drawn in total over the three sets.
    label_col: Name of the label column.
    random_state: Seed used both for drawing the rows and for the final shuffles.

  Returns:
    Tuple (train, validation, test) of shuffled DataFrames, each re-indexed from 0.

  Raises:
    ValueError: If val_size and test_size do not fit inside sample_size, or if a
      class has fewer than sample_size/2 rows among the considered rows.
  """
  per_class = round(sample_size/2)
  split_val_start = round(sample_size/2 - val_size/2 - test_size/2)
  split_test_start = round(sample_size/2 - test_size/2)

  # Negative or out-of-order boundaries would make iloc slice from the end and
  # silently return wrong splits, so they are rejected before any sampling.
  if not 0 <= split_val_start <= split_test_start <= per_class:
    raise ValueError(
      f'val_size={val_size} and test_size={test_size} do not fit in sample_size={sample_size}'
    )

  head = data.iloc[:2*sample_size]
  drawn = {}
  for label in ('UP', 'DOWN'):
    candidates = head[head[label_col] == label]
    if len(candidates) < per_class:
      raise ValueError(
        f"Need {per_class} '{label}' rows in the first {2*sample_size} rows, found {len(candidates)}"
      )
    drawn[label] = candidates.sample(n=per_class, random_state=random_state).reset_index(drop=True)
  UP_rand, DOWN_rand = drawn['UP'], drawn['DOWN']

  # Each class is cut at the same positions, which keeps every split balanced.
  train = pd.concat([UP_rand.iloc[:split_val_start], DOWN_rand.iloc[:split_val_start]]).reset_index(drop=True)
  validation = pd.concat([UP_rand.iloc[split_val_start:split_test_start], DOWN_rand.iloc[split_val_start:split_test_start]]).reset_index(drop=True)
  test = pd.concat([UP_rand.iloc[split_test_start:], DOWN_rand.iloc[split_test_start:]]).reset_index(drop=True)

  # Shuffle so the UP rows are no longer all in front of the DOWN rows.
  train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)
  validation = validation.sample(frac=1, random_state=random_state).reset_index(drop=True)
  test = test.sample(frac=1, random_state=random_state).reset_index(drop=True)

  return train, validation, test

In [6]:
# Sample datapoints to get from the original balanced dataset to build the mix
def sample_balance_mixed_data(data, sample_size):
  """Draw sample_size/2 'UP' rows and sample_size/2 'DOWN' rows from `data`.

  Rows are drawn with a fixed seed (42). The result lists the 'UP' rows first,
  followed by the 'DOWN' rows, re-indexed from 0.
  """
  per_label = round(sample_size/2)
  picked = []
  for label in ('UP', 'DOWN'):
    label_rows = data[data.result_1 == label]
    picked.append(label_rows.sample(n=per_label, random_state=42))
  return pd.concat(picked, ignore_index=True)


# News

In [7]:
# Load the cleaned news export that belongs to `filestamp` from the mounted Drive
news = pd.read_json(f'/content/drive/MyDrive/Thesis data/cleaned_data/{filestamp}_news.json')
# Show the number of non-null values per column
news.count()

ticker             15271
text               15271
publisher          15271
timestamp          15271
title              15271
url                15271
uuid               15271
partial_article    15271
prompt             15271
token_count        15271
valid_ticker       15271
open               15271
close_1            15271
close_2            15271
close_3            15271
close_4            15271
close_5            15271
result_1           15271
result_2           15271
result_3           15271
result_4           15271
result_5           15271
result_1_bin       15271
result_2_bin       15271
result_3_bin       15271
result_4_bin       15271
result_5_bin       15271
relevance          15271
dtype: int64

In [8]:
# Filter on knowledge cutoff date: keep only rows with a timestamp on or after the cut-off
news = knowledge_cut_off(news)
# Non-null count per column after filtering, to see how many rows were dropped
news.count()

ticker             15270
text               15270
publisher          15270
timestamp          15270
title              15270
url                15270
uuid               15270
partial_article    15270
prompt             15270
token_count        15270
valid_ticker       15270
open               15270
close_1            15270
close_2            15270
close_3            15270
close_4            15270
close_5            15270
result_1           15270
result_2           15270
result_3           15270
result_4           15270
result_5           15270
result_1_bin       15270
result_2_bin       15270
result_3_bin       15270
result_4_bin       15270
result_5_bin       15270
relevance          15270
dtype: int64

In [9]:
# Keep only the columns that are used in the rest of this notebook
news = news[['ticker','prompt','text','url','result_1','result_1_bin', 'relevance', 'token_count']]

In [10]:
# Filter on relevancy: keep only the rows whose relevance value equals the string "TRUE"
relevant_news = news[news.relevance=="TRUE"]
# The sum of result_1_bin is reported as the UP count and the remaining rows as DOWN
# (assumes result_1_bin is 1 for UP and 0 for DOWN — TODO confirm)
print(f'Relevant UP: {relevant_news.result_1_bin.sum()}, Relevant DOWN: {len(relevant_news)-relevant_news.result_1_bin.sum()}')
print(f'Random UP: {news.result_1_bin.sum()}, Random DOWN: {len(news)-news.result_1_bin.sum()}')

Relevant UP: 6138, Relevant DOWN: 5595
Random UP: 7956, Random DOWN: 7314


In [11]:
# Sort ascending on token_count. sampling_balanced_data only looks at the first
# 2*sample_size rows, so after this sort it draws from the rows with the lowest token counts.
relevant_news = relevant_news.sort_values('token_count').reset_index(drop=True)
news = news.sort_values('token_count').reset_index(drop=True)

In [12]:
# Both the relevant and the randomly selected datapoints are stored; the research focuses on the relevant datapoints.
# 4000 datapoints are used to get a balance between completeness and training requirements later on.
relevant_news_train, relevant_news_val, relevant_news_test = sampling_balanced_data(relevant_news, 200, 200, 4000)
news_train, news_val, news_test = sampling_balanced_data(news, 200, 200, 4000)


In [13]:
# Total token_count of the relevant and of the random news test set
print(relevant_news_test.token_count.sum())
print(news_test.token_count.sum())

150246
135536


In [None]:
# Write every news split to its destination file on the mounted Drive:
# relevant splits go to relevant_selection, random splits to random_selection.
for frame, destination in [
  (relevant_news_train, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_news_train.json'),
  (relevant_news_val, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_news_val.json'),
  (relevant_news_test, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_news_test.json'),
  (news_train, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_news_train.json'),
  (news_val, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_news_val.json'),
  (news_test, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_news_test.json'),
]:
  frame.to_json(destination)

# Twitter

In [14]:
# Load the cleaned twitter export that belongs to `filestamp` from the mounted Drive
twitter = pd.read_json(f'/content/drive/MyDrive/Thesis data/cleaned_data/{filestamp}_twitter.json')

In [15]:
# Filter on knowledge cutoff date: keep only rows with a timestamp on or after the cut-off
twitter = knowledge_cut_off(twitter)

In [16]:
# Number of non-null values per column after the cut-off filter
twitter.count()

timestamp       7918
text            7918
url             7918
ticker          7918
ticker_tweet    7918
valid_ticker    7918
prompt          7918
token_counts    7918
token_count     7918
open            7918
close_1         7918
close_2         7918
close_3         7918
close_4         7918
close_5         7918
result_1        7918
result_2        7918
result_3        7918
result_4        7918
result_5        7918
result_1_bin    7918
result_2_bin    7918
result_3_bin    7918
result_4_bin    7918
result_5_bin    7918
relevance       7918
dtype: int64

In [17]:
# Keep only the columns used downstream (the same selection as for the news data)
twitter = twitter[['ticker','prompt','text','url','result_1','result_1_bin', 'relevance', 'token_count']]

In [18]:
# Number of tweets per relevance value
twitter.groupby('relevance')['relevance'].count()

relevance
FALSE    4241
TRUE     3677
Name: relevance, dtype: int64

In [19]:
# Filter on relevancy: keep only the rows whose relevance value equals the string "TRUE"
relevant_twitter = twitter[twitter['relevance']=="TRUE"]
# The sum of result_1_bin is reported as the UP count and the remaining rows as DOWN
# (assumes result_1_bin is 1 for UP and 0 for DOWN — TODO confirm)
print(f'Relevant UP: {relevant_twitter.result_1_bin.sum()}, Relevant DOWN: {len(relevant_twitter)-relevant_twitter.result_1_bin.sum()}')
print(f'Random UP: {twitter.result_1_bin.sum()}, Random DOWN: {len(twitter)-twitter.result_1_bin.sum()}')

Relevant UP: 1827, Relevant DOWN: 1850
Random UP: 3941, Random DOWN: 3977


In [20]:
# Both the relevant and the randomly selected datapoints are stored; the research focuses on the relevant datapoints.
# Only 3600 samples are selected for the relevant datapoints, as just over 1800 samples are available for each of UP and DOWN.
relevant_twitter_train, relevant_twitter_val, relevant_twitter_test = sampling_balanced_data(relevant_twitter, 200, 200, 3600)
twitter_train, twitter_val, twitter_test = sampling_balanced_data(twitter, 200, 200, 4000)

In [21]:
# Total token_count of the relevant and of the random twitter test set
print(relevant_twitter_test.token_count.sum())
print(twitter_test.token_count.sum())

19736
21365


In [None]:
# Write every twitter split to its destination file on the mounted Drive:
# relevant splits go to relevant_selection, random splits to random_selection.
for frame, destination in [
  (relevant_twitter_train, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_twitter_train.json'),
  (relevant_twitter_val, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_twitter_val.json'),
  (relevant_twitter_test, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_twitter_test.json'),
  (twitter_train, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_twitter_train.json'),
  (twitter_val, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_twitter_val.json'),
  (twitter_test, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_twitter_test.json'),
]:
  frame.to_json(destination)
# Twitter selection ends here

# Mix


In [22]:
# Retrieve samples from twitter and news set to create a mixture of both (only for relevant datapoints)
# Each source contributes 1800 training, 100 validation and 100 test rows.
relevant_mix_twitter_train, relevant_mix_twitter_val, relevant_mix_twitter_test = (
  sample_balance_mixed_data(split, size)
  for split, size in (
    (relevant_twitter_train, 1800),
    (relevant_twitter_val, 100),
    (relevant_twitter_test, 100),
  )
)
relevant_mix_news_train, relevant_mix_news_val, relevant_mix_news_test = (
  sample_balance_mixed_data(split, size)
  for split, size in (
    (relevant_news_train, 1800),
    (relevant_news_val, 100),
    (relevant_news_test, 100),
  )
)




In [23]:
# Combine the selected samples: stack the twitter and news part of each split,
# shuffle the rows with a fixed seed and re-index from 0.
relevant_mix_train, relevant_mix_val, relevant_mix_test = (
  pd.concat([twitter_part, news_part]).sample(frac=1, random_state=42).reset_index(drop=True)
  for twitter_part, news_part in (
    (relevant_mix_twitter_train, relevant_mix_news_train),
    (relevant_mix_twitter_val, relevant_mix_news_val),
    (relevant_mix_twitter_test, relevant_mix_news_test),
  )
)

In [24]:
# Same mixture sampling for the random selection: each source contributes
# 1800 training, 100 validation and 100 test rows.
mix_twitter_train, mix_twitter_val, mix_twitter_test = (
  sample_balance_mixed_data(split, size)
  for split, size in (
    (twitter_train, 1800),
    (twitter_val, 100),
    (twitter_test, 100),
  )
)
mix_news_train, mix_news_val, mix_news_test = (
  sample_balance_mixed_data(split, size)
  for split, size in (
    (news_train, 1800),
    (news_val, 100),
    (news_test, 100),
  )
)

In [25]:
# Combine the selected samples: stack the twitter and news part of each split,
# shuffle the rows with a fixed seed and re-index from 0.
mix_train, mix_val, mix_test = (
  pd.concat([twitter_part, news_part]).sample(frac=1, random_state=42).reset_index(drop=True)
  for twitter_part, news_part in (
    (mix_twitter_train, mix_news_train),
    (mix_twitter_val, mix_news_val),
    (mix_twitter_test, mix_news_test),
  )
)

In [26]:
# Write every mixed split to its destination file on the mounted Drive:
# relevant splits go to relevant_selection, random splits to random_selection.
for frame, destination in [
  (relevant_mix_train, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_mix_train.json'),
  (relevant_mix_val, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_mix_val.json'),
  (relevant_mix_test, f'/content/drive/MyDrive/Thesis data/relevant_selection/{filestamp}_mix_test.json'),
  (mix_train, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_mix_train.json'),
  (mix_val, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_mix_val.json'),
  (mix_test, f'/content/drive/MyDrive/Thesis data/random_selection/{filestamp}_mix_test.json'),
]:
  frame.to_json(destination)