https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt  
https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97


In [219]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter
from multiprocessing import Pool
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import helper_data, helper_model

1: https://www.kaggle.com/code/taufiquesekh/stock-sentiment-analysis

In [220]:
# Registry of the raw CSV files: key -> (file name, column names for pd.read_csv).
# Key "0" used to point at "training.1600000.processed.noemoticon.csv"
# (columns: target, ids, date, flag, user, text); that file is not financial
# sentiment, so it is not used.
dataset_filename = {
    # One combined column; the loader cell splits it on the last comma.
    "0": ("gpt.csv", ["text,label"]),
    "1": ("stock_data.csv", ["text", "target"]),
    # Label: 0 negative, 1 positive, 2 neutral
    "2": ("nasdaq.csv", ["Label", "Ticker", "Headline"]),
    # Label: 0 negative, 1 positive, 2 neutral
    "3": ("djia_news copy.csv", ["Label", "Ticker", "Headline"]),
    "4": ("data-3.csv", ["Sentence", "Sentiment"]),
    "5": ("sentiment.csv", ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"]),
    # Original note: 0 positive, 1 negative.
    # NOTE(review): the commented-out loader for this file maps 0 -> NEGATIVE and
    # 1 -> POSITIVE; confirm which reading is correct before enabling it.
    "6": ("train_tweet.csv", ["id", "label", "tweet"]),
    # Original note: 0 positive, 1 negative.
    # NOTE(review): no label column is listed for this file, so the note looks
    # like a copy-paste leftover; confirm.
    "7": ("stock_tweets.csv", ["Date", "Tweet", "Stock Name", "Company Name"]),
    "8": ("tweets_labelled.csv", ["id", "created_at", "text", "sentiment"]),
}

In [221]:
# Text encoding passed to pd.read_csv (encoding=...) by the CSV loader cells.
DATASET_ENCODING = "ISO-8859-1"

In [222]:
# Dataset "0" (gpt.csv): every row is read as one "text,label" string, which is
# split on its LAST comma so commas inside the text are left alone.
dataset_path = os.path.join("", "data", dataset_filename["0"][0])
decode_map = {0: "NEGATIVE", 1: "POSITIVE"}

gpt_raw = pd.read_csv(dataset_path, names=dataset_filename["0"][1], skiprows=[0])
gpt_parts = gpt_raw["text,label"].str.rsplit(",", n=1, expand=True)
gpt_parts.columns = ["text", "label"]

# Translate the numeric label into the shared NEGATIVE/POSITIVE vocabulary;
# an unexpected label value raises instead of being silently dropped.
df0 = (
    gpt_parts
    .assign(target=gpt_parts["label"].map(lambda raw_label: decode_map[int(raw_label)]))
    .drop(columns=["label"])
)

In [223]:
# dataset_path = os.path.join("", "data", dataset_filename["0"][0])
# df0 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["0"][1])
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
# df0.target = df0.target.apply(lambda x: decode_map[int(x)])
# df0 = df0[['text', 'target']]

In [224]:
# Dataset "1" (stock_data.csv): sentiment is encoded as -1 / 1.
dataset_path = os.path.join("", "data", dataset_filename["1"][0])
decode_map = {-1: "NEGATIVE", 1: "POSITIVE"}

# The last three columns of the file are discarded.
# NOTE(review): their contents are not visible from this notebook — confirm they are safe to drop.
stock_data_raw = pd.read_csv(dataset_path, encoding=DATASET_ENCODING).iloc[:, :-3]

# Values that cannot be parsed as numbers become NaN, and so do numbers
# missing from decode_map.
numeric_target = pd.to_numeric(stock_data_raw["target"], errors="coerce")
df1 = stock_data_raw.assign(target=numeric_target.map(decode_map))

In [225]:
# dataset_path = os.path.join("", "data", dataset_filename["2"][0])
# df2 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["2"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df2['target'] = df2['Label'].apply(lambda x: decode_map[int(x)])
# df2 = df2[['Headline', 'target']]
# df2.rename(columns={df2.columns[0]: 'text'}, inplace=True)

In [226]:
# dataset_path = os.path.join("", "data", dataset_filename["3"][0])
# df3 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["3"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df3['target'] = df3['Label'].apply(lambda x: decode_map[int(x)])
# df3 = df3[['Headline', 'target']]
# df3.rename(columns={df3.columns[0]: 'text'}, inplace=True)

In [227]:
# Dataset "4" (data-3.csv): lower-case sentiment words -> shared upper-case labels.
dataset_path = os.path.join("", "data", dataset_filename["4"][0])
decode_map = {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"}

sentences_raw = pd.read_csv(
    dataset_path,
    encoding=DATASET_ENCODING,
    names=dataset_filename["4"][1],
    skiprows=1,  # skip the file's own header row; column names come from the registry
)

# Unknown sentiment values raise a KeyError rather than turning into NaN.
df4 = (
    sentences_raw
    .assign(target=sentences_raw["Sentiment"].map(lambda sentiment: decode_map[sentiment]))
    .drop(columns=["Sentiment"])
)
# The first remaining column holds the sentence; expose it under the common name.
df4 = df4.rename(columns={df4.columns[0]: "text"})

In [228]:
# Dataset "5" (sentiment.csv): capitalised sentiment words -> shared upper-case labels.
dataset_path = os.path.join("", "data", dataset_filename["5"][0])
decode_map = {"Negative": "NEGATIVE", "Positive": "POSITIVE"}

tweets_raw = pd.read_csv(
    dataset_path,
    encoding=DATASET_ENCODING,
    names=dataset_filename["5"][1],
    skiprows=1,  # skip the file's own header row; column names come from the registry
)

# Keep only the tweet text and its decoded label; an unknown sentiment value
# raises a KeyError rather than turning into NaN.
df5 = (
    tweets_raw
    .assign(target=tweets_raw["Sentiment"].map(lambda sentiment: decode_map[sentiment]))
    .loc[:, ["Tweet Text", "target"]]
    .rename(columns={"Tweet Text": "text"})
)

In [229]:
# dataset_path = os.path.join("", "data", dataset_filename["6"][0])
# df6 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["6"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 1: "POSITIVE"}
# df6['target'] = df6['label'].apply(lambda x: decode_map[int(x)])
# df6 = df6[['tweet', 'target']]
# df6.rename(columns={df6.columns[0]: 'text'}, inplace=True)

In [230]:
# Dataset "7" (stock_tweets.csv): only the tweet text is kept, so this frame
# has no label column and serves as the unlabeled test set.
dataset_path = os.path.join("", "data", dataset_filename["7"][0])

stock_tweets_raw = pd.read_csv(
    dataset_path,
    encoding=DATASET_ENCODING,
    names=dataset_filename["7"][1],
    skiprows=1,  # skip the file's own header row; column names come from the registry
)

df7 = stock_tweets_raw.loc[:, ["Tweet"]].rename(columns={"Tweet": "text"})
# Same object under a more descriptive name.
df_test_unlabeled = df7

In [231]:
# Dataset "8" (tweets_labelled.csv): labelled tweets used as the held-out test set.
#
# This file is read as UTF-8. The encoding lives in its own constant instead of
# overwriting DATASET_ENCODING: rebinding the shared constant here would make
# every other loader cell silently switch to UTF-8 if it were re-run after
# this cell.
TWEETS_LABELLED_ENCODING = "utf-8"

dataset_path = os.path.join("", "data/unorganized", dataset_filename["8"][0])
decode_map = {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"}

df8 = pd.read_csv(
    dataset_path,
    encoding=TWEETS_LABELLED_ENCODING,
    names=dataset_filename["8"][1],
    skiprows=1,  # skip the file's own header row; column names come from the registry
    index_col=None,
)
# Sentiment values missing from decode_map (including NaN) become NaN targets.
df8["target"] = df8["sentiment"].map(decode_map)
# df8 keeps every row that had a sentiment value, reduced to text + target.
df8 = df8.dropna(subset=["sentiment"])[["text", "target"]]
# The test set additionally drops rows whose sentiment could not be decoded.
df_test = df8.dropna(subset=["target"])

In [232]:
# Preview the labelled test frame (columns: text, target).
df_test

Unnamed: 0,text,target
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",POSITIVE
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,NEGATIVE
2,Net issuance increases to fund fiscal programs...,POSITIVE
3,RT @bentboolean: How much of Amazon's traffic ...,POSITIVE
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,POSITIVE
...,...,...
1295,#stocks back from the recovery room: https://t...,POSITIVE
1296,RT @MacroCharts: Breadth – expanding last week...,POSITIVE
1297,RT @MawsonResource: Rompas-Rajapalot: A Big Ne...,NEUTRAL
1298,$AAPL $QQQ Top may now be in. https://t.co/iNK...,POSITIVE


In [233]:
# Class balance of the labelled test set, as a two-column (label, count) table.
target_counts = df_test["target"].value_counts()
count_df = target_counts.rename_axis("label").reset_index(name="count")
print(count_df)

      label  count
0  POSITIVE    528
1   NEUTRAL    424
2  NEGATIVE    348


In [236]:
# Row count across the four training sources, to compare with the concatenated frame.
training_sources = (df0, df1, df4, df5)
total_rows = sum(len(source) for source in training_sources)
print("Total number of rows:", total_rows)

Total number of rows: 14342


In [237]:
# Stack the four training sources into one frame with a fresh 0..n-1 index.
training_sources = [df0, df1, df4, df5]
df = pd.concat(training_sources, axis=0, ignore_index=True)
df.shape

(14342, 2)

In [238]:
# Training frame: shuffle, keep only non-NEUTRAL rows, and rename the target
# column to "label".
shuffled = helper_data.shuffle_dataframe(df)
df = shuffled.loc[shuffled["target"] != "NEUTRAL"].rename(columns={"target": "label"})

# The labelled test frame keeps its NEUTRAL rows; only the column is renamed.
df_test = df_test.rename(columns={"target": "label"})

# rename() ignores columns that are absent, so this leaves a frame without a
# "target" column unchanged.
df_test_unlabeled = df_test_unlabeled.rename(columns={"target": "label"})

In [239]:
# Class balance of the training frame as a (Label, Count) table.
unique_elements_table = (
    df["label"]
    .value_counts()
    .rename_axis("Label")
    .reset_index(name="Count")
)
unique_elements_table

Unnamed: 0,Label,Count
0,POSITIVE,6346
1,NEGATIVE,4811


In [240]:
# df = df[:100]
# df_test = df_test[:8]
# df_test_unlabeled = df_test_unlabeled[:10]

In [241]:
# Training frame: binary scheme. Rows without a label are dropped before mapping.
label_mapping = {"POSITIVE": 1, "NEGATIVE": 0}
df = (
    df
    .dropna(subset=["label"])
    .assign(label=lambda frame: frame["label"].map(label_mapping))
)

# Test frame: three-class scheme.
# NOTE(review): POSITIVE maps to 1 for training but to 2 here (NEUTRAL takes 1);
# confirm that downstream evaluation accounts for the two different encodings.
label_mapping = {"POSITIVE": 2, "NEUTRAL": 1, "NEGATIVE": 0}
df_test = (
    df_test
    .dropna(subset=["label"])
    .assign(label=lambda frame: frame["label"].map(label_mapping))
)

In [242]:
# Preview the training frame (columns: text, label).
df

Unnamed: 0,text,label
6272,JPM bounced off it's fib support level today too.,0
644,$BTC's price is plummeting. Market sentiment i...,0
4307,Prudential liquidates CM stake... 1st ones off...,0
575,Eagerly anticipating $AAPL's product launch ev...,1
6634,".user clearly, if (but not until) the 20MA cra...",0
...,...,...
14288,#Premarket Top % Gainers:\r\n\r\n$IDRA $BRDS $...,1
12,$SPY is too volatile right now. Staying out of...,0
1688,$DIS's stock experiences volatility as the com...,0
2181,Regulatory scrutiny affects $SQ's stock price ...,0


In [243]:
# Preview the labelled test frame (columns: text, label).
df_test

Unnamed: 0,text,label
0,"RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Mon...",2
1,#SriLanka surcharge on fuel removed!\n⛽📉\nThe ...,0
2,Net issuance increases to fund fiscal programs...,2
3,RT @bentboolean: How much of Amazon's traffic ...,2
4,$AMD Ryzen 4000 desktop CPUs looking ‘great’ a...,2
...,...,...
1295,#stocks back from the recovery room: https://t...,2
1296,RT @MacroCharts: Breadth – expanding last week...,2
1297,RT @MawsonResource: Rompas-Rajapalot: A Big Ne...,1
1298,$AAPL $QQQ Top may now be in. https://t.co/iNK...,2


In [244]:
# Class balance of the training frame, as a two-column (label, count) table.
target_counts = df["label"].value_counts()
count_df = target_counts.rename_axis("label").reset_index(name="count")
print(count_df)

   label  count
0      1   6346
1      0   4811


In [245]:
# Class balance of the labelled test frame as a (Label, Count) table.
unique_elements_table = (
    df_test["label"]
    .value_counts()
    .rename_axis("Label")
    .reset_index(name="Count")
)
unique_elements_table

Unnamed: 0,Label,Count
0,2,528
1,1,424
2,0,348


In [246]:
# Preview the unlabeled tweets frame.
df_test_unlabeled

Unnamed: 0,text
0,Mainstream media has done an amazing job at br...
1,Tesla delivery estimates are at around 364k fr...
2,3/ Even if I include 63.0M unvested RSUs as of...
3,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
4,"@RealDanODowd @Tesla Stop trying to kill kids,..."
...,...
80788,Some of the fastest growing tech stocks on the...
80789,"With earnings on the horizon, here is a quick ..."
80790,Our record delivery results are a testimony of...
80791,"We delivered 10,412 Smart EVs in Sep 2021, rea..."


In [247]:
# Give both labelled frames a fresh 0..n-1 index before conversion.
# NOTE(review): presumably this keeps Dataset.from_pandas from carrying the old
# (shuffled / filtered) index along as an extra column — confirm.
df, df_test = (frame.reset_index(drop=True) for frame in (df, df_test))

# One Hugging Face Dataset per frame.
train_dataset = Dataset.from_pandas(df)
test_dataset = Dataset.from_pandas(df_test)
test_dataset_unlabled = Dataset.from_pandas(df_test_unlabeled)

# Bundle the three datasets under their split names.
dataset_dict = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
        "test_unlabeled": test_dataset_unlabled,
    }
)

In [248]:
# Carve a validation split out of the training data and persist all splits.
#
# The split is taken from `train_dataset` (the full, un-split training set)
# rather than from `dataset_dict["train"]`: this cell rebinds `dataset_dict`,
# so splitting from it would shrink the training set again every time the cell
# is re-run.
# A fixed seed makes the split itself repeatable for a given row order.
# NOTE(review): end-to-end reproducibility also depends on
# helper_data.shuffle_dataframe being deterministic, which is not visible here.
SPLIT_SEED = 42
splitted_datasets = train_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Built once, in the order the splits should be listed.
dataset_dict = DatasetDict({
    'train': splitted_datasets['train'],
    'validation': splitted_datasets['test'],  # train_test_split names its held-out part "test"
    'test': test_dataset,
    'test_unlabeled': test_dataset_unlabled,
})

# Remove any previous export first, then write to the same folder.
folder = "data/finetune_data"
if os.path.exists(folder):
    shutil.rmtree(folder)
dataset_dict.save_to_disk(folder)
dataset_dict

Saving the dataset (1/1 shards): 100%|██████████| 8925/8925 [00:00<00:00, 170701.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2232/2232 [00:00<00:00, 135157.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1300/1300 [00:00<00:00, 144267.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 80793/80793 [00:00<00:00, 3087938.79 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8925
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2232
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1300
    })
    test_unlabeled: Dataset({
        features: ['text'],
        num_rows: 80793
    })
})