https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt  
https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97


In [43]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter
from multiprocessing import Pool
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import helper_data, helper_model

In [44]:
# Registry of the raw CSV files stored under data/.
# Each key maps to a (filename, column names) tuple; the loading cells below
# index into it with dataset_filename[key][0] / dataset_filename[key][1].
dataset_filename = {
    # '0': ("training.1600000.processed.noemoticon.csv", ["target", "ids", "date", "flag", "user", "text"]), # not financial sentiment, not used
    '0': ("gpt.csv", ["text,label"]),  # loaded under one combined column name, split into text/label afterwards
    '1': ("stock_data.csv", ["text", "target"]),
    '2': ("nasdaq.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neutral
    '3': ("djia_news copy.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neutral
    '4': ("data-3.csv", ["Sentence", "Sentiment"]),
    '5': ("sentiment.csv", ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"]),
    '6': ('train_tweet.csv', ["id", "label", "tweet"])  # 0 positive, 1 negative
}

In [45]:
# Text encoding passed to pd.read_csv by the stock_data, data-3 and sentiment loaders.
# NOTE(review): the df_test preview in this notebook shows mojibake (a garbled
# apostrophe in one headline), which suggests at least stock_data.csv is actually
# UTF-8 rather than ISO-8859-1 — confirm against the raw files.
DATASET_ENCODING = "ISO-8859-1"

In [46]:
# Load gpt.csv into df0 with the columns ['text', 'target'].
# The file is read under a single combined "text,label" column name and then
# split on the LAST comma, so commas that occur inside the text are preserved.
dataset_path = os.path.join("", "data", dataset_filename["0"][0])
df0_raw = pd.read_csv(dataset_path, names=dataset_filename["0"][1], skiprows=[0])
text_and_label = df0_raw['text,label'].str.rsplit(',', n=1, expand=True)

# Numeric label -> sentiment name; an unexpected label raises (KeyError/ValueError)
# instead of being silently dropped.
decode_map = {0: "NEGATIVE", 1: "POSITIVE"}

# Build the final frame in one step instead of a chain of in-place drops and a
# rename that renamed 'target' to itself.
df0 = pd.DataFrame({
    'text': text_and_label[0],
    'target': text_and_label[1].apply(lambda x: decode_map[int(x)]),
})

In [47]:
# dataset_path = os.path.join("", "data", dataset_filename["0"][0])
# df0 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["0"][1])
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
# df0.target = df0.target.apply(lambda x: decode_map[int(x)])
# df0 = df0[['text', 'target']]

In [48]:
# Load stock_data.csv; the resulting frame is exposed as df_test and is used
# further down as the 'test' split.
dataset_path = os.path.join("", "data", dataset_filename["1"][0])
df1 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING)

# Drop the last three columns (presumably unused trailing columns — TODO confirm).
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
df1 = df1.iloc[:, :-3].copy()

# Numeric label -> sentiment name. Entries that are not numeric are coerced to
# NaN, and any value missing from decode_map also ends up as NaN.
decode_map = {-1: "NEGATIVE", 1: "POSITIVE"}
df1['target'] = pd.to_numeric(df1['target'], errors='coerce').map(decode_map)

# df_test is an alias of df1 (same object), not a copy.
df_test = df1

In [49]:
# Preview the frame that is later used as the 'test' split.
df_test

Unnamed: 0,text,target
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,POSITIVE
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,POSITIVE
2,user I'd be afraid to short AMZN - they are lo...,POSITIVE
3,MNTA Over 12.00,POSITIVE
4,OI Over 21.37,POSITIVE
...,...,...
5706,Industry body CII said #discoms are likely to ...,NEGATIVE
5707,"#Gold prices slip below Rs 46,000 as #investor...",NEGATIVE
5708,Workers at Bajaj Auto have agreed to a 10% wag...,POSITIVE
5709,"#Sharemarket LIVE: Sensex off dayâs high, up...",POSITIVE


In [50]:
# dataset_path = os.path.join("", "data", dataset_filename["2"][0])
# df2 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["2"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df2['target'] = df2['Label'].apply(lambda x: decode_map[int(x)])
# df2 = df2[['Headline', 'target']]
# df2.rename(columns={df2.columns[0]: 'text'}, inplace=True)

In [51]:
# dataset_path = os.path.join("", "data", dataset_filename["3"][0])
# df3 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["3"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df3['target'] = df3['Label'].apply(lambda x: decode_map[int(x)])
# df3 = df3[['Headline', 'target']]
# df3.rename(columns={df3.columns[0]: 'text'}, inplace=True)

In [52]:
# Load data-3.csv and normalise it to the shared ['text', 'target'] layout.
dataset_path = os.path.join("", "data", dataset_filename["4"][0])
df4 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["4"][1], skiprows=1)

# Lower-case sentiment strings -> upper-case class names (unknown values raise KeyError).
decode_map = {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"}

# df4.columns[0] is evaluated on the freshly loaded frame, i.e. the sentence column.
df4 = (
    df4
    .assign(target=df4['Sentiment'].apply(lambda x: decode_map[x]))
    .drop(columns=['Sentiment'])
    .rename(columns={df4.columns[0]: 'text'})
)

In [53]:
# Load sentiment.csv and normalise it to the shared ['text', 'target'] layout.
dataset_path = os.path.join("", "data", dataset_filename["5"][0])
df5 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["5"][1], skiprows=1)

# Capitalised sentiment strings -> upper-case class names (unknown values raise KeyError).
decode_map = {"Negative": "NEGATIVE", "Positive": "POSITIVE"}
df5['target'] = df5['Sentiment'].apply(lambda x: decode_map[x])

# Select and rename in one chained expression; the previous in-place rename on
# a column-subset slice can trigger SettingWithCopyWarning.
df5 = df5[['Tweet Text', 'target']].rename(columns={'Tweet Text': 'text'})

In [54]:
# dataset_path = os.path.join("", "data", dataset_filename["6"][0])
# df6 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["6"][1], skiprows=1)
# decode_map = {0: "POSITIVE", 1: "NEGATIVE"}
# df6['target'] = df6['label'].apply(lambda x: decode_map[int(x)])
# df6 = df6[['tweet', 'target']]
# df6.rename(columns={df6.columns[0]: 'text'}, inplace=True)

In [55]:
# Sanity check: combined row count of the three training sources before they are concatenated.
total_rows = sum(len(frame) for frame in (df0, df4, df5))
print("Total number of rows:", total_rows)

Total number of rows: 7111


In [56]:
# Stack the three training sources into one frame; ignore_index=True discards
# the per-source indices and renumbers the rows.
df = pd.concat([df0, df4, df5], ignore_index=True)
df.shape

(7111, 2)

In [57]:
# Shuffle via the project helper (its seeding is not visible from this notebook),
# keep only the binary classes, and switch to the 'label' column name.
df = helper_data.shuffle_dataframe(df)

# Filter and rename in one chained expression: rename(inplace=True) on a
# boolean-filtered slice can trigger SettingWithCopyWarning.
df = df[df['target'] != "NEUTRAL"].rename(columns={'target': 'label'})

# Non-inplace rename so the frame df_test was aliased from is not mutated as a side effect.
df_test = df_test.rename(columns={'target': 'label'})

In [58]:
# Class balance of the training frame as a two-column table (Label, Count).
unique_elements_table = (
    df['label']
    .value_counts()
    .rename_axis('Label')
    .reset_index(name='Count')
)
unique_elements_table

Unnamed: 0,Label,Count
0,POSITIVE,2713
1,NEGATIVE,1268


In [59]:
# NOTE(review): this slice selects every row EXCEPT the last 10. If a short
# preview was intended, df.head(10) or df.tail(10) is probably what was meant — confirm.
df[:-10]

Unnamed: 0,text,label
73,Selling some of my consumer goods stocks. Dive...,POSITIVE
2353,"Finnish Sampo Bank , of Danish Danske Bank gro...",POSITIVE
5415,Britain's FTSE forges ahead as Shire surges,POSITIVE
671,$AMZN's earnings disappointment is a blow. Unc...,NEGATIVE
290,Selling $AMZN shares. Taking profits and reass...,POSITIVE
...,...,...
458,Exited my $AMD position with a profit. Timing ...,POSITIVE
6326,The Nokian tyre proves its high safety excelle...,POSITIVE
5600,"EXCLUSIVE-BP, China's CNPC to unveil oil allia...",POSITIVE
4075,"Profit per share was EUR 1.03 , up from EUR 0....",POSITIVE


In [60]:
# Encode the class names as integers in both the training and the test frame.
# Values not present in label_mapping become NaN.
label_mapping = {'POSITIVE': 0, 'NEGATIVE': 1}
for frame in (df, df_test):
    frame['label'] = frame['label'].map(label_mapping)

# Per-class row counts of the training frame after encoding.
target_counts = df['label'].value_counts()
count_df = target_counts.rename_axis('label').reset_index(name='count')
print(count_df)

   label  count
0      0   2713
1      1   1268


In [61]:
# Shape of the training frame after the NEUTRAL rows were filtered out.
df.shape

(3981, 2)

In [62]:
# Drop the shuffled index so it is not carried over into the Dataset.
df = df.reset_index(drop=True)

# Convert both pandas frames to Hugging Face datasets.
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (df, df_test))

# Bundle them; the validation split is added in a later step.
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [63]:
# Carve a validation split (20%) out of the training data and persist all three splits.
# A fixed seed makes the train/validation partition reproducible across runs.
# NOTE(review): re-running this cell without re-running the previous one splits
# the already reduced 'train' split again.
SPLIT_SEED = 42
splitted_datasets = dataset_dict["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
dataset_dict = DatasetDict({
    'train': splitted_datasets['train'],
    'validation': splitted_datasets['test'],
    'test': dataset_dict['test'],
})

# Replace any previously saved copy so stale files from an earlier run do not linger.
folder = "data/finetune_data"
if os.path.exists(folder):
    shutil.rmtree(folder)
dataset_dict.save_to_disk(folder)
dataset_dict

Saving the dataset (1/1 shards): 100%|██████████| 3184/3184 [00:00<00:00, 123671.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 797/797 [00:00<00:00, 79760.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5711/5711 [00:00<00:00, 951599.80 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3184
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 797
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5711
    })
})