https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt  
https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97


In [64]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter
from multiprocessing import Pool
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import helper_data, helper_model

In [65]:
# Registry of raw CSV sources.
# key -> (file name inside the data/ folder, column names handed to pd.read_csv).
dataset_filename = {
    # General-purpose tweet corpus (an earlier note flagged it as not finance-specific).
    "0": (
        "training.1600000.processed.noemoticon.csv",
        ["target", "ids", "date", "flag", "user", "text"],
    ),
    "1": (
        "stock_data.csv",
        ["text", "target"],
    ),
    # Label encoding: 0 negative, 1 positive, 2 neutral
    "2": (
        "nasdaq.csv",
        ["Label", "Ticker", "Headline"],
    ),
    # Label encoding: 0 negative, 1 positive, 2 neutral
    "3": (
        "djia_news copy.csv",
        ["Label", "Ticker", "Headline"],
    ),
    "4": (
        "data-3.csv",
        ["Sentence", "Sentiment"],
    ),
    "5": (
        "sentiment.csv",
        ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"],
    ),
    # Label encoding: 0 positive, 1 negative
    "6": (
        "train_tweet.csv",
        ["id", "label", "tweet"],
    ),
}

In [66]:
DATASET_ENCODING = "ISO-8859-1"


def load_sentiment_frame(key, text_col, label_col, decode_map, cast=None, skiprows=1):
    """Read one raw CSV from ``data/`` and normalise it to ``text`` / ``target`` columns.

    Parameters
    ----------
    key : str
        Key into ``dataset_filename`` selecting the file name and its column names.
    text_col : str
        Raw column that holds the text.
    label_col : str
        Raw column that holds the sentiment label.
    decode_map : dict
        Maps every raw label to "NEGATIVE", "NEUTRAL" or "POSITIVE".
    cast : callable, optional
        Applied to each raw label before the ``decode_map`` lookup (e.g. ``int``).
    skiprows : int or None
        Forwarded to ``pd.read_csv``. Column names are always supplied explicitly,
        so 1 drops the file's first row (presumably a header row - confirm per file).

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly two columns, ``text`` and ``target``, sharing the
        row index produced by ``read_csv``.

    Raises
    ------
    KeyError
        If a raw label is not present in ``decode_map`` (fail fast rather than
        silently producing missing targets).
    """
    filename, column_names = dataset_filename[key]
    csv_path = os.path.join("", "data", filename)
    raw = pd.read_csv(csv_path, encoding=DATASET_ENCODING, names=column_names, skiprows=skiprows)

    def decode(raw_label):
        lookup = raw_label if cast is None else cast(raw_label)
        return decode_map[lookup]

    # Building a fresh frame (instead of slicing + in-place rename) avoids
    # SettingWithCopyWarning and gives every dataset the same column layout.
    return pd.DataFrame({
        "text": raw[text_col],
        "target": raw[label_col].apply(decode),
    })


# Shared label encoding for the two headline datasets.
HEADLINE_LABELS = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}

# This file is read without skipping any row.
df0 = load_sentiment_frame(
    "0", "text", "target",
    {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"},
    cast=int, skiprows=None,
)

# Held out as the test dataset; df_test intentionally refers to the same object as df1.
df1 = load_sentiment_frame("1", "text", "target", {-1: "NEGATIVE", 1: "POSITIVE"}, cast=int)
df_test = df1

df2 = load_sentiment_frame("2", "Headline", "Label", HEADLINE_LABELS, cast=int)
df3 = load_sentiment_frame("3", "Headline", "Label", HEADLINE_LABELS, cast=int)

df4 = load_sentiment_frame(
    "4", "Sentence", "Sentiment",
    {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"},
)
df5 = load_sentiment_frame(
    "5", "Tweet Text", "Sentiment",
    {"Negative": "NEGATIVE", "Positive": "POSITIVE"},
)

# Note the inverted encoding in this file: 0 is positive, 1 is negative.
df6 = load_sentiment_frame("6", "tweet", "label", {0: "POSITIVE", 1: "NEGATIVE"}, cast=int)

In [67]:
# Expected size of the merged training pool: every source except df1,
# which is held out as the test set.
total_rows = sum(map(len, (df0, df2, df3, df4, df5, df6)))
print("Total number of rows:", total_rows)

Total number of rows: 1653866


In [68]:
# Stack all training sources into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [df0, df2, df3, df4, df5, df6],
    ignore_index=True,
)
df.shape

(53866, 2)

In [69]:
# Shuffle the merged rows (project helper; presumably a row permutation - confirm in helper_data).
df = helper_data.shuffle_dataframe(df)

# Drop NEUTRAL rows and rename in one non-in-place chain. Renaming a filtered
# slice in place raises SettingWithCopyWarning, and the warning would fire again
# when the label column is reassigned later; rename() returns an independent frame.
df = df.loc[df['target'] != "NEUTRAL"].rename(columns={'target': 'label'})

# df_test is the same object as df1, so this rename stays in place to keep both
# names pointing at one consistently-labelled frame.
df_test.rename(columns={'target': 'label'}, inplace=True)

In [70]:
# Positional slice: every row except the last 10 (the notebook truncates the display).
# NOTE(review): if a 10-row preview was intended, df.head(10) / df.tail(10) is the usual form - confirm.
df.iloc[:-10]

Unnamed: 0,text,label
39105,#cuttack #odisha clinches in for an odi agains...,POSITIVE
48983,#breakfast #holiday #pougal @ the irish rover,POSITIVE
45438,@user @user @user @user and the #teapay #bihe...,NEGATIVE
31631,"after reading about the @user tapings, i am as...",POSITIVE
7733,Zimbabwe: 'Zim Needs an Independent Central Bank',NEGATIVE
...,...,...
51840,"says it all @user you, @user be . #create #co...",POSITIVE
44227,@user @user @user @user feeling #worried.,POSITIVE
2639,Henan huatai corn germ oil press machine/corn ...,POSITIVE
39193,remember itÃ°ÂÂÂ #lost #empire #dreams #su...,POSITIVE


In [71]:
label_mapping = {'POSITIVE': 0, 'NEGATIVE': 1}


def encode_labels(labels):
    """Convert a Series of sentiment names to the integer ids in ``label_mapping``.

    Safe to run more than once: a Series that already contains only the integer
    ids is returned unchanged, whereas a plain ``.map`` would turn it into NaN.

    Raises
    ------
    ValueError
        If any value is not a key of ``label_mapping``; a bare ``.map`` would
        silently produce NaN labels instead.
    """
    if labels.isin(list(label_mapping.values())).all():
        return labels  # already encoded (cell was re-run)
    encoded = labels.map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(labels[unmapped].astype(str)))
        raise ValueError(f"Unmapped sentiment labels: {unknown}")
    return encoded


df['label'] = encode_labels(df['label'])
df_test['label'] = encode_labels(df_test['label'])

# Class balance of the training pool as a two-column table (label, count).
target_counts = df['label'].value_counts()
count_df = target_counts.rename_axis('label').reset_index(name='count')
print(count_df)

   label  count
0      0  37135
1      1  13270


In [72]:
# Sanity check: (rows, columns) of the training pool after the preceding filtering and label encoding.
df.shape

(50405, 2)

In [73]:
# Filtering left gaps in the row index; renumber it and discard the old values
# before handing the frame to `datasets`.
df = df.reset_index(drop=True)

# Wrap both pandas frames as Hugging Face datasets.
train_dataset, test_dataset = (Dataset.from_pandas(frame) for frame in (df, df_test))

# Bundle the two datasets under their split names.
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [74]:
# Fixed seed so the train/validation partition is identical on every run.
SPLIT_SEED = 42

# Split from the original train_dataset (not dataset_dict["train"]) so that
# re-running this cell does not split an already-split training set again.
splitted_datasets = train_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# 70% train / 30% validation from the training pool; the test split is untouched.
dataset_dict = DatasetDict({
    'train': splitted_datasets['train'],
    'validation': splitted_datasets['test'],
    'test': test_dataset,
})

# Replace any previous export so stale shards never mix with the new ones.
folder = "data/finetune_data"
if os.path.exists(folder):
    shutil.rmtree(folder)
dataset_dict.save_to_disk(folder)
dataset_dict

Saving the dataset (0/1 shards):  26%|██▌       | 9000/35283 [00:00<00:00, 73528.67 examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 35283/35283 [00:00<00:00, 98740.77 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 15122/15122 [00:00<00:00, 143797.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5791/5791 [00:00<00:00, 573548.71 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 35283
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 15122
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5791
    })
})