https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt  
https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97


In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter
from multiprocessing import Pool
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import helper_data, helper_model




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Howard\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset_filename = {
    # '0': ("training.1600000.processed.noemoticon.csv", ["target", "ids", "date", "flag", "user", "text"]), # not financial sentiment, not used
    '0': ("gpt.csv", ["text,label"]),
    '1': ("stock_data.csv", ["text", "target"]),
    '2': ("nasdaq.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neural
    '3': ("djia_news copy.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neural
    '4': ("data-3.csv", ["Sentence", "Sentiment"]),
    '5': ("sentiment.csv", ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"]),
    '6': ('train_tweet.csv', ["id", "label", "tweet"])  # 0 positive, 1 negative
}

In [3]:
DATASET_ENCODING = "ISO-8859-1"

In [4]:
dataset_path = os.path.join("", "data", dataset_filename["0"][0])
df0 = pd.read_csv(dataset_path, names=dataset_filename["0"][1], skiprows=[0])
df0 = pd.DataFrame(df0)
df0[['text', 'label']] = df0['text,label'].str.rsplit(',', n=1, expand=True)
df0.drop(columns=['text,label'], inplace=True)
decode_map = {0: "NEGATIVE", 1: "POSITIVE"}
df0['target'] = df0['label'].apply(lambda x: decode_map[int(x)])
df0.drop(columns=['label'], inplace=True)
df0.rename(columns={df0.columns[1]: 'target'}, inplace=True)

In [5]:
# dataset_path = os.path.join("", "data", dataset_filename["0"][0])
# df0 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["0"][1])
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
# df0.target = df0.target.apply(lambda x: decode_map[int(x)])
# df0 = df0[['text', 'target']]

In [6]:
dataset_path = os.path.join("", "data", dataset_filename["1"][0])
df1 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING)
df1 = df1.iloc[:, :-3]
decode_map = {-1: "NEGATIVE", 1: "POSITIVE"}
df1['target'] = pd.to_numeric(df1['target'], errors='coerce')
df1['target'] = df1['target'].map(decode_map)
df_test = df1

In [7]:
target_counts = df_test['target'].value_counts()
count_df = pd.DataFrame({
    'label': target_counts.index,
    'count': target_counts.values
})
print(count_df)

      label  count
0  POSITIVE   3633
1  NEGATIVE   2023


In [8]:
# dataset_path = os.path.join("", "data", dataset_filename["2"][0])
# df2 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["2"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df2['target'] = df2['Label'].apply(lambda x: decode_map[int(x)])
# df2 = df2[['Headline', 'target']]
# df2.rename(columns={df2.columns[0]: 'text'}, inplace=True)

In [9]:
# dataset_path = os.path.join("", "data", dataset_filename["3"][0])
# df3 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["3"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df3['target'] = df3['Label'].apply(lambda x: decode_map[int(x)])
# df3 = df3[['Headline', 'target']]
# df3.rename(columns={df3.columns[0]: 'text'}, inplace=True)

In [10]:
dataset_path = os.path.join("", "data", dataset_filename["4"][0])
df4 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["4"][1], skiprows=1)
decode_map = {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"}
df4['target'] = df4['Sentiment'].apply(lambda x: decode_map[x])
df4.drop(columns=['Sentiment'], inplace=True)
df4.rename(columns={df4.columns[0]: 'text'}, inplace=True)

In [11]:
dataset_path = os.path.join("", "data", dataset_filename["5"][0])
df5 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["5"][1], skiprows=1)
decode_map = {"Negative": "NEGATIVE", "Positive": "POSITIVE"}
df5['target'] = df5['Sentiment'].apply(lambda x: decode_map[x])
df5 = df5[['Tweet Text', 'target']]
df5.rename(columns={df5.columns[0]: 'text'}, inplace=True)

In [12]:
# dataset_path = os.path.join("", "data", dataset_filename["6"][0])
# df6 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["6"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 1: "POSITIVE"}
# df6['target'] = df6['label'].apply(lambda x: decode_map[int(x)])
# df6 = df6[['tweet', 'target']]
# df6.rename(columns={df6.columns[0]: 'text'}, inplace=True)

In [13]:
total_rows = len(df0) + len(df4) + len(df5) 
print("Total number of rows:", total_rows)

Total number of rows: 8531


In [14]:
df = pd.concat([df0, df4, df5], ignore_index=True)
df.shape

(8531, 2)

In [15]:
df = helper_data.shuffle_dataframe(df)
df = df[df['target'] != "NEUTRAL"]
df.rename(columns={'target': 'label'}, inplace=True)
df_test.rename(columns={'target': 'label'}, inplace=True)

In [16]:
unique_elements_table = df['label'].value_counts().reset_index()
unique_elements_table.columns = ['Label', 'Count']
unique_elements_table

Unnamed: 0,Label,Count
0,POSITIVE,2713
1,NEGATIVE,2688


In [17]:
df = df[:10]

In [18]:
df[:-10]

Unnamed: 0,text,label


In [19]:
label_mapping = {'POSITIVE': 1, 'NEGATIVE': 0}
df['label'] = df['label'].map(label_mapping)
df_test = df_test.dropna(subset=['label'])
df_test['label'] = df_test['label'].map(label_mapping)

target_counts = df['label'].value_counts()
count_df = pd.DataFrame({
    'label': target_counts.index,
    'count': target_counts.values
})
print(count_df)

   label  count
0      1      5
1      0      5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['label'] = df_test['label'].map(label_mapping)


In [20]:
df

Unnamed: 0,text,label
5853,Finnish Rautaruukki has been awarded a contrac...,1
2625,Finnish software developer Basware Oyj said on...,1
6891,"`` After the share purchase is completed , fin...",1
2339,"Profit before taxes was EUR 5.4 mn , up from E...",1
8173,"What's going on with $GLBE?\r\nI mean, an over...",1
1018,Uncertainty is caused by setbacks impacting $I...,0
864,$INO is in a biotech bind. Inovio's vaccine de...,0
1431,The $GME stock saga underscores the need for i...,0
1048,Setbacks mount for $WKHS as its electric dream...,0
1657,Concerns about regulatory changes negatively a...,0


In [21]:
df_test

Unnamed: 0,text,label
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5706,Industry body CII said #discoms are likely to ...,0
5707,"#Gold prices slip below Rs 46,000 as #investor...",0
5708,Workers at Bajaj Auto have agreed to a 10% wag...,1
5709,"#Sharemarket LIVE: Sensex off dayâs high, up...",1


In [22]:
df.shape

(10, 2)

In [23]:
df = df.reset_index(drop=True)

train_dataset = Dataset.from_pandas(df)
test_dataset = Dataset.from_pandas(df_test)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset, 
    'test': test_dataset
})

In [24]:
splitted_datasets = dataset_dict["train"].train_test_split(test_size=0.2)
dataset_dict["train"] = splitted_datasets["train"]
dataset_dict["validation"] = splitted_datasets["test"]
dataset_dict = DatasetDict({
    'train': dataset_dict['train'],
    'validation': dataset_dict['validation'],
    'test': dataset_dict['test']
})

folder = "data/finetune_data"
if os.path.exists(folder):
    shutil.rmtree(folder)
dataset_dict.save_to_disk("data/finetune_data")
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5656
    })
})