https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt  
https://medium.com/nlplanet/bert-finetuning-with-hugging-face-and-training-visualizations-with-tensorboard-46368a57fc97


In [193]:
import os
import shutil
import numpy as np
import pandas as pd
from collections import Counter
from multiprocessing import Pool
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import helper_data, helper_model

In [194]:
dataset_filename = {
    # '0': ("training.1600000.processed.noemoticon.csv", ["target", "ids", "date", "flag", "user", "text"]), # not financial sentiment, not used
    '0': ("gpt.csv", ["text,label"]),
    '1': ("stock_data.csv", ["text", "target"]),
    '2': ("nasdaq.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neural
    '3': ("djia_news copy.csv", ["Label", "Ticker", "Headline"]), # 0 negative, 1 positive, 2 neural
    '4': ("data-3.csv", ["Sentence", "Sentiment"]),
    '5': ("sentiment.csv", ["Stock Ticker", "Tweet Text", "Sentiment", "Tweet URL"]),
    '6': ('train_tweet.csv', ["id", "label", "tweet"]),  # 0 positive, 1 negative
    '7': ('stock_tweets.csv', ["Date", "Tweet", "Stock Name", "Company Name"])  # 0 positive, 1 negative
}

In [195]:
DATASET_ENCODING = "ISO-8859-1"

In [196]:
dataset_path = os.path.join("", "data", dataset_filename["0"][0])
df0 = pd.read_csv(dataset_path, names=dataset_filename["0"][1], skiprows=[0])
df0 = pd.DataFrame(df0)
df0[['text', 'label']] = df0['text,label'].str.rsplit(',', n=1, expand=True)
df0.drop(columns=['text,label'], inplace=True)
decode_map = {0: "NEGATIVE", 1: "POSITIVE"}
df0['target'] = df0['label'].apply(lambda x: decode_map[int(x)])
df0.drop(columns=['label'], inplace=True)
df0.rename(columns={df0.columns[1]: 'target'}, inplace=True)

In [197]:
# dataset_path = os.path.join("", "data", dataset_filename["0"][0])
# df0 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["0"][1])
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
# df0.target = df0.target.apply(lambda x: decode_map[int(x)])
# df0 = df0[['text', 'target']]

In [198]:
dataset_path = os.path.join("", "data", dataset_filename["1"][0])
df1 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING)
df1 = df1.iloc[:, :-3]
decode_map = {-1: "NEGATIVE", 1: "POSITIVE"}
df1['target'] = pd.to_numeric(df1['target'], errors='coerce')
df1['target'] = df1['target'].map(decode_map)
df_test = df1

In [199]:
df_test

Unnamed: 0,text,target
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,POSITIVE
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,POSITIVE
2,user I'd be afraid to short AMZN - they are lo...,POSITIVE
3,MNTA Over 12.00,POSITIVE
4,OI Over 21.37,POSITIVE
...,...,...
5706,Industry body CII said #discoms are likely to ...,NEGATIVE
5707,"#Gold prices slip below Rs 46,000 as #investor...",NEGATIVE
5708,Workers at Bajaj Auto have agreed to a 10% wag...,POSITIVE
5709,"#Sharemarket LIVE: Sensex off dayâs high, up...",POSITIVE


In [200]:
# dataset_path = os.path.join("", "data", dataset_filename["2"][0])
# df2 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["2"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df2['target'] = df2['Label'].apply(lambda x: decode_map[int(x)])
# df2 = df2[['Headline', 'target']]
# df2.rename(columns={df2.columns[0]: 'text'}, inplace=True)

In [201]:
# dataset_path = os.path.join("", "data", dataset_filename["3"][0])
# df3 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["3"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 1: "POSITIVE"}
# df3['target'] = df3['Label'].apply(lambda x: decode_map[int(x)])
# df3 = df3[['Headline', 'target']]
# df3.rename(columns={df3.columns[0]: 'text'}, inplace=True)

In [202]:
dataset_path = os.path.join("", "data", dataset_filename["4"][0])
df4 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["4"][1], skiprows=1)
decode_map = {"negative": "NEGATIVE", "neutral": "NEUTRAL", "positive": "POSITIVE"}
df4['target'] = df4['Sentiment'].apply(lambda x: decode_map[x])
df4.drop(columns=['Sentiment'], inplace=True)
df4.rename(columns={df4.columns[0]: 'text'}, inplace=True)

In [203]:
dataset_path = os.path.join("", "data", dataset_filename["5"][0])
df5 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["5"][1], skiprows=1)
decode_map = {"Negative": "NEGATIVE", "Positive": "POSITIVE"}
df5['target'] = df5['Sentiment'].apply(lambda x: decode_map[x])
df5 = df5[['Tweet Text', 'target']]
df5.rename(columns={df5.columns[0]: 'text'}, inplace=True)

In [204]:
# dataset_path = os.path.join("", "data", dataset_filename["6"][0])
# df6 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["6"][1], skiprows=1)
# decode_map = {0: "NEGATIVE", 1: "POSITIVE"}
# df6['target'] = df6['label'].apply(lambda x: decode_map[int(x)])
# df6 = df6[['tweet', 'target']]
# df6.rename(columns={df6.columns[0]: 'text'}, inplace=True)

In [205]:
dataset_path = os.path.join("", "data", dataset_filename["7"][0])
df7 = pd.read_csv(dataset_path, encoding=DATASET_ENCODING, names=dataset_filename["7"][1], skiprows=1)
df7 = df7[['Tweet']]
df7.rename(columns={df7.columns[0]: 'text'}, inplace=True)
df_test_unlabeled = df7

In [206]:
df7

Unnamed: 0,text
0,Mainstream media has done an amazing job at br...
1,Tesla delivery estimates are at around 364k fr...
2,3/ Even if I include 63.0M unvested RSUs as of...
3,@RealDanODowd @WholeMarsBlog @Tesla Hahaha why...
4,"@RealDanODowd @Tesla Stop trying to kill kids,..."
...,...
80788,Some of the fastest growing tech stocks on the...
80789,"With earnings on the horizon, here is a quick ..."
80790,Our record delivery results are a testimony of...
80791,"We delivered 10,412 Smart EVs in Sep 2021, rea..."


In [207]:
# target_counts = df_test['target'].value_counts()
# count_df = pd.DataFrame({
#     'label': target_counts.index,
#     'count': target_counts.values
# })
# print(count_df)

In [208]:
total_rows = len(df0) + len(df4) + len(df5) 
print("Total number of rows:", total_rows)

Total number of rows: 8631


In [209]:
df = pd.concat([df0, df4, df5], ignore_index=True)
df.shape

(8631, 2)

In [210]:
df = helper_data.shuffle_dataframe(df)
df = df[df['target'] != "NEUTRAL"]
df.rename(columns={'target': 'label'}, inplace=True)
df_test.rename(columns={'target': 'label'}, inplace=True)
df_test_unlabeled.rename(columns={'target': 'label'}, inplace=True)

In [211]:
unique_elements_table = df['label'].value_counts().reset_index()
unique_elements_table.columns = ['Label', 'Count']
unique_elements_table

Unnamed: 0,Label,Count
0,NEGATIVE,2788
1,POSITIVE,2713


In [212]:
df = df[:10]
df_test = df_test[:8]
df_test_unlabeled = df_test_unlabeled[:10]

In [213]:
df[:-10]

Unnamed: 0,text,label


In [214]:
label_mapping = {'POSITIVE': 1, 'NEGATIVE': 0}
df = df.dropna(subset=['label'])
df['label'] = df['label'].map(label_mapping)
# df_test = df_test.dropna(subset=['label'])
df_test['label'] = df_test['label'].map(label_mapping)

target_counts = df['label'].value_counts()
count_df = pd.DataFrame({
    'label': target_counts.index,
    'count': target_counts.values
})
print(count_df)

   label  count
0      1      7
1      0      3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['label'] = df_test['label'].map(label_mapping)


In [215]:
df

Unnamed: 0,text,label
4703,Sales of security and system packaging increas...,1
8225,Good Morrrning Retail!!! Todays Plays: $BBIG $...,1
4608,Both the net sales and operating profit were r...,1
8404,Now glancing at $SE which was once called the ...,1
5592,NaturalGas Settles At 3-year Low $DBO $BNO ht...,0
1311,$BIIB's stock is affected by clinical trial se...,0
5343,Finnish construction company YIT Corporation i...,1
3233,18 May 2010 - Finnish electronics producer Elc...,1
275,$AAPL's CEO just made a major announcement. Wh...,1
5816,RT @joemccann the correleation between the dol...,0


In [216]:
df_test

Unnamed: 0,text,label
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
5,PGNX Over 3.04,1
6,AAP - user if so then the current downtrend wi...,0
7,Monday's relative weakness. NYX WIN TIE TAP IC...,0


In [217]:
df.shape

(10, 2)

In [218]:
df = df.reset_index(drop=True)

train_dataset = Dataset.from_pandas(df)
test_dataset = Dataset.from_pandas(df_test)
test_dataset_unlabled = Dataset.from_pandas(df_test_unlabeled)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset, 
    'test': test_dataset,
    'test_unlabeled': test_dataset_unlabled
})

In [219]:
splitted_datasets = dataset_dict["train"].train_test_split(test_size=0.2)
dataset_dict["train"] = splitted_datasets["train"]
dataset_dict["validation"] = splitted_datasets["test"]
dataset_dict = DatasetDict({
    'train': dataset_dict['train'],
    'validation': dataset_dict['validation'],
    'test': dataset_dict['test'],
    'test_unlabeled': dataset_dict['test_unlabeled']
})

folder = "data/finetune_data"
if os.path.exists(folder):
    shutil.rmtree(folder)
dataset_dict.save_to_disk("data/finetune_data")
dataset_dict

Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<00:00, 1088.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 272.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8/8 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 1244.34 examples/s]




DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8
    })
    test_unlabeled: Dataset({
        features: ['text'],
        num_rows: 10
    })
})