## Set-up

In [1]:
#!pip install pandarallel

In [2]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"
# !pip install gcsfs

## Sentiment analysis (Colab)

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from pprint import pprint
import re
from tqdm import tqdm
tqdm.pandas()
import numpy as np

#from pandarallel import pandarallel
#pandarallel.initialize(nb_workers=8, progress_bar=True)

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

import warnings
warnings.filterwarnings("ignore")

In [4]:
#bucket_read = 'https://storage.googleapis.com/msca-sp23-bucket/nlp_data/checkpoint_0525_full_article_split_sentences.parquet'
#df_sentences_exp = pd.read_parquet(bucket_read, engine='pyarrow')

In [2]:
# colab mount
import os
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive/')
path_gdrive = '/content/drive/MyDrive/Colab Datasets'
os.chdir(path_gdrive)
print(os.getcwd())

Mounted at /content/drive/
/content/drive/MyDrive/Colab Datasets


In [6]:
# # save parquet to GDrive
#df_sentences_exp.to_parquet('checkpoint_0525_full_article_split_sentences.parquet', engine='pyarrow')
#df_sentences_exp = pd.read_parquet('checkpoint_0525_full_article_split_sentences.parquet', engine='pyarrow')

In [7]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU acceleration is available.")
    print("Device:", torch.cuda.get_device_name())
else:
    device = torch.device("cpu")
    print("GPU acceleration is not available. Running on CPU.")

GPU acceleration is available.
Device: Tesla T4


In [None]:
!pip install transformers

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [10]:
#!pip install Xformers

In [11]:
df_sentences_exp.shape

(2223016, 4)

### Predicting sentiments, with RAM optimization

In [12]:
df_sentences_exp.head(3)

Unnamed: 0,date,title,article_id,sentences
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"Decentralized Machine Learning Reaches Market Cap of 15,919.00 DML Enterprise Leader Daily Ratings News for Decentralized Machine Learning Complete the form below to receive the latest headlines and analysts' mendations for Decentralized Machine Learning with our free daily email newsletter: Follow EnterpriseLeade Recent Posts GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, ..."
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,3 Ways to Tell if Your Next Business Move will be a Mistake Foxconn Launches Investigation After Reports Of Harsh Working Conditions At Its Factory Flagstar Bancorp Acquires 52 Retails Branches Belonging To Wells Fargo
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"PREVIOUS 996.90 Million in Sales Expected for Monster Beverage Corp NASDAQ:MNST This Quarter NEXTAragon ANT Achieves Market Cap of 12.63 Million GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, VCD, NRG, IMG, MDF Files Smart or Risky?"


In [13]:
from torch.utils.data import Dataset

class FinancialNewsDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [14]:
from torch.utils.data import DataLoader

def create_data_loader(sentences, tokenizer, max_length, batch_size):
    ds = FinancialNewsDataset(
        sentences=sentences,
        tokenizer=tokenizer,
        max_length=max_length
    )
    
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )


In [15]:
# Specify the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the device
model = model.to(device)

## Sentence sentiment (no target)

In [16]:
%%time
from torch.nn import Softmax

# Extract the sentences
sentences = df_sentences_exp['sentences'].tolist()

max_length = 128
batch_size = 16

# Create a DataLoader
data_loader = create_data_loader(sentences, tokenizer, max_length, batch_size)

# Create a softmax object
softmax = Softmax(dim=1)

# Loop over the DataLoader
sentiments = []
for batch in data_loader:
    # Move data to the device
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        # Apply softmax to convert logits to probabilities
        probs = softmax(logits)
        # Find the max probability
        max_probs, predictions = torch.max(probs, dim=1)
        # Convert tensors to numpy arrays
        predictions = predictions.to('cpu').numpy()
        max_probs = max_probs.to('cpu').numpy()
        # Append tuple of (prediction, max_prob) to sentiments
        sentiments.extend(list(zip(predictions, max_probs)))

CPU times: user 2h 5min 37s, sys: 53.9 s, total: 2h 6min 31s
Wall time: 2h 6min 51s


In [17]:
%%time
# Define class names
class_names = ['negative', 'neutral', 'positive']

# Convert class indices to class names
sentiments = [(class_names[prediction], score) for prediction, score in sentiments]

CPU times: user 316 ms, sys: 71 ms, total: 387 ms
Wall time: 386 ms


In [18]:
len(sentiments)

2223016

In [19]:
# load sentiments back to df
df_sentences_exp['sentiments'] = sentiments

In [20]:
df_sentences_exp.head(3)

Unnamed: 0,date,title,article_id,sentences,sentiments
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"Decentralized Machine Learning Reaches Market Cap of 15,919.00 DML Enterprise Leader Daily Ratings News for Decentralized Machine Learning Complete the form below to receive the latest headlines and analysts' mendations for Decentralized Machine Learning with our free daily email newsletter: Follow EnterpriseLeade Recent Posts GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, ...","(neutral, 0.99981207)"
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,3 Ways to Tell if Your Next Business Move will be a Mistake Foxconn Launches Investigation After Reports Of Harsh Working Conditions At Its Factory Flagstar Bancorp Acquires 52 Retails Branches Belonging To Wells Fargo,"(negative, 0.80320925)"
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"PREVIOUS 996.90 Million in Sales Expected for Monster Beverage Corp NASDAQ:MNST This Quarter NEXTAragon ANT Achieves Market Cap of 12.63 Million GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, VCD, NRG, IMG, MDF Files Smart or Risky?","(neutral, 0.99829644)"


In [24]:
import pandas as pd

# Convert it to a pandas DataFrame
df_tmp = pd.DataFrame(sentiments, columns=['sentiments', 'score'])

# Save the DataFrame to a Parquet file
df_tmp.to_parquet('sentiments.parquet', engine='pyarrow')


In [25]:
sentiments_read = pd.read_parquet('sentiments.parquet', engine='pyarrow')

In [26]:
sentiments_read.shape

(2223016, 2)

In [28]:
sentiments_read.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2223016 entries, 0 to 2223015
Data columns (total 2 columns):
 #   Column      Dtype  
---  ------      -----  
 0   sentiments  object 
 1   score       float32
dtypes: float32(1), object(1)
memory usage: 25.4+ MB


In [31]:
sentiments_read['score'] = sentiments_read['score'].astype('float64')

In [30]:
#df_sentences_exp = df_sentences_exp.drop(columns=['sentiments'])

In [32]:
df_sentences_exp['sentiments'] = sentiments_read['sentiments']
df_sentences_exp['score'] = sentiments_read['score']

In [33]:
#df_sentences_exp['sentiments'] = df_sentences_exp['sentiments'].astype('float64')

# Then save to parquet
#df_sentences_exp.to_parquet('checkpoint_0525_full_article_split_sentences_sentiments.parquet', engine='pyarrow')

In [37]:
df_sentences_exp = pd.read_parquet('checkpoint_0525_full_article_split_sentences_sentiments.parquet', engine='pyarrow')
df_sentences_exp.head()

Unnamed: 0,date,title,article_id,sentences,sentiments,score
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"Decentralized Machine Learning Reaches Market Cap of 15,919.00 DML Enterprise Leader Daily Ratings News for Decentralized Machine Learning Complete the form below to receive the latest headlines and analysts' mendations for Decentralized Machine Learning with our free daily email newsletter: Follow EnterpriseLeade Recent Posts GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, ...",neutral,0.999812
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,3 Ways to Tell if Your Next Business Move will be a Mistake Foxconn Launches Investigation After Reports Of Harsh Working Conditions At Its Factory Flagstar Bancorp Acquires 52 Retails Branches Belonging To Wells Fargo,neutral,0.999812
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"PREVIOUS 996.90 Million in Sales Expected for Monster Beverage Corp NASDAQ:MNST This Quarter NEXTAragon ANT Achieves Market Cap of 12.63 Million GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, VCD, NRG, IMG, MDF Files Smart or Risky?",neutral,0.999812
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,Receive News Updates for Decentralized Machine Learning Daily Enter your email address below to receive a concise daily summary of the latest news and updates for Decentralized Machine Learning and related cryptocurrencies with 's FREE CryptoBeat newsletter.,neutral,0.999812
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,Investors can then use their newly acquired Bitcoin or Ethereum to buy Decentralized Machine Learning using one of the exchanges listed above.,neutral,0.999812


# Reference
1. Sentiment Analysis in 10 Minutes with BERT and TensorFlow. https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671