## Set-up

In [1]:
# --- Standard library ---
import re
import warnings
from pprint import pprint

# --- Third-party ---
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

# --- Notebook configuration ---
# Blanket-suppress warnings so library chatter stays out of the cell outputs.
# Installed once, after the imports, so it takes precedence over any filters the
# libraries register while being imported.
# NOTE(review): this also hides deprecation/runtime warnings; narrow it when debugging.
warnings.filterwarnings("ignore")

# Register tqdm's progress_* methods (e.g. progress_apply) on pandas objects.
tqdm.pandas()

# Start pandarallel with a fixed pool of 12 workers and a progress bar.
pandarallel.initialize(nb_workers=12, progress_bar=True)

# Display limits for rendered DataFrames: up to 100 rows, every column,
# and up to 500 characters per cell (sentences are long).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
import os

# Google Cloud Storage location that holds the project's NLP data files.
path_bucket = 'gs://msca-sp23-bucket/nlp_data'

# Local working directory for this run. It can be overridden with the
# NLP_RUNTIME_PATH environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original location is used.
runtime_path = os.environ.get('NLP_RUNTIME_PATH', '/home/anthony/projects/nlp_runtime')

# Input checkpoint read in the "Preparation" section:
# checkpoint_0525_full_article_split_sentences.parquet
bucket_read = path_bucket + '/' + 'checkpoint_0525_full_article_split_sentences.parquet'

# Switch into the runtime directory and echo it so the output records where the
# notebook was executed.
os.chdir(runtime_path)
print(os.getcwd())

/home/anthony/projects/nlp_runtime


In [3]:
import spacy
import en_core_web_md

2023-05-26 03:54:42.773345: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preparation

In [4]:
# Pull the sentence-level checkpoint from the bucket (pyarrow reader) and
# preview its first three rows.
df_sentence = pd.read_parquet(path=bucket_read, engine='pyarrow')
df_sentence.head(n=3)

Unnamed: 0,date,title,article_id,sentences
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"Decentralized Machine Learning Reaches Market Cap of 15,919.00 DML Enterprise Leader Daily Ratings News for Decentralized Machine Learning Complete the form below to receive the latest headlines and analysts' mendations for Decentralized Machine Learning with our free daily email newsletter: Follow EnterpriseLeade Recent Posts GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, ..."
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,3 Ways to Tell if Your Next Business Move will be a Mistake Foxconn Launches Investigation After Reports Of Harsh Working Conditions At Its Factory Flagstar Bancorp Acquires 52 Retails Branches Belonging To Wells Fargo
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"PREVIOUS 996.90 Million in Sales Expected for Monster Beverage Corp NASDAQ:MNST This Quarter NEXTAragon ANT Achieves Market Cap of 12.63 Million GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, VCD, NRG, IMG, MDF Files Smart or Risky?"


In [5]:
# Sanity check: (rows, columns) of the sentence-level frame loaded above.
df_sentence.shape

(2223016, 4)

## NER and Target Sentiment

In [7]:
# Materialise the `sentences` column as a plain Python list, in row order;
# this list is what gets streamed through nlp.pipe in the NER cell below.
sentences = df_sentence['sentences'].tolist()

In [20]:
from textblob import TextBlob

# Load the medium English spaCy pipeline with the "tagger" and "parser"
# components switched off -- presumably for speed, since the NER cell below only
# reads doc.ents.
# NOTE(review): depending on the installed spaCy version the pipeline may contain
# further components that are not needed for NER; check nlp.pipe_names -- TODO confirm.
nlp = spacy.load("en_core_web_md", disable=["tagger", "parser"])

In [22]:
%%time

# Entity types that are kept as sentiment "targets".
target_labels = ['ORG', 'GPE', 'LOC', 'PERSON']

# One inner list per input sentence, in the same order as `sentences`.
entities_sentiment = []

# Stream every sentence through spaCy in batches of 50 across 10 worker processes.
for doc in nlp.pipe(sentences, batch_size=50, n_process=10):
    # Build (entity text, entity label, polarity) for each kept entity.
    # NOTE(review): the polarity is TextBlob's score for the entity's own text,
    # not for the sentence around it, so most values come out as 0.0 -- confirm
    # this is the intended definition of "target sentiment".
    entities_sentiment.append([
        (span.text, span.label_, TextBlob(span.text).sentiment.polarity)
        for span in doc.ents
        if span.label_ in target_labels
    ])

CPU times: user 45min 5s, sys: 33.5 s, total: 45min 38s
Wall time: 46min 36s


In [23]:
# Sanity check: there should be exactly one entity list per input sentence,
# i.e. this should equal the row count of df_sentence.
len(entities_sentiment)

2223016

In [24]:
# Attach the per-sentence entity lists to the sentence dataframe as a new column.
# A plain Python list is assigned by position, so row i receives the entities
# extracted from sentences[i].
df_sentence['target_sentiment'] = entities_sentiment

In [33]:
# Eyeball the (text, label, polarity) tuples extracted for the first ten sentences.
entities_sentiment[:10]

[[('Decentralized Machine Learning Reaches Market Cap', 'ORG', 0.0),
  ('Decentralized Machine Learning', 'ORG', 0.0),
  ('Hong Kong', 'GPE', 0.0),
  ('NRG', 'ORG', 0.0),
  ('IMG', 'ORG', 0.0)],
 [('Mistake Foxconn Launches Investigation After Reports Of Harsh Working Conditions',
   'ORG',
   -0.2),
  ('Flagstar Bancorp', 'ORG', 0.0),
  ('Wells Fargo', 'ORG', 0.0)],
 [('Monster Beverage Corp NASDAQ', 'ORG', 0.0),
  ('GE', 'ORG', 0.0),
  ('IPO', 'ORG', 0.0),
  ('Hong Kong', 'GPE', 0.0),
  ('NRG', 'ORG', 0.0),
  ('IMG', 'ORG', 0.0)],
 [('Decentralized Machine Learning Daily Enter', 'ORG', 0.0),
  ('Decentralized Machine Learning', 'ORG', 0.0)],
 [('Bitcoin or', 'ORG', 0.0), ('Decentralized Machine Learning', 'ORG', 0.0)],
 [('Decentralized Machine Learning', 'ORG', 0.0),
  ('Bitcoin or Ethereum', 'ORG', 0.0),
  ('US', 'GPE', 0.0),
  ('Gemini', 'ORG', 0.0)],
 [('Decentralized Machine Learning', 'ORG', 0.0), ('US', 'GPE', 0.0)],
 [('IDEX', 'ORG', 0.0)],
 [],
 [('Decentralized Machine Lear

Merge with single-sentence sentiment

In [37]:
# Preview one row of the enriched frame.
# NOTE(review): the saved output already shows `sentiment` and `sentiment_score`,
# which are only added by the cells further down -- this cell was executed out of
# order and will display fewer columns on a fresh top-to-bottom run.
df_sentence.head(1)

Unnamed: 0,date,title,article_id,sentences,target_sentiment,sentiment,sentiment_score
0,2020-01-01,"Decentralized Machine Learning Reaches Market Cap of $15,919.00 (DML) - Enterprise Leader",1,"Decentralized Machine Learning Reaches Market Cap of 15,919.00 DML Enterprise Leader Daily Ratings News for Decentralized Machine Learning Complete the form below to receive the latest headlines and analysts' mendations for Decentralized Machine Learning with our free daily email newsletter: Follow EnterpriseLeade Recent Posts GE Announces Its Plan To Sell The Distributed Power Business To Advent International Ebang Communication Resorts To The Filing Of An IPO In Hong Kong How to Open DAA, ...","[(Decentralized Machine Learning Reaches Market Cap, ORG, 0.0), (Decentralized Machine Learning, ORG, 0.0), (Hong Kong, GPE, 0.0), (NRG, ORG, 0.0), (IMG, ORG, 0.0)]",neutral,0.999812


In [27]:
# Read the per-sentence sentiment file from the project bucket.
path_bucket = 'gs://msca-sp23-bucket/nlp_data'
sentiment_uri = path_bucket + '/' + 'sentence_sentiments.parquet'
df_sentence_sentiment = pd.read_parquet(sentiment_uri, engine='pyarrow')

In [28]:
# Sanity check: the sentiment frame should have one row per sentence, i.e. the
# same row count as df_sentence.
df_sentence_sentiment.shape

(2223016, 2)

In [31]:
# Attach the sentence-level sentiment label and score to the sentence dataframe.
#
# The two frames are matched by ROW POSITION, not by index label. df_sentence
# carries repeated index labels (its preview shows index 0 on several rows), and
# assigning a Series to a column makes pandas align on those labels: against a
# frame with a default 0..n-1 index, every row sharing a label would silently
# receive the same sentiment. Converting to NumPy arrays drops the index and
# assigns strictly in row order.
# NOTE(review): this assumes sentence_sentiments.parquet was written in the same
# row order as df_sentence -- confirm against the notebook that produced it.
if len(df_sentence_sentiment) != len(df_sentence):
    raise ValueError(
        "Row count mismatch: df_sentence has %d rows but df_sentence_sentiment has %d"
        % (len(df_sentence), len(df_sentence_sentiment))
    )

df_sentence['sentiment'] = df_sentence_sentiment['sentiments'].to_numpy()
df_sentence['sentiment_score'] = df_sentence_sentiment['score'].to_numpy()

In [43]:
# save a checkpoint
import pickle

# `target_sentiment` holds Python lists of (text, label, polarity) tuples.
# Each cell is serialised to bytes with pickle before the frame is written to
# parquet; readers must call pickle.loads on the column to get the lists back
# (only unpickle files from a trusted source).
#
# Cells that are already bytes are left untouched, so re-running this cell does
# not pickle the column a second time.
df_sentence['target_sentiment'] = df_sentence['target_sentiment'].apply(
    lambda cell: cell if isinstance(cell, bytes) else pickle.dumps(cell)
)

In [44]:
# Persist the enriched sentence frame to the bucket as the next checkpoint.
checkpoint_uri = path_bucket + '/' + 'checkpoint_0526_filtered_sentences_sentiment_ner.parquet'
df_sentence.to_parquet(checkpoint_uri, engine='pyarrow')

## Methodology: How to measure sentiments within an article.

Each article has been split into sentences with spaCy, and the sentences have been exploded into a dataframe, so what we now have is a dataframe of sentences rather than a dataframe of articles.

The primary idea is to aggregate sentence-level sentiment scores rather than compute a single overall sentiment for each article. The benefits of this method are:
1. It reduces the token count of each input; this is particularly helpful when using transformer and BERT-based models.
2. Aggregating sentence sentiments is more accurate because
    - full-article sentiment contains a lot of noise (e.g. unclean text such as other titles or ad text picked up during parsing)
    - a full article is computationally expensive to embed

## Reference
1. Sentiment Analysis in 10 Minutes with BERT and TensorFlow. https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671