# Simplified Notebook for SentimentArcs

Created:

* 28 Oct 2022
* Jon Chun

Simplified version of SentimentArcs Notebooks:

* https://github.com/jon-chun/sentimentarcs_notebooks

* https://arxiv.org/pdf/2110.09454.pdf

# Review VM Specs

In [None]:
# Make sure your Linux VM is connected to a GPU

!nvidia-smi

In [None]:
# Node GPU Count/Type

!nvidia-smi -L

In [None]:
# Memory

!free -h --si | awk  '/Mem:/{print $2}'

In [None]:
# GPU log information

# !nvidia-smi -q

In [None]:
# Check how many CPU cores are available for parallelization

!cat /proc/cpuinfo

In [None]:
!lscpu

# Setup

## Install Libraries

In [None]:
!pip install transformers[sentencepiece]

# !pip install transformers

In [None]:
# May require [RESET RUNTIME]

# !pip install modin[all]

## Import Libraries

In [None]:
from google.colab import files

In [None]:
import numpy as np
import pandas as pd
# import modin.pandas as pd_modin
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import re
import os

from tqdm import tqdm
import tqdm.notebook as tq
# for i in tq.tqdm(...):

## Configure Settings

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Global Variables & Functions

## Global Variables

In [None]:
TEXT_ENCODING = 'utf-8'

In [None]:
# Main DataFrame for Novel Sentiments
# Instantiate an empty DataFrame: the bare `pd.DataFrame` (no parentheses)
# would bind the class itself rather than an empty table.

sentiment_df = pd.DataFrame()


## Common Functions

In [None]:
def verify_novel(novel_str, index_ends=500):
  '''
  Print a short summary of a novel so each processing stage can be eyeballed.

  INPUT:
    novel_str: string in some stage of processing. A list of segments is
      also sliceable, in which case the length and the slices count
      segments rather than characters.
    index_ends: number of leading/trailing items to display
  OUTPUT: None; displays the name, the length, and index_ends items of
    header/footer for verification

  The novel name is read from the notebook-level `novel_name_str`. It is
  looked up defensively so verification still works when that name has not
  been assigned yet.
  '''

  # Fall back to a placeholder instead of raising NameError
  name_str = globals().get('novel_name_str', '<unknown>')

  # novel_str[-0:] would return everything, so handle a zero-length tail explicitly
  tail = novel_str[-index_ends:] if index_ends else novel_str[:0]

  print(f'Novel Name: {name_str}')
  print(f'  Char Len: {len(novel_str)}')
  print('====================================\n')
  print(f'Beginning:\n\n {novel_str[:index_ends]}\n\n')
  print('\n------------------------------------')
  print(f'Ending:\n\n {tail}\n\n')

In [None]:
def save_text2txt_and_download(text_obj, file_suffix='_save.txt'):
  '''
  Write a text object to a timestamped text file and download a copy.

  INPUT:
    text_obj: a string, or a non-empty list of strings (joined with newlines)
    file_suffix: suffix to add to the output text filename
  OUTPUT: Write text object to text file (both temp VM and download).
    Returns -1 (after printing an error) when text_obj is neither a string
    nor a non-empty list of strings; otherwise returns None.

  Relies on notebook-level names: novel_name_str (output basename),
  TEXT_ENCODING (file encoding) and files (google.colab download helper).
  '''

  if isinstance(text_obj, str):
    print('STEP 1. Processing String Object\n')
    str_obj = text_obj
  elif isinstance(text_obj, list):
    if not text_obj:
      print('ERROR: Object is an empty List [save_text2txt_and_download()]')
      return -1
    # Check every element, not just the first, so join() cannot fail midway
    if not all(isinstance(item, str) for item in text_obj):
      print('ERROR: Object is not an List of Strings [save_text2txt_and_download()]')
      return -1
    print('STEP 1. Processing List of Strings Object\n')
    str_obj = "\n".join(text_obj)
  else:
    print('ERROR: Object Type is neither String nor List [save_text2txt_and_download()]')
    return -1

  datetime_str = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  out_filename = novel_name_str.split('.')[0] + '_' + datetime_str + file_suffix

  # Write file to temporary VM filesystem
  # Explicit encoding so the output does not depend on the platform default
  print(f'STEP 2. Saving textfile to temporary VM file: {out_filename}\n')
  with open(out_filename, "w", encoding=TEXT_ENCODING) as fp:
    fp.write(str_obj)

  # Download permanent copy of file
  print(f'STEP 3. Downloading permanent copy of textfile: {out_filename}\n')
  files.download(out_filename)

In [None]:
def save_df2csv_and_download(df_obj, file_suffix='_save.csv', nodate=True):
  '''
  Persist a DataFrame as a csv file on the temp VM, then download a copy.

  INPUT:
    df_obj: DataFrame object to save
    file_suffix: suffix to add to the output csv filename
    nodate: when True the filename is basename + suffix; when False a
      YYYYmmdd-HHMMSS timestamp is inserted between the two
  OUTPUT: Write DataFrame object to csv file (both temp VM and download).
    Returns -1 (after printing an error) if df_obj is not a DataFrame.
  '''

  # Guard clause: reject anything that is not a DataFrame up front
  if not isinstance(df_obj, pd.DataFrame):
    print('ERROR: Object is not a DataFrame [save_df2csv_and_download()]')
    return -1

  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
  base_name = novel_name_str.split('.')[0]
  out_filename = (base_name + file_suffix) if nodate else (base_name + '_' + timestamp + file_suffix)

  # Row index is not written to the csv
  print(f'STEP 1. Saving DataFrame to temporary VM file: {out_filename}\n')
  df_obj.to_csv(out_filename, index=False)

  # Download permanent copy of file
  print(f'STEP 2. Downloading permanent copy of csvfile: {out_filename}\n')
  files.download(out_filename)


# Test

# save_df2csv_and_download(temp_df, '_bert-nlptown.txt')

# Get Clean Text

## Option (a): Clean Text

### Upload Raw Text File

Get plain text of familiar novel at:
* https://gutenberg.net.au/ (AUS)
* https://gutenberg.org/ (US)

In [None]:
%%time

# NOTE: 1m07s

# Upload Plain Text File
novel_name_str = ''
uploaded = files.upload()

# NOTE: Allows for multiple file uploads, will only process the last
#       Left in for future feature addition (processing multiple files at once)
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  novel_name_str = fn

# Extract from Dict and decode binary into char string
novel_raw_str = uploaded[novel_name_str].decode(TEXT_ENCODING)

In [None]:
# Verify

verify_novel(novel_raw_str)

### Clean Text

In [None]:
!pip install clean-text

In [None]:
!pip install unidecode  # clean-text dependency

In [None]:
from cleantext import clean

In [None]:
novel_clean_str = clean(novel_raw_str,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    no_urls=False,                  # replace all URLs with a special token
    no_emails=False,                # replace all email addresses with a special token
    no_phone_numbers=False,         # replace all phone numbers with a special token
    no_numbers=False,               # replace all numbers with a special token
    no_digits=False,                # replace all digits with a special token
    no_currency_symbols=False,      # replace all currency symbols with a special token
    no_punct=False,                 # remove punctuations
    # replace_with_punct="",          # instead of removing punctuations you may replace them
    # replace_with_url="<URL>",
    # replace_with_email="<EMAIL>",
    # replace_with_phone_number="<PHONE>",
    # replace_with_number="<NUMBER>",
    # replace_with_digit="0",
    # replace_with_currency_symbol="<CUR>",
    lang="en"                       # set to 'de' for German special handling
)

# Replace all new lines/returns with single whitespace
novel_clean_str = novel_clean_str.replace('\n\r', ' ')
novel_clean_str = novel_clean_str.replace('\n', ' ')
novel_clean_str = novel_clean_str.replace('\r', ' ')
novel_clean_str = ' '.join(novel_clean_str.split())
novel_clean_str 

In [None]:
# Verify

verify_novel(novel_clean_str, index_ends=500)

### [CAUTION] Trim Header & Footer

**CAUTION:** This requires manually adjusting the patterns that identify the boundaries between the header and the novel (header_end_re) and between the novel and the footer (footer_start_re).

It is usually faster and more efficient to manually download, trim header/footer and upload a clean plain text file than use this procedure.

In [None]:
# Markers used to trim the header and footer
# NOTE: despite the *_re names, trim_header_footer() passes these through
#       re.escape(), so they are matched as literal text (case-insensitively)
#       rather than as regular expressions.

# Marker for End of Header
header_end_re = r'*** START OF THE PROJECT GUTENBERG EBOOK THE IDIOT ***'
# header_end_re = r'Towards the end of November, during a thaw'

# Marker for Start of Footer
footer_start_re = r'*** END OF THE PROJECT GUTENBERG EBOOK THE IDIOT ***'
# footer_start_re = r'as she took leave of Evgenie Pavlovitch'

In [None]:
# Function to extract the novel body from between the header and footer markers
def trim_header_footer(text_str, aheader_end_re, afooter_start_re):
  '''
  INPUT: Given a long text string consisting of: [Header] + [Novel] + [Footer]
         aheader_end_re: marker text that ends the header
         afooter_start_re: marker text that begins the footer
  OUTPUT: Return just the [Novel] text string
  RAISES: ValueError if either marker cannot be found

  Both markers are escaped before searching, so they are matched as literal
  text (case-insensitively), not as regular expressions. The footer marker is
  searched for only after the end of the header marker.
  '''

  # Discards the metadata from the beginning of the book
  header_match = re.search(re.escape(aheader_end_re), text_str, re.IGNORECASE)
  if header_match is None:
    raise ValueError(f'Header marker not found: {aheader_end_re!r} [trim_header_footer()]')
  header_end_index = header_match.end()

  # Discards the metadata from the end of the book
  # Start at the end of the header so an earlier occurrence cannot match
  footer_pattern = re.compile(re.escape(afooter_start_re), re.IGNORECASE)
  footer_match = footer_pattern.search(text_str, header_end_index)
  if footer_match is None:
    raise ValueError(f'Footer marker not found: {afooter_start_re!r} [trim_header_footer()]')
  footer_start_index = footer_match.start()

  # Keeps the relevant text
  novel_trim_str = text_str[header_end_index:footer_start_index]

  return novel_trim_str

In [None]:
# Trim Header and Footer

novel_trim_str = trim_header_footer(novel_raw_str, header_end_re, footer_start_re)
print(f'    Length (Raw): {len(novel_raw_str)}')
print(f'Length (Trimmed): {len(novel_trim_str)}')

In [None]:
# Verify

verify_novel(novel_trim_str, 500)

### Segment Text

In [None]:
!pip install pysbd  # Python Sentence Boundry Detection

In [None]:
import pysbd

In [None]:
# FIX: novel_trim_str is normally assigned within the 'Trim Header & Footer' Section

# Ensure we have a trimmed version of the novel in novel_trim_str.
# If the trim section was skipped the name does not exist at all, so treat
# a NameError the same way as an empty string.

try:
  trim_is_missing = len(novel_trim_str) == 0
except NameError:
  trim_is_missing = True

if trim_is_missing:
  # No trimmed text available: fall back to the untrimmed novel
  novel_trim_str = novel_raw_str

In [None]:
%%time

# NOTE: 1m05s

# Split Novel into Segments (~Sentences)
seg = pysbd.Segmenter(language="en", clean=False)
novel_segments_ls = seg.segment(novel_trim_str)

In [None]:
# Trim any leading/trailing whitespace on all Sentences

novel_clean_ls = [x.strip() for x in novel_segments_ls]

In [None]:
# Verify

verify_novel(novel_clean_ls, 10)

In [None]:
# Save to file and download copy

save_text2txt_and_download(novel_clean_ls, '_segments.txt')

In [None]:
sentiment_df

In [None]:
# Populate novel sentiment_df with sentence number and clean segmented strings

sentence_no_ls = list(range(len(novel_clean_ls)))
sentence_no_ls[-1]

sentiment_df = pd.DataFrame({'line_no':sentence_no_ls, 'line':novel_clean_ls})
sentiment_df.head()

## Option (b): Read Clean Text from File

In [None]:
!ls

In [None]:
sentiment_df.head()

In [None]:
!head -n 10 TheIdiot_FyodorDostoyevsky_GutenbergOrg_20221028-031127__vader.csv

In [None]:
saved_vader_csv = 'TheIdiot_FyodorDostoyevsky_GutenbergOrg_vader.csv'

# save_df2csv_and_download() writes with index=False, so the first csv column
# is 'line_no'. Keep it as a regular column (no index_col): later cells select
# sentiment_df[['line_no', 'line']].
sentiment_df = pd.read_csv(saved_vader_csv)
sentiment_df.drop(columns=['vader'], inplace=True)

novel_clean_ls = sentiment_df.line.to_list()

sentiment_df.head()

# Compute Sentiment

## Option (1): Symbolic: Lexicons

### VADER

In [None]:
!pip install vaderSentiment

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sid_obj = SentimentIntensityAnalyzer()

In [None]:
sentiment_vader_ls = [sid_obj.polarity_scores(asentence)['compound'] for asentence in novel_clean_ls]

In [None]:
# Create new VADER DataFrame to save results

vader_df = sentiment_df[['line_no', 'line']].copy(deep=True)
# Assign the list positionally; wrapping it in pd.Series would align on index
# labels and leave NaN wherever sentiment_df's index is not 0..n-1
vader_df['vader'] = sentiment_vader_ls
vader_df.head()

In [None]:
win_per = 0.1
win_size = int(win_per * sentiment_df.shape[0])

_ = vader_df['vader'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save VADER Model Sentiment Time Series

save_df2csv_and_download(vader_df, '_vader.csv', nodate=True)

### SyuzhetR (4)

* SyuzhetR: https://cran.r-project.org/web/packages/readtext/vignettes/readtext_vignette.html
* http://rstudio-pubs-static.s3.amazonaws.com/283881_efbb666d653a4eb3b0c5e5672e3446c6.html

* SentimentR: https://github.com/trinker/sentimentr

* JupyterLab w/Py OR R: https://www.youtube.com/watch?v=Q35WIqZoUF4

In [None]:
%load_ext rpy2.ipython

In [None]:
# Load Python libraries to exchange data with R Program Space and read R Datafiles

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

In [None]:
%R getwd()

In [None]:
%R list.files()

In [None]:
%%time 
%%capture 
%%R

# Install Syuzhet.R, Sentiment.R and Utility Libraries

# NOTE: 56s 17:30EST on 27Oct2022 - Colab Pro

install.packages(c('syuzhet', 'sentimentr', 'tidyverse', 'lexicon'))

library(syuzhet)
library(sentimentr)
library(tidyverse)
library(lexicon)

In [None]:
%R sessionInfo()

In [None]:
sentiment_df.head()

In [None]:
%%time

# Compute Sentiments from all 4 Syuzhet Models

# NOTE:  3m57s 17:40EST on 27Oct2022 Colab Pro (The Idiot)
#        3m55s 18:02EST on 27Oct2022 Colab Pro (The Idiot)
#        4m10s 23:14EST on 27Oct2022 Colab Pro (The Idiot)

syuzhet = importr('syuzhet')

# Create new SyuzhetR DataFrame to save results
syuzhet_df = sentiment_df[['line_no', 'line']].copy(deep=True)

print('[1/4] Processing syuzhetr_syuzhet')
syuzhet_df['syuzhetr_syuzhet'] = syuzhet.get_sentiment(syuzhet_df['line'].to_list(), method='syuzhet')
print('[2/4] Processing syuzhetr_bing')
syuzhet_df['syuzhetr_bing'] = syuzhet.get_sentiment(syuzhet_df['line'].to_list(), method='bing')
print('[3/4] Processing syuzhetr_afinn')
syuzhet_df['syuzhetr_afinn'] = syuzhet.get_sentiment(syuzhet_df['line'].to_list(), method='afinn')
print('[4/4] Processing syuzhetr_nrc')
syuzhet_df['syuzhetr_nrc'] = syuzhet.get_sentiment(syuzhet_df['line'].to_list(), method='nrc')

syuzhet_df.head()

In [None]:
win_per = 0.1
win_size = int(win_per * syuzhet_df.shape[0])

syuzhet_model_ls = ['syuzhetr_syuzhet', 'syuzhetr_bing', 'syuzhetr_afinn', 'syuzhetr_nrc']
_ = syuzhet_df[syuzhet_model_ls].rolling(win_size, center=True).mean().plot(figsize=(12,6), grid=True)

In [None]:
# Save SyuzhetR Models' Sentiment Time Series

save_df2csv_and_download(syuzhet_df, '_syuzhetr.csv', nodate=True)

### SentimentR (8)

Call function in external get_sentimentr.R from within Python Loop

* https://medium.com/analytics-vidhya/calling-r-from-python-magic-of-rpy2-d8cbbf991571

* https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html

In [None]:
%%file get_sentimentr.R

library(sentimentr)
library(lexicon)

get_sentimentr_values <- function(s_v) {
  # Score a character vector with eight sentimentr polarity lexicons.
  #
  # Args:
  #   s_v: character vector of text segments to score.
  # Returns:
  #   data.frame with one column per lexicon model ('sentimentr_<lexicon>'),
  #   each holding the $sentiment values returned by sentiment().
  #
  # NOTE(review): sentiment() scores sentences, so the row count may differ
  # from length(s_v) if an element is split into several sentences -- confirm
  # before joining these scores back onto line numbers.

  # Model column name -> polarity lexicon, in processing order
  lexicon_ls <- list(
    sentimentr_jockersrinker = lexicon::hash_sentiment_jockers_rinker,
    sentimentr_jockers = lexicon::hash_sentiment_jockers,
    sentimentr_huliu = lexicon::hash_sentiment_huliu,
    sentimentr_nrc = lexicon::hash_sentiment_nrc,
    sentimentr_senticnet = lexicon::hash_sentiment_senticnet,
    sentimentr_sentiword = lexicon::hash_sentiment_sentiword,
    sentimentr_loughran_mcdonald = lexicon::hash_sentiment_loughran_mcdonald,
    sentimentr_socal_google = lexicon::hash_sentiment_socal_google
  )
  model_names <- names(lexicon_ls)
  n_models <- length(lexicon_ls)

  # Split into sentences once and reuse the result for every lexicon,
  # instead of repeating the sentence boundary detection on each call
  sentences <- get_sentences(s_v)

  scores_ls <- lapply(seq_len(n_models), function(i) {
    print(sprintf('[%d/%d] Processing %s', i, n_models, model_names[i]))
    # 'hyphen' is the argument name (the earlier 'hypen' spelling was a typo)
    sentiment(sentences, polarity_dt = lexicon_ls[[i]],
              hyphen = "", amplifier.weight = 0.8, n.before = 5, n.after = 2,
              adversative.weight = 0.25, neutral.nonverb.like = FALSE,
              missing_value = 0)$sentiment
  })
  names(scores_ls) <- model_names

  anovel_sentimentr_df <- as.data.frame(scores_ls)
  return(anovel_sentimentr_df)

}

In [None]:
# Verify the *.R file above was written correctly

# !cat get_sentimentr.R

In [None]:
# Setup python robject with external library::function()
# https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html

# import rpy2.robjects as robjects

# Defining the R script and loading the instance in Python
# from rpy2.robjects import pandas2ri 
r = robjects.r

# Loading the function we have defined in R.
r['source']('get_sentimentr.R')

# Reading and processing data
get_sentimentr_function_r = robjects.globalenv['get_sentimentr_values']

In [None]:
%%time

# NOTE:   2m40s  @17:48EST on 27Oct2022 Colab Pro (The Idiot)
#         2m42s  @18:06EST on 27Oct2022 Colab Pro (The Idiot)
#         2m37s  @23:20EST on 27Oct2022 Colab Pro (The Idiot)

# Call external get_sentimentr::get_sentimentr_values with Python loop over all novels

line_ls = sentiment_df['line'].to_list()

# Convert Python List of Strings to a R vector of characters
# https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html
sentence_v = robjects.StrVector(line_ls)
sentiment_df_r = get_sentimentr_function_r(sentence_v)

# Convert rpy2.robjects.vectors.DataFrame to pandas.core.frame.DataFrame
# https://stackoverflow.com/questions/20630121/pandas-how-to-convert-r-dataframe-back-to-pandas 
print(f'type(sentiment_df_r): {type(sentiment_df_r)}')
temp_df = pd.DataFrame.from_dict({ key : np.asarray(sentiment_df_r.rx2(key)) for key in sentiment_df_r.names })
print(f'type(temp_df): {type(temp_df)}')

# Create new SentimentR DataFrame to save results
# sentimentr_df = sentiment_df[['line_no', 'line']].copy(deep=True)
sentimentr_df = pd.DataFrame()

# This works for Novels New Corpus Texts
sentimentr_df['sentimentr_jockersrinker'] = temp_df['sentimentr_jockersrinker']
sentimentr_df['sentimentr_jockers'] = temp_df['sentimentr_jockers']
sentimentr_df['sentimentr_huliu'] = temp_df['sentimentr_huliu']
sentimentr_df['sentimentr_nrc'] = temp_df['sentimentr_nrc']
sentimentr_df['sentimentr_senticnet'] = temp_df['sentimentr_senticnet']
sentimentr_df['sentimentr_sentiword'] = temp_df['sentimentr_sentiword']
sentimentr_df['sentimentr_loughran_mcdonald'] = temp_df['sentimentr_loughran_mcdonald']
sentimentr_df['sentimentr_socal_google'] = temp_df['sentimentr_socal_google'] 

sentimentr_df.head()

In [None]:
sentimentr_df.columns.to_list()

In [None]:
win_per = 0.1
win_size = int(win_per * sentimentr_df.shape[0])

sentimentr_model_ls = [
    'sentimentr_jockersrinker',
    'sentimentr_jockers',
    'sentimentr_huliu',
    'sentimentr_nrc',
    'sentimentr_senticnet',
    'sentimentr_sentiword',
    'sentimentr_loughran_mcdonald',
    'sentimentr_socal_google']

_ = sentimentr_df[sentimentr_model_ls].rolling(win_size, center=True).mean().plot(figsize=(12,6), grid=True)

In [None]:
# Save SentimentR Models' Sentiment Time Series

save_df2csv_and_download(sentimentr_df, '_sentimentr.csv', nodate=True)

## Option (2): Statistical ML

* https://towardsdatascience.com/building-a-sentiment-classifier-using-scikit-learn-54c8e7c5d2f0

In [None]:
# Example: Naive Bayes

# https://www.datacamp.com/tutorial/simplifying-sentiment-analysis-python

In [None]:
# Example: SVM

# https://www.kaggle.com/code/bansodesandeep/sentiment-analysis-support-vector-machine

## Option (3): Connectionist: Transformers

**WARNING:** This takes a LONG TIME to run to completion (~45mins).

Accelerate Large Models:

* https://ponder.io/faster-hugging-face-with-modin/ ***

* https://huggingface.co/blog/accelerate-large-models

* (Modin) https://github.com/modin-project/modin
* (Modin+HF) https://github.com/ponder-org/ponder-blog/blob/main/Modin%20%2B%20Hugging%20Face%20Tutorial.ipynb

* https://heartbeat.comet.ml/optimizing-a-huggingface-transformer-model-for-toxic-speech-detection-6d59e66f615a

In [None]:
!pip install -q transformers

In [None]:
!pip install sentencepiece

In [None]:
from transformers import pipeline

from transformers import AutoTokenizer, AutoModelWithLMHead  # T5Base 50k
from transformers import AutoModelForSequenceClassification, Trainer
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead

from transformers import BertTokenizer, BertForSequenceClassification

import sentencepiece

In [None]:
# Create class for data preparation

class SimpleDataset:
    """Indexable wrapper around a mapping of tokenizer output fields.

    Each value of ``tokenized_texts`` is indexed per sample; the mapping must
    contain an ``"input_ids"`` entry, which determines the dataset length.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of samples equals the number of input_ids entries
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every field into one sample dict
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

### HF: RoBERTa Large (15 Datasets)

siebert/sentiment-roberta-large-english

* https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb

In [None]:
# Load tokenizer and model, create trainer

model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
line_ls = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
line_ls = sentiment_df['line'].to_list()

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(line_ls,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
%%time

# NOTE: 4m00s 23:57EST on 27Oct2022 Colab Pro (The Idiot)
#       4m18s 02:27EST on 27Oct2022 Colab Pro (The Idiot)

# Run predictions
predictions = trainer.predict(pred_dataset)

In [None]:
type(predictions)

In [None]:
# Transform predictions to labels
sentiment_ls = predictions.predictions.argmax(-1)
labels_ls = pd.Series(sentiment_ls).map(model.config.id2label)
scores_ls = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [None]:
# One line number per prediction (`preds` was never defined; the predictions
# are held in sentiment_ls)
line_no_ls = list(range(len(sentiment_ls)))

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
roberta15lg_df = pd.DataFrame(list(zip(line_no_ls, line_ls,sentiment_ls,labels_ls,scores_ls)), columns=['line_no','line','roberta15lg','label','score'])
roberta15lg_df.head()

In [None]:
roberta15lg_df['label'].unique()

In [None]:
# Smooth with a centered rolling mean spanning 10% of the lines
win_per = 0.1
win_size = int(win_per * roberta15lg_df.shape[0])

# The prediction column of roberta15lg_df is named 'roberta15lg' (there is no 'sentiment' column)
_ = roberta15lg_df['roberta15lg'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(roberta15lg_df, '_roberta15lg.csv', nodate=True)

### HF: Default DistilBERT

distilbert-base-uncased-finetuned-sst-2-english

* https://huggingface.co/docs/transformers/task_summary

In [None]:
# Load tokenizer and model, create trainer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
line_ls = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
line_ls = sentiment_df['line'].to_list()

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(line_ls,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
%%time

# NOTE: 0m40s 02:49EST on 28Oct2022 Colab Pro (The Idiot)


# Run predictions
predictions = trainer.predict(pred_dataset)

In [None]:
type(predictions)

In [None]:
# Transform predictions to labels
sentiment_ls = predictions.predictions.argmax(-1)
labels_ls = pd.Series(sentiment_ls).map(model.config.id2label)
scores_ls = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [None]:
line_no_ls = list(range(len(sentiment_ls)))

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
distilbert_df = pd.DataFrame(list(zip(line_no_ls, line_ls,sentiment_ls,labels_ls,scores_ls)), columns=['line_no','line','distilbert','label','score'])
distilbert_df.head()

In [None]:
distilbert_df['label'].unique()

In [None]:
# Smooth with a centered rolling mean spanning 10% of the lines
win_per = 0.1
win_size = int(win_per * distilbert_df.shape[0])

# The prediction column of distilbert_df is named 'distilbert' (there is no 'sentiment' column)
_ = distilbert_df['distilbert'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series
# (save the DistilBERT results, not the RoBERTa DataFrame, under the DistilBERT filename)

save_df2csv_and_download(distilbert_df, '_distilbert.csv', nodate=True)

In [None]:
# Delete to end

In [None]:
classifier = pipeline("sentiment-analysis")

In [None]:
# Test

result = classifier("I hate you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

result = classifier("I love you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

In [None]:
%%time

# NOTE: 48m20s 00:25EST on 28Oct2022 Colab Pro (The Idiot)

line_ls = sentiment_df['line'].to_list()

# Run the classifier once per line and reuse that result for both the label
# and the score; calling it twice per line doubles the inference time
results_iter = (classifier(x)[0] for x in tq.tqdm(line_ls))
distilbert_tup_ls = [(res['label'], round(res['score'], 4)) for res in results_iter]

In [None]:
type(distilbert_tup_ls)
print('\n')
print(distilbert_tup_ls)
print('\n\n')
type(distilbert_tup_ls[0])
print('\n')
print(distilbert_tup_ls[0])
print('\n')

In [None]:
# Split the (label, probability) tuples into two parallel sequences
label_ls, prob_ls = list(zip(*distilbert_tup_ls))

# Placeholder; the 0/1 predictions are computed from label_ls in the next cell
pred_ls = ['' ]


# Show the labels just unpacked (not sentiment_ls from an earlier model)
print(f'label_ls: {label_ls[:5]}')
print(f'prob_ls: {prob_ls[:5]}')

In [None]:
# Convert NEGATIVE/POSITIVE into 0/1 int values

pred_ls = [1 if x=='POSITIVE' else 0 for x in label_ls]
print(f'pred_ls: {pred_ls[:5]}')

In [None]:
# Transform predictions to labels
"""
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)
"""

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
distilbert_df = pd.DataFrame(list(zip(line_ls,pred_ls,label_ls,prob_ls)), columns=['line','distilbert','label','prob'])
distilbert_df.head()

In [None]:
distilbert_df['label'].unique()

In [None]:
win_per = 0.1
win_size = int(win_per * distilbert_df.shape[0])

_ = distilbert_df['distilbert'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(distilbert_df, '_distilbert.csv')

### HF: MultiBERT NLPTown

nlptown/bert-base-multilingual-uncased-sentiment

* https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment?text=I+like+you.+I+love+you

In [None]:
# Load tokenizer and model, create trainer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
# line_ls = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
line_ls = sentiment_df['line'].to_list()

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(line_ls,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
%%time

# NOTE:  4m00s 23:57EST on 27Oct2022 Colab Pro (The Idiot)
#        1m28s 01:24EST on 28Oct2022 Colab Pro (The Idiot)
#        1m27s 02:42EST on 28Oct2022 Colab Pro (The Idiot)

# Run predictions
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels
sentiment_ls = predictions.predictions.argmax(-1)
labels_ls = pd.Series(sentiment_ls).map(model.config.id2label)
scores_ls = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [None]:
line_no_ls = list(range(len(sentiment_ls)))

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
nlptown_df = pd.DataFrame(list(zip(line_no_ls,line_ls,sentiment_ls,labels_ls,scores_ls)), columns=['line_no','line','nlptown','label','score'])
nlptown_df.head()

In [None]:
nlptown_df.shape

In [None]:
nlptown_df['label'].unique()

In [None]:
# Smooth with a centered rolling mean spanning 10% of the lines
win_per = 0.1
win_size = int(win_per * nlptown_df.shape[0])

# The prediction column of nlptown_df is named 'nlptown' (there is no 'sentiment' column)
_ = nlptown_df['nlptown'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(nlptown_df, '_nlptown.csv', nodate=True)

In [None]:
# Build the pipeline with the tokenizer that belongs to the model. A tokenizer
# from a different checkpoint (e.g. distilbert-base-uncased) has a different
# vocabulary, so the model would receive mismatched token ids.
# AutoTokenizer is already imported above; DistilBertTokenizer never was.
nlptown_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
db_tokenizer = AutoTokenizer.from_pretrained(nlptown_model_name)

# Alternative model: "j-hartmann/emotion-english-distilrobertabase" (load its own tokenizer the same way)
classifier = pipeline("sentiment-analysis", model=nlptown_model_name, max_length=512, truncation=True, tokenizer=db_tokenizer)

def sentiment_classifier(text):
    '''
    INPUT: a text string to classify
    OUTPUT: (label, score) tuple taken from the first result that the
            notebook-level `classifier` pipeline returns for text
    '''
    classifier_results = classifier(text)[0]
    return classifier_results['label'],classifier_results['score']

In [None]:
len(test_lines_ls)

In [None]:
stars_ls, star_prob_ls = zip(*[sentiment_classifier(x) for x in test_lines_ls])
# stars_ls = [x[1] for x in sentiment_ls]
stars_ls
print(type(stars_ls))
star_prob_ls

In [None]:
sentiment_df.head()
sentiment_df.shape

In [None]:
line_ls = sentiment_df['line'].to_list()
line_ls[:5]

In [None]:
%time

# NOTE: 42m 9:15EST, Tues, 27 Oct 2022

stars_ls, star_prob_ls = zip(*[sentiment_classifier(x) for x in line_ls])
# stars_ls = [x[1] for x in sentiment_ls]
stars_ls
print(type(stars_ls))
star_prob_ls

In [None]:
sentiment_df.head()

In [None]:
line_no_ls = list(range(len(stars_ls)))
print(f'Length(stars_ls): {len(line_no_ls)}')

In [None]:
# Save Raw to file

temp_df = pd.DataFrame({'line_no':line_no_ls, 'stars':stars_ls, 'prob':star_prob_ls})
temp_df.head()

In [None]:
sentiment_df.head()

In [None]:
# Merge 'line' on 'line_no'

temp_df = temp_df.merge(sentiment_df[['line_no','line']],how='left', on='line_no', suffixes=('','_y'))
temp_df.head()

In [None]:

# Move last column to the first
# NOTE(review): this cell originally built df from an undefined `technologies`
# object (it looks like a pasted example); it now works on a copy of temp_df
# so the notebook runs end to end -- confirm this is the intended input.
df = temp_df.copy()
temp_cols=df.columns.tolist()
new_cols=temp_cols[-1:] + temp_cols[:-1]
df=df[new_cols]
print(df)


In [None]:
# Reorder Columns

cols_order_ls = ['line_no','line','stars','prob']
temp_df = temp_df.reindex(columns=cols_order_ls) 
temp_df.head()

In [None]:
# Ensure no Empty values

temp_df['stars'].isna().sum()

In [None]:
# Check all possible variations

temp_df['stars'].value_counts()

In [None]:
plt.hist(star_prob_ls, bins=100)
plt.show();

In [None]:
plt.hist(stars_ls, bins=100)
plt.show();

In [None]:
# Plot Star Ratings in Order

from matplotlib.ticker import MaxNLocator

# Distinct star labels found in the results, in ascending sort order
star_types_ls = sorted(temp_df['stars'].unique())
print(star_types_ls)

# Bar chart of how often each label occurs, ordered by label (not by count)
star_counts = temp_df.stars.value_counts()
ax = star_counts.loc[star_types_ls].plot.bar()
# Counts are whole numbers, so force integer ticks on the y axis
ax.yaxis.set_major_locator(MaxNLocator(integer=True))


In [None]:
def fivestar2float(star_str, star_prob):
  '''
  INPUT: rating string whose first whitespace-separated token is the star
         count (e.g. '5 stars'), plus its associated probability/confidence
  OUTPUT: the star count as a float

  NOTE: star_prob is accepted but not used yet
  TODO: Weight probability into conversion
  '''

  # Only the leading token carries the number; the rest ('star'/'stars') is ignored
  star_count_token = star_str.split(maxsplit=1)[0]

  # TODO: Adjust based upon probability/confidence

  return float(star_count_token)

# Test

test_str = '5 stars'
fivestar2float(test_str, 0.3)

In [None]:
# Convert NLPTown star ratings into Floating point

temp_df['nlptown'] = temp_df['stars'].apply(lambda x: float(x.split()[0]))
temp_df.head()

In [None]:
# Save to DataFrame and download copy

# Uses the same save helper name as the other save cells in this notebook.
# NOTE(review): the '.txt' suffix is kept unchanged although the helper name says CSV — confirm
save_df2csv_and_download(temp_df, '_bert-nlptown.txt')

In [None]:
# Test

data = ["I love you", "I hate you"]
sentiment_test_ls = sentiment_pipeline(data)

print(sentiment_test_ls)

In [None]:
# Test Edge Cases

edge_sentence_str = "I'm not sure if I hate you, but I certainly don't care for your attitude young man!"

sentiment_score = sentiment_pipeline(edge_sentence_str)

print(sentiment_score)

In [None]:
print(sentiment_score[0]['label'])
print(sentiment_score[0]['score'])

In [None]:
sentiment_df.head()

In [None]:
sentiment_sample_df = sentiment_df[sentiment_df['line_no'] < 20].copy()
sentiment_sample_df.head()

In [None]:
sample_line_no = 19
sentiment_sample_df.iloc[sample_line_no]

In [None]:
print(f'Line: {sentiment_sample_df.iloc[sample_line_no]["line"]}\n\n')

print(sentiment_pipeline(sentiment_sample_df.iloc[sample_line_no]['line']))

In [None]:
sentiment_pipeline(sentiment_sample_df.iloc[19]['line'])[0].values()

In [None]:
temp_df = pd.DataFrame()
temp_df['sentiment'], temp_df['score'] = zip(*sentiment_sample_df.apply(lambda x: sentiment_pipeline(x['line'])[0].values(), axis=1))
temp_df.head()

In [None]:
sentiment_sample_df['sentiment'],sentiment_sample_df['score'] = zip(*sentiment_sample_df.apply(lambda x: sentiment_pipeline(x['line'])[0].values(),axis=1))
sentiment_sample_df.head()

In [None]:
%%time

# NOTE: 30m30s @ 16:33EST on Weds 20221026 w/GPU 

# CAUTION: Make sure you have selected a GPU runtime type, this will take awhile

# Signed score per sentence: 'NEGATIVE' results get a negative sign,
# any other label keeps a positive sign.
sentiment_ls = []

sent_ct = len(novel_clean_ls)

for asentence in tqdm(novel_clean_ls):
  asentiment = sentiment_pipeline(asentence)
  top_result = asentiment[0]
  alabel = top_result['label']
  score_sign_fl = -1.0 if alabel == 'NEGATIVE' else 1.0
  ascore_fl = score_sign_fl * float(top_result['score'])
  sentiment_ls.append(ascore_fl)

In [None]:
# Save to file and download copy

novel_sentiments_filename = novel_name_str.split('.')[0] + '_sentiments.csv'

sentiment_df['hf'] = pd.Series(sentiment_ls)
sentiment_df.to_csv(novel_sentiments_filename)

files.download(novel_sentiments_filename)

### HF T5 IMDB

mrm8488/t5-base-finetuned-imdb-sentiment

* https://huggingface.co/mrm8488/t5-base-finetuned-imdb-sentiment

In [None]:
# Import the names used below in this cell. AutoModelForSeq2SeqLM is the
# auto class for encoder-decoder checkpoints such as T5; AutoModelWithLMHead
# is deprecated in transformers.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

def get_sentiment(text):
  '''
  INPUT: text string to classify with the global T5 `tokenizer` and `model`
  OUTPUT: label string decoded from the first generated sequence
  '''
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  # max_length=2 keeps generation very short
  # (presumably the decoder start token plus one label token — TODO confirm)
  output = model.generate(input_ids=input_ids,
               max_length=2)

  # skip_special_tokens removes special tokens such as '<pad>' from the
  # decoded text so only the label word is returned
  dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
  label = dec[0]
  return label
  
get_sentiment("I dislike a lot that film")

# Output: 'negative'


In [None]:
# Minimal T5Model forward pass on one sample sentence.
# NOTE: this cell rebinds the global names `tokenizer` and `model` to 't5-small'.
from transformers import T5Tokenizer, T5Model

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5Model.from_pretrained('t5-small')

input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")  # Batch size 1
# The same token ids are passed as both encoder input and decoder input
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [None]:
# Load tokenizer and model, create trainer
from transformers import AutoTokenizer

model_name = "mrm8488/t5-base-finetuned-imdb-sentiment"
# The tokenizer is loaded with AutoTokenizer; loading it through the model
# class would bind `tokenizer` to a second copy of the model instead.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelWithLMHead.from_pretrained(model_name)
trainer = Trainer(model=model)

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
pred_texts = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
pred_texts = line_ls

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(pred_texts) # ,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
%%time

# NOTE:  4m00s 23:57EST on 27Oct2022 Colab Pro (The Idiot)
#        1m28s 01:24EST on 28Oct2022 Colab Pro (The Idiot)

# Run predictions
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels
raw_logits = predictions[0]
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the last axis, then take the maximum along axis 1
exp_logits = np.exp(raw_logits)
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
nlptown_df = pd.DataFrame(list(zip(pred_texts,preds,labels,scores)), columns=['line','pred','label','score'])
nlptown_df.head()

In [None]:
nlptown_df['label'].unique()

In [None]:
win_per = 0.1
win_size = int(win_per * nlptown_df.shape[0])

_ = nlptown_df['pred'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(nlptown_df, '_nlptown.csv')

In [None]:


def get_sentiment(text):
  '''
  INPUT: text string to classify with the global T5 `tokenizer` and `model`
  OUTPUT: label string decoded from the first generated sequence
  '''
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  # max_length=2 keeps generation very short
  # (presumably the decoder start token plus one label token — TODO confirm)
  output = model.generate(input_ids=input_ids,
               max_length=2)

  # skip_special_tokens removes special tokens such as '<pad>' from the
  # decoded text so only the label word is returned
  dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
  label = dec[0]
  return label
  
get_sentiment("I dislike a lot that film")

# Output: 'negative'


In [None]:
from transformers import AutoModelForSeq2SeqLM

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

In [None]:
# Load tokenizer and model, create trainer

model_name = "mrm8488/t5-base-finetuned-imdb-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
trainer = Trainer(model=model)

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
pred_texts = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
pred_texts = line_ls

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
%%time

# NOTE: 4m00s 23:57EST on 27Oct2022 Colab Pro (The Idiot)

# Run predictions
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels
raw_logits = predictions[0]
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Softmax over the last axis, then take the maximum along axis 1
exp_logits = np.exp(raw_logits)
scores = (exp_logits / exp_logits.sum(-1, keepdims=True)).max(1)

In [None]:
# Create DataFrame with texts, predictions, labels, and scores
nlptown_df = pd.DataFrame(list(zip(pred_texts,preds,labels,scores)), columns=['text','pred','label','score'])
nlptown_df.head()

In [None]:
nlptown_df['label'].unique()

In [None]:
win_per = 0.1
win_size = int(win_per * nlptown_df.shape[0])

_ = nlptown_df['pred'].rolling(win_size, center=True).mean().plot(grid=True)

In [None]:
# Save Model Sentiment Time Series
# NOTE(review): nlptown_df is built in this section from the T5 IMDB model's
# predictions but is saved with the '_nlptown.csv' suffix — confirm the intended name

save_df2csv_and_download(nlptown_df, '_nlptown.csv')

### T5

mrm8488/t5-base-finetuned-span-sentiment-extraction

* https://huggingface.co/mrm8488/t5-base-finetuned-span-sentiment-extraction

In [None]:
!pip install transformers[sentencepiece]

In [None]:
# AutoModelForSeq2SeqLM is the auto class for encoder-decoder checkpoints
# such as T5; AutoModelWithLMHead is deprecated in transformers but is still
# imported here because other cells in this notebook reference it.
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

# The checkpoint id gets its own name so `model` only ever holds the model object
span_model_name = "mrm8488/t5-base-finetuned-span-sentiment-extraction"
tokenizer = AutoTokenizer.from_pretrained(span_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(span_model_name)

In [None]:


def get_sentiment_span(text):
  '''
  INPUT: prompt string for the global T5 `tokenizer`/`model`
         (the calls in this cell use 'question: <polarity> context: <text>')
  OUTPUT: text decoded from the generated token ids, special tokens removed
  '''
  encoded_prompt = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)  # Batch size 1

  # squeeze() drops the batch dimension of the single generated sequence
  token_ids = model.generate(input_ids=encoded_prompt, num_beams=1, max_length=80).squeeze()

  return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
  
get_sentiment_span("question: negative context: My bike was put on hold...should have known that.... argh total bummer")

# output: 'argh total bummer'

get_sentiment_span("question: positive context: On the monday, so i wont be able to be with you! i love you")

# output: 'i love you'


### HF DistilBERT

distilbert-base-uncased-finetuned-sst-2-english

* https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Load the SST-2 fine-tuned checkpoint named in this section's heading.
# The plain "distilbert-base-uncased" checkpoint is not fine-tuned for
# sentiment, so its classification output would not be meaningful.
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(sst2_checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(sst2_checkpoint)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# Inference only: no gradients needed
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the largest logit, mapped to its label name via the model config
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
# Create list of texts (can be imported from .csv, .xls etc.)

# Test
pred_texts = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

# Novel Lines
pred_texts = line_ls

In [None]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
inputs = tokenizer("Good lovely wonderfully cute", return_tensors="pt")
# inputs = tokenizer("Damn bad, evil and ugly", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
logits

In [None]:
# RoBERTa Large (trained on 15 datasets)

hf_model = 'siebert/sentiment-roberta-large-english'

# No explicit tokenizer is passed, so the pipeline loads the tokenizer that
# belongs to hf_model (the DistilBERT `db_tokenizer` does not match a RoBERTa model).
# max_length/truncation cap each input at 512 tokens.
classifier_sentiment = pipeline("sentiment-analysis", model=hf_model, max_length=512, truncation=True)
# Test
print(classifier_sentiment("I love this!"))

In [None]:
# Create new roberta15lg5cat  DataFrame to save results

roberta15lg5cat_df = sentiment_df[['line_no', 'line']].copy(deep=True)
roberta15lg5cat_df.head()

In [None]:
# Create new roberta15lg5cat DataFrame and fill it with signed RoBERTa scores

roberta15lg5cat_df = sentiment_df[['line_no', 'line']].copy(deep=True)


def _roberta_signed_score(line_str):
  '''
  INPUT: one line of text
  OUTPUT: score of the first `classifier_sentiment` result, negated when the
          label is 'NEGATIVE' (same sign convention as the loop cells in this notebook)
  '''
  top_result = classifier_sentiment(line_str)[0]
  sign_fl = -1.0 if top_result['label'] == 'NEGATIVE' else 1.0
  return sign_fl * float(top_result['score'])


# Score each line with this section's model; tqdm shows progress over the lines
roberta15lg5cat_df['roberta_15lg5cat'] = [
    _roberta_signed_score(aline) for aline in tqdm(roberta15lg5cat_df['line'])
]
roberta15lg5cat_df.head()

### HF Default: DistilBERT

In [None]:
# Test Dataset: List of TestSentiment Strings

test_lines_ls = [
    "I love you.",
    "You hate me.",
    "I'm not sure if I hate you, but I certainly don't care for your attitude young man!"
]

In [None]:
# Imports come first so every name used in this cell is defined before use
from transformers import DistilBertTokenizer, pipeline

db_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Uses default DistilBERT: distilbert-base-uncased-finetuned-sst-2-english (as of 26 Oct 2022))
# max_length/truncation cap each input at 512 tokens. No tokenizer is passed:
# the pipeline loads the one that belongs to its default model.
sentiment_pipeline = pipeline("sentiment-analysis", max_length=512, truncation=True)

### HF Distil RoBERTa Base

In [None]:
db_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Hub id is 'j-hartmann/emotion-english-distilroberta-base' (hyphen before 'base').
# No explicit tokenizer is passed, so the pipeline loads the tokenizer that
# belongs to this DistilRoBERTa checkpoint rather than the DistilBERT one.
classifier = pipeline(
    "sentiment-analysis",
    model="j-hartmann/emotion-english-distilroberta-base",
    max_length=512,
    truncation=True,
)

def sentiment_classifier(text):
    '''
    INPUT: text string to score with the global `classifier` pipeline
    OUTPUT: tuple (label, score) read from the first result the pipeline returns
    '''
    top_result = classifier(text)[0]
    label, score = top_result['label'], top_result['score']
    return label, score

### HF RoBERTa Large

In [None]:
%%time

# NOTE: 00m11s @12:39 on 20220301 Colab Pro 

sa_model = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")

print(sa_model("I love this!"))

In [None]:
def sentiment_classifier(text):
    '''
    INPUT: text string to score with the global `sa_model` pipeline
    OUTPUT: tuple (label, score) read from the first result the pipeline returns
    '''
    top_result = sa_model(text)[0]
    label, score = top_result['label'], top_result['score']
    return label, score

In [None]:
# Split the (label, score) pairs for the test sentences into two parallel tuples
polarity_ls, star_prob_ls = zip(*[sentiment_classifier(x) for x in test_lines_ls])
# Display the labels computed in this cell
polarity_ls
print(type(polarity_ls))
star_prob_ls

In [None]:
stars_ls, star_prob_ls = zip(*[sentiment_classifier(x) for x in test_lines_ls])
# stars_ls = [x[1] for x in sentiment_ls]
stars_ls
print(type(stars_ls))
star_prob_ls

In [None]:
%whos

In [None]:
%%time

# NOTE: 30m30s @ 16:33EST on Weds 20221026 w/GPU 

# CAUTION: Make sure you have selected a GPU runtime type, this will take awhile

# Signed score per sentence: 'NEGATIVE' results get a negative sign,
# any other label keeps a positive sign.
sentiment_ls = []

sent_ct = len(novel_clean_ls)

for asentence in tqdm(novel_clean_ls):
  # sentiment_classifier returns a (label, score) tuple, so unpack it directly
  asentiment = sentiment_classifier(asentence)
  alabel, ascore_raw = asentiment
  if alabel == 'NEGATIVE':
    score_sign_fl = -1.0
  else:
    score_sign_fl = 1.0

  ascore_fl = score_sign_fl * float(ascore_raw)

  sentiment_ls.append(ascore_fl)

# Plot

## Dilate SentimentR Time Series

In [None]:
# Get all files with only one model sentiment time series

novel_root_str = novel_name_str.split('.')[0]

sentimentr_filename_csv = f'{novel_root_str}_sentimentr.csv'
sentimentr_all_df = pd.read_csv(sentimentr_filename_csv, index_col=[0])

# Model columns = every column except the bookkeeping ones, kept in file order.
# (A set difference would return them in arbitrary order, which matters when
# the list is later used to label columns.)
non_model_cols = ('line_no', 'line')
sentimentr_model_ls = [acol for acol in sentimentr_all_df.columns if acol not in non_model_cols]

sentimentr_all_df.head()
sentimentr_all_df.info()
sentimentr_model_ls

In [None]:
win_per = 0.1
win_size = int(win_per * sentimentr_all_df.shape[0])

_ = sentimentr_all_df[sentimentr_model_ls].rolling(win_size, center=True).mean().plot(figsize=(12,8), grid=True)

In [None]:
sentimentr_all_df.shape

In [None]:
print(f'SentimentR rows: {sentimentr_all_df.shape[0]}')
print(f'    Others rows: {vader_file_df.shape[0]}\n')

print(f'     Difference: {sentimentr_all_df.shape[0] - vader_file_df.shape[0]}')


In [None]:
from sklearn.preprocessing import StandardScaler

# define standard scaler
scaler = StandardScaler()

# retrieve just the model columns, selected by name so the values and the
# column labels assigned below are guaranteed to be in the same order
data = sentimentr_all_df[sentimentr_model_ls].values

# perform a standard scaler transform of the dataset
data = scaler.fit_transform(data)

# convert the array back to a dataframe, labelled with the same model columns
sentimentr_all_norm_df = pd.DataFrame(data, columns=sentimentr_model_ls)

# summarize
print(sentimentr_all_norm_df.describe())

# histograms of the variables
sentimentr_all_norm_df.hist()
plt.show();

### SentimentR 8 Model Plot

In [None]:
win_per = 0.1
win_size = int(win_per * sentimentr_all_norm_df.shape[0])

model_cols = [
    'sentimentr_nrc',
    'sentimentr_jockers',
    'sentimentr_socal_google',
    'sentimentr_huliu',
    'sentimentr_senticnet',
    'sentimentr_sentiword',
    'sentimentr_loughran_mcdonald',
    'sentimentr_jockersrinker']

_ = sentimentr_all_norm_df[model_cols].rolling(win_size, center=True).mean().plot(figsize=(12,8), grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(sentimentr_all_norm_df, '_sentimentr8norm.csv', nodate=True)

## Merge Model Data

In [None]:
model_name_ls = ['vader',
                 'syuzhetr',
                 'sentimentr',
                 'roberta15lg',
                 'distilbert',
                 'nlptown']

In [None]:
novel_root_str = novel_name_str.split('.')[0]

# Get all files with only one model sentiment time series
vader_filename_csv = f'{novel_root_str}_vader.csv'
vader_file_df = pd.read_csv(vader_filename_csv)
vader_file_ls = vader_file_df['vader'].to_list()

distilbert_filename_csv = f'{novel_root_str}_distilbert.csv'
distilbert_file_df = pd.read_csv(distilbert_filename_csv)
distilbert_file_ls = distilbert_file_df['sentiment'].to_list()

# Each model is read from its own '<novel>_<model>.csv' file, using the model
# names listed in model_name_ls.
# NOTE(review): the 'sentiment' column name is the one used for the DistilBERT
# file — confirm it matches the columns actually saved for nlptown/roberta15lg
nlptown_filename_csv = f'{novel_root_str}_nlptown.csv'
nlptown_file_df = pd.read_csv(nlptown_filename_csv)
nlptown_file_ls = nlptown_file_df['sentiment'].to_list()

roberta15lg_filename_csv = f'{novel_root_str}_roberta15lg.csv'
roberta15lg_file_df = pd.read_csv(roberta15lg_filename_csv)
roberta15lg_file_ls = roberta15lg_file_df['sentiment'].to_list()

# Append to Syuzhet with 4 models sentiment time series
syuzhetr_filename_csv = f'{novel_root_str}_syuzhetr.csv'
syuzhetr_file_df = pd.read_csv(syuzhetr_filename_csv, index_col=[0])

# Start from a copy of the SyuzhetR frame and add one column per extra model
sentiment_all_df = syuzhetr_file_df.copy(deep=True)
sentiment_all_df['vader'] = vader_file_ls
sentiment_all_df['distilbert'] = distilbert_file_ls
sentiment_all_df['nlptown'] = nlptown_file_ls
sentiment_all_df['roberta15lg'] = roberta15lg_file_ls

sentiment_all_df.head()

In [None]:
sentiment_all_df.columns.to_list()

In [None]:
win_per = 0.1
win_size = int(win_per * sentiment_all_df.shape[0])

model_cols = [
    'syuzhetr_syuzhet',
    'syuzhetr_bing',
    'syuzhetr_afinn',
    'syuzhetr_nrc',
    'vader',
    'distilbert',
    'nlptown',
    'roberta15lg']

_ = sentiment_all_df[model_cols].rolling(win_size, center=True).mean().plot(figsize=(12,8), grid=True)

In [None]:
# Get list matching model *.csv files

file_model_ls = []

rootdir = "."
file_root = novel_name_str.split('.')[0]
print(file_root)
# Raw string so '\.' is a regex escape, and re.escape so the novel name is
# matched literally: '<file_root>_' + no '-' characters + '.csv'
regex = re.compile(r"{}_[^-]*\.csv$".format(re.escape(file_root)))

# The loop variables deliberately avoid the name `files`, which this notebook
# imports from google.colab and uses for downloads.
for _walk_root, _walk_dirs, walk_files in os.walk(rootdir):
  for afilename in walk_files:
    # any(...) adds a file once even if several model names occur in its name
    if regex.match(afilename) and any(amodel in afilename for amodel in model_name_ls):
      file_model_ls.append(afilename)

print('Matching files:\n')
list(file_model_ls)

In [None]:
# Split the matched files: everything except the SentimentR CSV is collected;
# finding the SentimentR CSV only sets a flag.
file_nosentimentr_ls = []
sentimentr_fl = False

for afile in file_model_ls:
  if '_sentimentr.csv' not in afile:
    file_nosentimentr_ls.append(afile)
    continue
  print('found sentimentr')
  sentimentr_fl = True

file_nosentimentr_ls

In [None]:
# Read the first file, then left-join each remaining file onto it by line_no
file_first = file_nosentimentr_ls[0]
print(f'file_first: {file_first}')
merged_df = pd.read_csv(file_first, index_col=None)

remaining_files = file_nosentimentr_ls[1:]
for file_next in remaining_files:
  print(f'file_next: {file_next}')
  new_df = pd.read_csv(file_next, index_col=None)
  merged_df = merged_df.merge(new_df, on='line_no', how='left')

In [None]:
merged_df.info()

In [None]:
merged_df.head()

In [None]:

# Collect every matched file except the SentimentR CSV
file_nosentimentr_ls = []

# Set to True when the SentimentR CSV is among the matched files
sentimentr_fl = False

for afile in file_model_ls:
  if '_sentimentr.csv' in afile:
    print('found sentimentr')
    sentimentr_fl= True
  else:
    file_nosentimentr_ls.append(afile)

file_nosentimentr_ls


# Read all non-SentimentR CSV files and concatenate them side by side (axis=1)
# NOTE(review): with axis=1, ignore_index=True appears to replace the column
# names with integer labels — confirm this is intended, since later cells
# select columns by model name
sentiment_all_df = pd.concat(map(pd.read_csv, file_nosentimentr_ls), ignore_index=True, axis=1)
sentiment_all_df.head()

In [None]:
# importing pandas
import pandas as pd

# merging two csv files
df = pd.concat(map(pd.read_csv, ['mydata.csv', 'mydata1.csv']), ignore_index=True)

In [None]:
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(fname) for fname in file_model_ls])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [None]:
!ls

In [None]:
# Review

sentiment_df.head()

In [None]:
# Model columns = every sentiment_df column except the bookkeeping columns.
# Both naming schemes used in this notebook are excluded ('sentence_no'/
# 'sentence_str' and 'line_no'/'line'), and a missing name is simply skipped
# instead of raising ValueError the way list.remove() does.
non_model_cols = {'sentence_no', 'sentence_str', 'line_no', 'line'}
sentiment_cols_ls = [acol for acol in sentiment_df.columns if acol not in non_model_cols]
sentiment_cols_ls

## Normalize Data

* https://stackoverflow.com/questions/64882432/sklearn-preprocessing-standardscaler-valueerror-expected-2d-array-got-1d-array

In [None]:
sentiment_all_df.head()
sentiment_all_df.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# define standard scaler
scaler = StandardScaler()

# retrieve just the numeric input values: [:, 2:] skips the first two columns
# NOTE(review): presumably those are the line number and line text — confirm
data = sentiment_all_df.values[:, 2:]

# perform a standard scaler transform of the dataset
data = scaler.fit_transform(data)

# convert the array back to a dataframe (columns get default integer labels here)
sentiment_all_norm_df = pd.DataFrame(data)

# summarize
print(sentiment_all_norm_df.describe())

# histograms of the variables
sentiment_all_norm_df.hist()
plt.show();

In [None]:
sentiment_all_cols_ls = sentiment_all_df.columns.to_list()
sentiment_all_norm_df.columns = sentiment_all_cols_ls[2:]
sentiment_all_norm_df.head()

### Non-SentimentR 8 Model Plot

In [None]:
win_per = 0.1
win_size = int(win_per * sentiment_all_norm_df.shape[0])

model_cols = [
    'syuzhetr_syuzhet',
    'syuzhetr_bing',
    'syuzhetr_afinn',
    'syuzhetr_nrc',
    'vader',
    'distilbert',
    'nlptown',
    'roberta15lg']

_ = sentiment_all_norm_df[model_cols].rolling(win_size, center=True).mean().plot(figsize=(12,8), grid=True)

In [None]:
# Save Model Sentiment Time Series

save_df2csv_and_download(sentiment_all_norm_df, '_nonsentimentr8norm.csv', nodate=True)

In [None]:
# End

In [None]:
# Normalize Sentiment Time Series across different models

from sklearn.preprocessing import StandardScaler

# Share sentiment_df's index so the bookkeeping columns inserted below align
# row-for-row with the scaled values
sentiment_norm_df = pd.DataFrame(index=sentiment_df.index)

scaler = StandardScaler()
for acol in sentiment_cols_ls:
  # StandardScaler expects a 2-D array: reshape the column to (n, 1),
  # scale it, then flatten the result back to 1-D for the DataFrame column
  acol_2d = sentiment_df[acol].to_numpy().reshape(-1, 1)
  sentiment_norm_df[acol] = scaler.fit_transform(acol_2d).ravel()

# Put the sentence number and text back in front of the scaled model columns
sentiment_norm_df.insert(0, 'sentence_no', sentiment_df['sentence_no'])
sentiment_norm_df.insert(1, 'sentence_str', sentiment_df['sentence_str'])
sentiment_norm_df.head()

In [None]:
# Save to file and download copy

novel_sentiments_norm_filename = novel_name_str.split('.')[0] + '_sentiments_norm.csv'

sentiment_norm_df['hf'] = pd.Series(sentiment_ls)
sentiment_norm_df.to_csv(novel_sentiments_norm_filename)

files.download(novel_sentiments_norm_filename)

## Static Plots

In [None]:
win_per = .10

win_size = int(sentiment_df.shape[0] * win_per)

sentiment_norm_df[['vader','hf']].rolling(window=win_size, center=True).mean().plot(figsize=(20,10), grid=True)
plt.show();

## Interactive Plots

* https://plotly.com/python/time-series/

In [None]:
# Using graph_objects

import plotly.graph_objects as go
import plotly.express as px

In [None]:
# Interactive Plotly Time Series Charts

import pandas as pd

# Start from an empty figure so `fig` exists before traces are added,
# then add one line trace per model column.
fig = go.Figure()
for amodel_col in ('vader', 'hf'):
  # px.line builds a figure; its public `.data` holds the traces to copy over
  model_line_fig = px.line(sentiment_norm_df, x='sentence_no', y=amodel_col, hover_name="sentence_str")
  fig.add_traces(model_line_fig.data)

fig.update_layout(title='Diachronic Sentiment Analysis', showlegend=False)
fig.show();