<a href="https://colab.research.google.com/github/jon-chun/sentimentarcs/blob/main/sentimentarcs_part2_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Huggingface Transformers Sentiment Analysis at the Command Line**

Jon Chun
19 Jun 2019

References:

* https://github.com/barissayil/SentimentAnalysis

# Configuration (Auto)

In [1]:
!pip install transformers[sentencepiece]

Collecting transformers[sentencepiece]
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.1 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 29.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 49.0 MB/s 
Collecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |███████████████████████████████

In [2]:
# !pip install sentencepiece

In [3]:
import pandas as pd

In [4]:
from datetime import datetime

**Configure Jupyter Notebook**

In [5]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Configure Jupyter

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [7]:
# SKIP TO NEXT SECTION

**Global Configuration Constants**

In [8]:
# Hardcoded Sentiment Analysis Models
#   (each constant is defined exactly once; re-running this cell resets them)

MODELS_LEX_LS = ['vader','textblob','stanza','afinn','bing','sentimentr','syuzhet','pattern','sentiword','senticnet','nrc']
MODELS_TRANS_LS = ['distilbertsst', 'nlptown','roberta_lg15','albertbv2','bertuc_gapps','bert_imdb']
MODELS_ALL_LS = MODELS_LEX_LS + MODELS_TRANS_LS

# MODELS_LS holds the same names as MODELS_LEX_LS but stays an independent list,
#   so mutating one never changes the other
MODELS_LS = list(MODELS_LEX_LS)

# Minimum lengths for Chapters, Sections, Sentences and Paragraphs
#   (Shorter Sents/Parags will be deleted)

MIN_CHAP_LEN = 5000
MIN_SECT_LEN = 5000  # Minimum char length to be included in section DataFrame
MIN_PARAG_LEN = 2
MIN_SENT_LEN = 2

# Min/Max statistics on each lexicon's sentiment values applied to corpus
corpus_lexicons_stats_dt = {}
corpus_cruxes_dt = {}

# Crux Points Dict key:model, value:list of crux point tuples (x,y)
corpus_cruxes_all_dt = {}


**Install Libraries**

In [9]:
!pip install transformers



In [10]:
# INSTALL LIBRARIES

!pip install sklearn



**Import Libraries**

In [11]:
import os
import sys
import io
import glob
import contextlib

In [12]:
# IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
import re
import string

In [14]:
import collections
from collections import OrderedDict

In [15]:
# Import libraries for logging

import logging
from datetime import datetime
import time                     # (TODO: check no dependencies and delete)
from time import gmtime, strftime

In [16]:
import nltk

# Download for sentence tokenization
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Download for nltk/VADER sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
# DTW

import json
import numpy.fft
from decimal import Decimal
import math
import random

In [18]:
from sklearn.preprocessing import MinMaxScaler   # To normalize time series
from sklearn.preprocessing import StandardScaler # To standardize time series

In [19]:
# Smoothing

from scipy import interpolate
from scipy.interpolate import CubicSpline
from scipy import signal
from scipy.signal import argrelextrema
import scipy.stats

In [20]:
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess
from statsmodels import robust

In [21]:
# !pip install dtaidistance

In [22]:
# DTW

# from dtaidistance import dtw
# from dtaidistance import clustering
# from dtaidistance import dtw_visualisation as dtwvis

**Configure Jupyter Notebook**

In [23]:
# Configure Jupyter

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 8]
plt.rcParams['figure.dpi'] = 100

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
from ipywidgets import widgets, interactive

# Configure Google Colab

%load_ext google.colab.data_table

In [24]:
# Text wrap

from IPython.display import HTML

def set_css(info=None):
  '''
  Display a <style> block that makes <pre> elements wrap long lines.

  Registered below as a 'pre_run_cell' callback, so it is called before each
  cell is run.

  info: ignored. IPython documents pre_run_cell callbacks as receiving one
        argument; the default value keeps plain set_css() calls working.
  '''
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))

# NOTE(review): re-running this cell defines a new function object and registers
#   it again - confirm whether an earlier registration should be removed first
get_ipython().events.register('pre_run_cell', set_css)

**Configuration Details Snapshot**

In [25]:
# Snap Shot of Time, Machine, Data and Library/Version Blueprint
# TODO:

# Pick ONE Method (a), (b) or (c) to Get Corpus Textfile

**Choose exactly one of (a), (b) OR (c), not more than one**

## **Connect to Google gDrive**

In [26]:
# Connect to Google gDrive

# Flag to indicate first run through code 
flag_first_run = True

from google.colab import drive, files
drive.mount('/gdrive')
%cd /gdrive/MyDrive/

Mounted at /gdrive
/gdrive/MyDrive


In [55]:
# Select the Corpus subdirectory on your Google gDrive

# Done

# gdrive_subdir = "./research/2021/sa_book_code/books_sa/fbaum_thewonderfulwizardofoz" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/cdickens_achristmascarol" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/cdickens_greatexpectations" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/ddefoe_robinsoncrusoe" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/emforster_howardsend" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/fbaum_thewonderfulwizardofoz" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/fdouglass_narrativelifeofaslave" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/fscottfitzgerald_thegreatgatsby" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/geliot_middlemarch" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/hjames_portraitofalady" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/homer-ewilson_odyssey" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/imcewan_machineslikeme" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/jausten_prideandprejudice" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/jconrad_heartofdarkness" #@param {type:"string"} 
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/jjoyce_portraitoftheartist" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/jkrowling_harrypotter" #@param {type:"string"}
gdrive_subdir = "./research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/mtwain_huckleberryfinn" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/mshelley_frankenstein" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/staugustine_confessions9end" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/tmorrison_beloved" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/vwoolf_tothelighthouse" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/vwoolf_mrsdalloway" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/vwoolf_thewaves" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/vwoolf_orlando" #@param {type:"string"}
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/vnabokov_palefire" #@param {type:"string"}

# Current
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/homer_odyssey" #@param {type:"string"}

# To do
# gdrive_subdir = "./research/2021/sa_book_code/books_sa/geliot_middlemarch" #@param {type:"string"}

CORPUS_SUBDIR = gdrive_subdir
corpus_filename = CORPUS_SUBDIR

# Change to working subdirectory
if flag_first_run == True:
  full_path_str = gdrive_subdir
  flag_first_run = False
else:
  full_path_str = f'/gdrive/MyDrive{gdrive_subdir[1:]}'

%cd $full_path_str


/gdrive/MyDrive/research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway


### **Option (a) Load Corpus Raw Text DataFrames**

In [29]:
# Get DataFrame filenames

# Last path component of the Corpus subdirectory; normpath drops a trailing
#   slash first, so './books/mybook/' still yields 'mybook' instead of ''
corpus_root = os.path.basename(os.path.normpath(gdrive_subdir))
print(corpus_root)

jausten_prideandprejudice


In [56]:
!pwd
!ls -altr *

/gdrive/MyDrive/research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway
-rw------- 1 root root      151 Jan 26  2019 'In Search of Time in Semantic Space.gdoc'
-rw------- 1 root root      151 Feb  1  2019 'ADHO2019 Proust Reply 20190130.gdoc'
-rw------- 1 root root      151 Feb  1  2019 'ADHO2019 Proust Reply.gdoc'
-rw------- 1 root root  1449947 Aug  1 05:47  mproust_guermantes_en.txt
-rw------- 1 root root  2273275 Aug  1 07:28  sum_sentiments_sents_trans_mproust_guermantes.csv
-rw------- 1 root root  2273200 Aug  2 02:52  sum_sentiments_sents_trans_jrowling_thesorcerersstone.csv
-rw------- 1 root root   919306 Aug  2 04:22  sentimenttime_part1_lexrules_simple_zeta.ipynb
-rw------- 1 root root  1416756 Aug  2 13:01  mproust_guermantes_fr.txt
-rw------- 1 root root     2677 Aug  2 14:35 '=4.0'
-rw------- 1 root root  1602228 Aug  2 20:19  sum_sentiments_syuzhetR_4models_sentimenttimeraw_mproust_guermantes_en.csv
-rw------- 1 root root  2133402 Aug  2 20:20  sum_sentiments

In [57]:
# (Optional) Read Corpus Sentence Text Datafiles

# Build the filename from the selected Corpus subdirectory name instead of
#   hard-coding one book, so switching gdrive_subdir is enough to switch corpus
corpus_sents_text_filename = f'corpus_text_sents_raw_{os.path.basename(os.path.normpath(gdrive_subdir))}.csv'

# The unnamed first CSV column is renamed to sent_no
corpus_sents_trans_df = (
    pd.read_csv(corpus_sents_text_filename) # , index_col=[0])
    .rename(columns={'Unnamed: 0':'sent_no'})
)

corpus_sents_trans_df.columns
corpus_sents_trans_df.shape

Index(['sent_no', 'sent_raw'], dtype='object')

(8388, 2)

In [58]:
corpus_sents_trans_df.head()
corpus_sents_trans_df.tail()
corpus_sents_trans_df.info()

Unnamed: 0,sent_no,sent_raw
0,0,The early-morning twitter of the birds sounded...
1,1,Every word from the maids quarters made her ju...
2,2,All this was because we had moved house.
3,3,It is true that the servants in our former hom...
4,4,But now she even made silence the object of he...


Unnamed: 0,sent_no,sent_raw
8383,8383,"And so, after he had gently steered us to the ..."
8384,8384,Now mind you dont let all this damned doctors ...
8385,8385,Theyre fools.
8386,8386,Youre in strapping shape.
8387,8387,Youll live to see us all in our graves!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8388 entries, 0 to 8387
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sent_no   8388 non-null   int64 
 1   sent_raw  8388 non-null   object
dtypes: int64(1), object(1)
memory usage: 131.2+ KB


### **Option (b): Load Previously Computed Transformer Sentiment Datasets**

***Only do this if Transformer sentiment values were already computed for your Corpus and saved in your Google subdirectory as a `sum_sentiments_sents_trans_*.csv` file***

In [30]:
!ls -altr *.csv

-rw------- 1 root root  841102 Sep 16 23:25 sum_sentiments_syuzhetR_4models_jausten_prideandprejudice.csv
-rw------- 1 root root 1325588 Sep 16 23:27 sum_sentiments_sentimentR_7models_jausten_prideandprejudice.csv
-rw------- 1 root root 3853094 Sep 17 00:30 sum_sentiments_sents_syuzhetr_jausten_prideandprejudice.csv
-rw------- 1 root root 5928570 Sep 17 00:30 sum_sentiments_sents_sentimentr_jausten_prideandprejudice.csv
-rw------- 1 root root  712673 Sep 17 00:30 corpus_text_sents_raw_jausten_prideandprejudice.csv
-rw------- 1 root root  686232 Sep 17 00:30 corpus_text_sents_clean_jausten_prideandprejudice.csv
-rw------- 1 root root 9143814 Sep 17 00:30 corpus_sents_baseline_jausten_prideandprejudice.csv
-rw------- 1 root root 1393706 Sep 17 00:30 corpus_sects_baseline_jausten_prideandprejudice.csv
-rw------- 1 root root 3560200 Sep 17 00:30 corpus_parags_baseline_jausten_prideandprejudice.csv
-rw------- 1 root root 1406942 Sep 17 00:30 corpus_chaps_baseline_jausten_prideandprejudice.c

In [31]:
!ls -altr sum_sentiments_sents_trans_*.csv

-rw------- 1 root root 1427267 Sep 17 08:18 sum_sentiments_sents_trans_jausten_prideandprejudice.csv


In [34]:
sum_sentiments_transformer_series = 'sum_sentiments_sents_trans_jausten_prideandprejudice.csv'

corpus_sents_trans_df = pd.read_csv(sum_sentiments_transformer_series, index_col=[0])
corpus_sents_trans_df.head()

Unnamed: 0,sent_no,sent_raw,roberta15lg,yelp,nlptown,huggingface,hinglish,imdb2way,t5imdb50k
0,0,"It is a truth universally acknowledged, that a...",0.981554,0.529613,4.520703,0.995663,1.948635,0.945473,1
1,1,However little known the feelings or views of ...,0.993793,0.292682,2.423463,0.997287,1.78014,0.998224,1
2,2,"My dear Mr. Bennet, said his lady to him one d...",0.993223,0.27987,4.419394,-0.941989,0.91703,0.8791,1
3,3,Mr. Bennet replied that he had not.,-0.994574,0.718647,0.508904,-0.948743,0.556021,0.869533,-1
4,4,"But it is, returned she; for Mrs. Long has jus...",0.997717,3.317981,1.333009,0.971442,0.878002,0.995699,-1


In [35]:
# corpus_sents_trans_df.drop(columns=['Unnamed: 0'], inplace=True)
corpus_sents_trans_df['sent_raw'] = corpus_sents_trans_df['sent_raw'].astype('string')
corpus_sents_trans_df.head()
corpus_sents_trans_df.info()

Unnamed: 0,sent_no,sent_raw,roberta15lg,yelp,nlptown,huggingface,hinglish,imdb2way,t5imdb50k
0,0,"It is a truth universally acknowledged, that a...",0.981554,0.529613,4.520703,0.995663,1.948635,0.945473,1
1,1,However little known the feelings or views of ...,0.993793,0.292682,2.423463,0.997287,1.78014,0.998224,1
2,2,"My dear Mr. Bennet, said his lady to him one d...",0.993223,0.27987,4.419394,-0.941989,0.91703,0.8791,1
3,3,Mr. Bennet replied that he had not.,-0.994574,0.718647,0.508904,-0.948743,0.556021,0.869533,-1
4,4,"But it is, returned she; for Mrs. Long has jus...",0.997717,3.317981,1.333009,0.971442,0.878002,0.995699,-1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5891 entries, 0 to 5890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sent_no      5891 non-null   int64  
 1   sent_raw     5891 non-null   string 
 2   roberta15lg  5891 non-null   float64
 3   yelp         5891 non-null   float64
 4   nlptown      5891 non-null   float64
 5   huggingface  5891 non-null   float64
 6   hinglish     5891 non-null   float64
 7   imdb2way     5891 non-null   float64
 8   t5imdb50k    5891 non-null   int64  
dtypes: float64(6), int64(2), string(1)
memory usage: 460.2 KB


### **Option (c): Upload Corpus Sentiment Transformer Sentiment Datafiles**

***Only do this if your Google subdirectory doesn't already contain a plain text file of your Corpus or you wish to overwrite it and use a newer version***

In [None]:
# Execute this code cell to upload plain text file of corpus
#   Should be *.txt format with paragraphs separated by at least 2 newlines

uploaded = files.upload()

In [None]:
# Verify file was uploaded

# Get uploaded filename
corpus_filename = list(uploaded.keys())[0]
print(f'Uploaded Corpus filename is: {corpus_filename}')
CORPUS_FILENAME = corpus_filename

!ls -al $corpus_filename

# **Configuration (Manual)**

In [None]:
# Verify subdirectory change

!pwd
!ls *.txt

# TODO: Intelligently automate the filling of form based upon directory

/gdrive/My Drive/research/2021/sa_book_code/books_sa/mtwain_huckleberryfinn
mtwain_huckleberryfinn_peaks.txt
mtwain_huckleberryfinn_segs.txt
mtwain_huckleberryfinn.txt
mtwain_huckleberryfinn_valleys.txt
mtwian_huckleberryfinn_sentimentr_jr_peaks.txt
mtwian_huckleberryfinn_sentimentr_jr_valleys.txt


In [None]:
!ls -altr *


-rw------- 1 root root  571284 Feb 17  2021 mtwain_huckleberryfinn_segs.txt
-rw------- 1 root root  598857 Feb 17  2021 mtwain_huckleberryfinn_segs.csv
-rw------- 1 root root  119870 Feb 17  2021 mtwain_huckleberryfinn_sa_vader.csv
-rw------- 1 root root  118496 Feb 17  2021 mtwain_huckleberryfinn_sa_syuzhet.csv
-rw------- 1 root root   34853 Feb 17  2021 mtwain_huckleberryfinn_syuzhet_simple_plots.png
-rw------- 1 root root   30509 Feb 17  2021 mtwain_huckleberryfinn_syuzhet_raw.png
-rw------- 1 root root   38171 Feb 17  2021 mtwain_huckleberryfinn_sentimentr_simple_plots.png
-rw------- 1 root root   37348 Feb 17  2021 mtwain_huckleberryfinn_syuzhet_lps9.png
-rw------- 1 root root   37972 Feb 17  2021 mtwain_huckleberryfinn_sentimentr_lps7.png
-rw------- 1 root root   42473 Feb 17  2021 mtwain_huckleberryfinn_emoarc_peaks_loess10.png
-rw------- 1 root root   39865 Feb 17  2021 mtwain_huckleberryfinn_emoarc_valleys_loess10.png
-rw------- 1 root root   27051 Feb 17  2021 mtwain_hucklebe

In [59]:
!head -n 10 mtwain_huckleberryfinn.txt

head: cannot open 'mtwain_huckleberryfinn.txt' for reading: No such file or directory


In [60]:
# CORPUS_TITLE = 'Beloved' #@param {type:"string"}
# CORPUS_AUTHOR = "Toni Morrison" #@param {type:"string"}
# CORPUS_FILENAME = "tmorrison_beloved.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/tmorrison_beloved"  #@param {type:"string"}

# CORPUS_TITLE = 'A Christmas Carol' #@param {type:"string"}
# CORPUS_AUTHOR = "Charles Dickens" #@param {type:"string"}
# CORPUS_FILENAME = "cdickens_achristmascarol.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/cdickens_achristmascarol"  #@param {type:"string"}

# CORPUS_TITLE = 'Confessions' #@param {type:"string"}
# CORPUS_AUTHOR = "Saint Augustine" #@param {type:"string"}
# CORPUS_FILENAME = "staugustine_confessions9end.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/staugustine_confessions9end"  #@param {type:"string"}

# CORPUS_TITLE = 'Frankenstein' #@param {type:"string"}
# CORPUS_AUTHOR = "Mary Shelley" #@param {type:"string"}
# CORPUS_FILENAME = "mshelley_frankenstein.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/mshelley_frankenstein" #@param {type:"string"}

# CORPUS_TITLE = 'Great Expectations' #@param {type:"string"}
# CORPUS_AUTHOR = "Charles Dickens" #@param {type:"string"}
# CORPUS_FILENAME = "cdickens_greatexpectations.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/cdickens_greatexpectations" #@param {type:"string"}

# CORPUS_TITLE = 'Heart of Darkness' #@param {type:"string"}
# CORPUS_AUTHOR = "Joseph Conrad" #@param {type:"string"}
# CORPUS_FILENAME = "jconrad_heartofdarkness.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/jconrad_heartofdarkness" #@param {type:"string"}

# CORPUS_TITLE = 'Howards End' #@param {type:"string"}
# CORPUS_AUTHOR = "EM Forster" #@param {type:"string"}
# CORPUS_FILENAME = "emforster_howardsend.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/emforster_howardsend" #@param {type:"string"}

# CORPUS_TITLE = 'Huckleberry Finn' #@param {type:"string"}
# CORPUS_AUTHOR = "Mark Twain" #@param {type:"string"}
# CORPUS_FILENAME = "mtwain_huckleberryfinn.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/mtwain_huckleberryfinn" #@param {type:"string"}

# CORPUS_TITLE = 'Machines Like Me' #@param {type:"string"}
# CORPUS_AUTHOR = "Ian McEwan" #@param {type:"string"}
# CORPUS_FILENAME = "imcewan_machineslikeme.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/imcewan_machineslikeme" #@param {type:"string"}

# CORPUS_TITLE = 'Middlemarch' #@param {type:"string"}
# CORPUS_AUTHOR = "George Eliot" #@param {type:"string"}
# CORPUS_FILENAME = "geliot_middlemarch_wprelude.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/geliot_middlemarch"  #@param {type:"string"}

# CORPUS_TITLE = 'Frankenstein' #@param {type:"string"}
# CORPUS_AUTHOR = "Mary Shelley" #@param {type:"string"}
# CORPUS_FILENAME = "mshelley_frankenstein.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/mshelley_frankenstein" #@param {type:"string"}

# CORPUS_TITLE = 'Mrs. Dalloway' #@param {type:"string"}
# CORPUS_AUTHOR = "Virginia Woolf" #@param {type:"string"}
# CORPUS_FILENAME = "vwoolf_mrsdalloway.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/vwoolf_mrsdalloway" #@param {type:"string"}

# CORPUS_TITLE = 'Narrative Life of Frederick Douglass' #@param {type:"string"}
# CORPUS_AUTHOR = "Frederick Douglass" #@param {type:"string"}
# CORPUS_FILENAME = "fdouglass_narrativelifeofaslave.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/fdouglass_narrativelifeofaslave"  #@param {type:"string"}

# CORPUS_TITLE = 'Orlando' #@param {type:"string"}
# CORPUS_AUTHOR = "Virginia Woolf" #@param {type:"string"}
# CORPUS_FILENAME = "vwoolf_orlando.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/vwoolf_orlando" #@param {type:"string"}

# CORPUS_TITLE = 'Palefire - Commentary' #@param {type:"string"}
# CORPUS_AUTHOR = "Vladimir Nabokov" #@param {type:"string"}
# CORPUS_FILENAME = "vnabokov_palefire_commentary.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/vnabokov_palefire" #@param {type:"string"}

# CORPUS_TITLE = 'Portrait of a Lady' #@param {type:"string"}
# CORPUS_AUTHOR = "Henry James" #@param {type:"string"}
# CORPUS_FILENAME = "hjames_portraitofalady.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/hjames_portraitofalady" #@param {type:"string"}

# CORPUS_TITLE = 'Portrait of the Artist as a Young Man' #@param {type:"string"}
# CORPUS_AUTHOR = "James Joyce" #@param {type:"string"}
# CORPUS_FILENAME = "jjoyce_portraitoftheartist.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/jjoyce_portraitoftheartist" #@param {type:"string"}

# CORPUS_TITLE = 'Pride and Prejudice' #@param {type:"string"}
# CORPUS_AUTHOR = "Jane Austen" #@param {type:"string"}
# CORPUS_FILENAME = "jausten_prideandprejudice.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/jausten_prideandprejudice" #@param {type:"string"}

# CORPUS_TITLE = 'Robinson Crusoe' #@param {type:"string"}
# CORPUS_AUTHOR = "Daniel Defoe" #@param {type:"string"}
# CORPUS_FILENAME = "ddefoe_robinsoncrusoe.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/ddefoe_robinsoncrusoe" #@param {type:"string"}

# CORPUS_TITLE = 'The Great Gatsby' #@param {type:"string"}
# CORPUS_AUTHOR = "F. Scott Fitzgerald" #@param {type:"string"}
# CORPUS_FILENAME = "fscottfitzgerald_thegreatgatsby.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/fscottfitzgerald_thegreatgatsby" #@param {type:"string"}

# CORPUS_TITLE = 'The Sorcerers Stone' #@param {type:"string"}
# CORPUS_AUTHOR = "J.K. Rowling" #@param {type:"string"}
# CORPUS_FILENAME = "jkrowling_1sorcerersstone.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/jkrowling_1sorcerersstone" #@param {type:"string"}

# CORPUS_TITLE = 'The Wonderful Wizard of Oz' #@param {type:"string"}
# CORPUS_AUTHOR = "Frank Baum" #@param {type:"string"}
# CORPUS_FILENAME = "fbaum_thewonderfulwizardofoz.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/fbaum_thewonderfulwizardofoz" #@param {type:"string"}

# CORPUS_TITLE = 'The Waves' #@param {type:"string"}
# CORPUS_AUTHOR = "Virginia Woolf" #@param {type:"string"}
# CORPUS_FILENAME = "vwoolf_thewaves.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/vwoolf_thewaves" #@param {type:"string"}

# CORPUS_TITLE = 'To The Lighthouse' #@param {type:"string"}
# CORPUS_AUTHOR = "Virginia Woolf" #@param {type:"string"}
# CORPUS_FILENAME = "vwoolf_tothelighthouse.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/vwoolf_tothelighthouse" #@param {type:"string"}

# CORPUS_TITLE = 'The Odyssey' #@param {type:"string"}
# CORPUS_AUTHOR = "Homer SButler" #@param {type:"string"}
# CORPUS_FILENAME = "sbutler_odyssey.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/sbutler_odyssey"  #@param {type:"string"}

# CORPUS_TITLE = 'The Odyssey' #@param {type:"string"}
# CORPUS_AUTHOR = "Homer trans. E.Wilson" #@param {type:"string"}
# CORPUS_FILENAME = "homer-ewilson_odyssey.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/homer-ewilson_odyssey"  #@param {type:"string"}

# CORPUS_TITLE = 'The Guermantes Way - English' #@param {type:"string"}
# CORPUS_AUTHOR = "Marcel Proust" #@param {type:"string"}
# CORPUS_FILENAME = "mproust_3guermantesway_mtreharne_en.txt" #@param {type:"string"}
# CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/mproust_time"  #@param {type:"string"}

CORPUS_TITLE = 'The Guermantes Way' #@param {type:"string"}
CORPUS_AUTHOR = "Marcel Proust trans M.Treharne" #@param {type:"string"}
CORPUS_FILENAME = "mproust-mtreharne_3guermantesway.txt" #@param {type:"string"}
CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway"  #@param {type:"string"}

CORPUS_LANGUAGE = "English" #@param ["English", "French"]

CHAPTER_HEADINGS = "CHAPTER" #@param ["CHAPTER", "BOOK", "None"]
CHAPTER_NUMBERING = "Roman (I,II,...)" #@param ["Arabic (1,2,...)", "Roman (I,II,...)"]
SECTION_HEADINGS = "None" #@param ["SECTION (ArabicNo)", "SECTION (RomanNo)", "----- (Hyphens)", "None"]

LEXICONS_SUBDIR = "./research/2021/sa_book_code/books_sa/lexicons" #@param {type:"string"}

CORPUS_FULL = f'{CORPUS_TITLE} by: {CORPUS_AUTHOR}'

PLOT_OUTPUT = "Major" #@param ["None", "Major", "All"]

FILE_OUTPUT = "Major" #@param ["None", "Major", "All"]


gdrive_subdir = CORPUS_SUBDIR
corpus_filename = CORPUS_FILENAME
CORPUS_LANGUAGE = CORPUS_LANGUAGE.lower()
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
author_abbr_str = (CORPUS_AUTHOR.split(' ')[0][0]+CORPUS_AUTHOR.split(' ')[1]).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
title_str = re.sub(r'[^A-Za-z0-9]','', title_str).lower()

print(f'\nWorking Corpus Datafile: ------------------------------ \n\n    {CORPUS_SUBDIR}')
print(f'\nFull Corpus Title/Author: ------------------------------ \n\n    {CORPUS_FULL}')


# Select the chapter-heading RegEx from the CHAPTER_HEADINGS / CHAPTER_NUMBERING
#   form values. On an illegal value only an error is printed and pattern_chap
#   is NOT assigned by this cell.
if CHAPTER_HEADINGS not in ('CHAPTER', 'BOOK', "None"):
  print(f'ERROR: Illegal CHAPTER_HEADINGS value = {CHAPTER_HEADINGS}')

elif CHAPTER_HEADINGS == "None":
  # "None" still assigns the Arabic-numbered CHAPTER pattern; numbering is not checked
  pattern_chap = r'CHAPTER [0123456789]{1,2}[.]?[^\n]*'

elif CHAPTER_NUMBERING not in ("Arabic (1,2,...)", "Roman (I,II,...)"):
  print(f'ERROR: Illegal CHAPTER_NUMBERING value = {CHAPTER_NUMBERING}')

else:
  # Lookup keyed by (heading keyword, numbering style)
  pattern_chap = {
      ('CHAPTER', "Arabic (1,2,...)"): r'CHAPTER [0123456789]{1,2}[.]?[^\n]*',
      ('CHAPTER', "Roman (I,II,...)"): r'CHAPTER[\s]{1,5}[IVXL]{1,10}[.:]?[\s]+',
      ('BOOK', "Arabic (1,2,...)"): r'BOOK [0123456789]{1,2}[.]?[^\n]*',
      # (original note: problems with embedded 'Book')
      ('BOOK', "Roman (I,II,...)"): r'[\s]*BOOK[\s]{1,5}[IVXL]{1,10}[.:]?[\s]+',
  }[(CHAPTER_HEADINGS, CHAPTER_NUMBERING)]

# Section RegEx Pattern: look up the pattern for the selected heading style;
#   'None' and any unrecognized value fall back to the default pattern
# TODO: [^\n] gets parsed into [^\\n] causing problems, so simplify
pattern_sect = {
    'SECTION (ArabicNo)': r'SECTION [0123456789]{1,2}[.:]?[^\n]*',
    'SECTION (RomanNo)': r'SECTION [IVXL]{1,10}[.:]?[^\n\r]+',
    '----- (Hyphens)': r'^[- ]{3,}[^\n]*',
}.get(SECTION_HEADINGS, 'SECTION [0123456789]{1,2}[^\n]*')

# Report anything that is not one of the four form choices
if SECTION_HEADINGS not in ('SECTION (ArabicNo)', 'SECTION (RomanNo)', '----- (Hyphens)', 'None'):
  print(f'ERROR: Illegal SECTION_HEADING value = {SECTION_HEADINGS}')

print(f'\nCHAPTER Headings: ------------------------------ \n\n    {CHAPTER_HEADINGS}')

print(f'\nSECTION Headings: ------------------------------ \n\n    {SECTION_HEADINGS}')


print(f'\nCorpus file information: ------------------------------ \n')
!ls -al $CORPUS_FILENAME

# Verify contents of Corpus File is Correctly Formatted
#   
# TODO: ./utils/verify_format.py



Working Corpus Datafile: ------------------------------ 

    ./research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway

Full Corpus Title/Author: ------------------------------ 

    The Guermantes Way by: Marcel Proust trans M.Treharne

CHAPTER Headings: ------------------------------ 

    CHAPTER

SECTION Headings: ------------------------------ 

    None

Corpus file information: ------------------------------ 

-rw------- 1 root root 1413451 Sep 17 21:59 mproust-mtreharne_3guermantesway.txt


In [61]:
!pwd

/gdrive/My Drive/research/2021/sa_book_code/books_sa/mproust-mtreharne_3guermantesway


# **Utility Functions**

## **General Setup**

In [62]:
from transformers import pipeline

## **Sentiment Translations**

In [63]:
"""
def adj_polarityprobability2float(pol_str, prob_fl):
  '''
  Given a Polarity string (Negative or Positive) and a Probability float (0.0-1.0)
  Return a Sentiment float value (-1.0 to 1.0)
  '''
  sign_fl = 1.0
  if pol_str.lower().startswith('neg'):
    # print(f'pol_str: {pol_str} is Negative')
    sign_fl = -1.0
  elif pol_str.lower().startswith('pos'):
    # print(f'pol_str: {pol_str} is Positive')
    pass
  else:
    print(f'ERROR: pol_str: {pol_str} is neither Negative nor Positive')
    sign_fl = 0.0

  return sign_fl * prob_fl

# Test
# polprob2sentiment('Positive', 0.91)
""";

In [64]:
def twoway_probability2sentiment(text_str, sentiment_2polarity_fn, pol_labels=['negative','positive']):
  '''
  Convert a two-class (negative/positive) prediction into one signed score.

  text_str: the text to score
  sentiment_2polarity_fn: callable whose result is a list; the first item
    holds a 'label' string and a 'score' probability (0.0 to 1.0)
  pol_labels: [negative_label, positive_label] as emitted by the model,
    e.g. ['LABEL_0','LABEL_1'], ['NEGATIVE','POSITIVE'] or ['0','1']

  Returns -score when the predicted label matches the negative label and
  +score when it matches the positive label (-1.0 to 1.0).
  Matching ignores case and is a substring test: the predicted label must be
  contained in the expected label (so 'Neg' matches 'negative').
  If neither label matches, an error is printed and -99 is returned.
  '''

  top_prediction = sentiment_2polarity_fn(text_str)[0]
  pol_str = top_prediction['label']
  magnitude_fl = float(top_prediction['score'])
  predicted_lc = pol_str.lower()

  # Try the negative label first, then the positive one; the expected labels
  # are only looked up when actually needed
  for label_idx, direction_fl in ((0, -1.0), (1, 1.0)):
    if predicted_lc in pol_labels[label_idx].lower():
      return direction_fl * magnitude_fl

  print(f'ERROR polarity string: {pol_str} must be one of two values (e.g. [Nn]egative|[Pp]ositive)')
  return -99

# Test
# test_fl = wrapper_polprob2sentiment('I hate your guts you bastard!') # sentiment_analysis('section')[0]['label'],sentiment_analysis('section')[0]['score']))
# print(f'test_fl: {test_fl}')

In [65]:
"""

def twoway_probability2sentiment(text_str, sentiment_2polarity_fn):
  '''
  Given a text string and sentiment_fn that returns ['negative|positive', float(0.0-1.0)]
  Get return a sign adjusted sentiment score -1.0 to 1.0
  '''
  model_score = sentiment_2polarity_fn(text_str)
  pol_str = model_score[0]['label']
  # print(f'pol_str = {pol_str} and is type:{type(pol_str)}')
  score_fl = float(model_score[0]['score'])
  # print(f'score_fl = {score_fl} and is type{type(score_fl)}')

  if (pol_str.lower().startswith('neg')) | (pol_str in ['LABEL_1','0']):
    # print('negative')
    sign_fl = -1.0
  elif (pol_str.lower().startswith('pos')) | (pol_str in ['LABEL_2','1']):
    # print('positive')
    sign_fl = 1.0
  else:
    print(f'ERROR polarity string: {pol_str} must be one of two values (e.g. [Nn]egative|[Pp]ositive)')

  return sign_fl * score_fl

# Test
# test_fl = wrapper_polprob2sentiment('I hate your guts you bastard!') # sentiment_analysis('section')[0]['label'],sentiment_analysis('section')[0]['score']))
# print(f'test_fl: {test_fl}')

""";

In [66]:
def threeway_probability2sentiment(text_str, sentiment_2polarity_fn):
  '''
  Convert a three-class (negative/neutral/positive) prediction into one signed score.

  text_str: the text to score
  sentiment_2polarity_fn: callable whose result is a list; the first item
    holds a 'label' string and a 'score' probability (0.0 to 1.0)

  The predicted label selects a band and its probability places the score
  inside that band:
    negative ('neg...' any case, or 'LABEL_1') -> -1.0 - score   (-2.0 to -1.0)
    neutral  ('neu...' any case, or 'LABEL_0') -> -score if score < 0.5 else +score
    positive ('pos...' any case, or 'LABEL_2') -> +1.0 + score   ( 1.0 to  2.0)

  Returns the adjusted score (-2.0 to 2.0 overall).
  If the label is not recognized, an error is printed and the sentinel -99.0
  is returned (same convention as twoway_probability2sentiment).
  '''

  # NOTE: an unreachable (`if False:`) Pysentimiento special case that relied
  #       on an undefined `analyzer` object was removed from this function.

  model_score = sentiment_2polarity_fn(text_str)
  pol_str = model_score[0]['label']
  score_fl = float(model_score[0]['score'])
  pol_lc = pol_str.lower()

  if pol_lc.startswith('neu') or pol_str in ('NEU', 'LABEL_0'):
    # Neutral stays around zero: weak neutral predictions fall below zero,
    # confident ones above
    sign_fl = -1.0 if score_fl < 0.5 else 1.0
    adj_base = 0.0
  elif pol_lc.startswith('neg') or pol_str in ('NEG', 'LABEL_1'):
    sign_fl = -1.0
    adj_base = -1.0
  elif pol_lc.startswith('pos') or pol_str in ('POS', 'LABEL_2'):
    sign_fl = 1.0
    adj_base = 1.0
  else:
    # Previously this path fell through to the final computation with
    # adj_base undefined and raised UnboundLocalError
    print(f'ERROR polarity string: {pol_str} must be one of three values (e.g. negative|neutral|positive)')
    return -99.0

  return (sign_fl * score_fl) + adj_base

# Test
# test_fl = wrapper_polprob2sentiment('I hate your guts you bastard!') # sentiment_analysis('section')[0]['label'],sentiment_analysis('section')[0]['score']))
# print(f'test_fl: {test_fl}')

In [67]:
def fiveway_probability2sentiment(text_str, sentiment_5star_fn):
  '''
  Convert a five-class star rating prediction into one score from 0.0 to 5.0.

  text_str: the text to score
  sentiment_5star_fn: callable whose result is a list; the first item holds
    a 'label' ('1 star' .. '5 stars', or 'LABEL_0' .. 'LABEL_4') and a
    'score' probability

  Each rating owns a 1.0-wide band (1 star -> 0.0, ..., 5 stars -> 4.0) and
  the probability is added on top of the band start.
  An unrecognized label prints an error and is scored from the middle band (2.0).
  '''

  # (accepted labels, band start) for each of the five ratings
  star_bands = (
    (('1 star', 'LABEL_0'), 0.0),
    (('2 stars', 'LABEL_1'), 1.0),
    (('3 stars', 'LABEL_2'), 2.0),
    (('4 stars', 'LABEL_3'), 3.0),
    (('5 stars', 'LABEL_4'), 4.0),
  )

  top_prediction = sentiment_5star_fn(text_str)[0]
  pol_str = top_prediction['label']

  for accepted_labels, band_start in star_bands:
    if pol_str in accepted_labels:
      score_base = band_start
      break
  else:
    # No band matched the label
    print(f"ERROR: polarity string = {pol_str} must be in [1-5] 'stars'")
    score_base = 2.0

  return score_base + top_prediction['score']

# Test
# test_fl = wrapper_polprob2sentiment('I hate your guts you bastard!') # sentiment_analysis('section')[0]['label'],sentiment_analysis('section')[0]['score']))
# print(f'test_fl: {test_fl}')


# **Transformer Sentiment Models**

* https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb


In [None]:
# Be sure you have loaded in Corpus Sentences into corpus_sents_trans_df at top of Notebook

In [68]:
corpus_sents_trans_df.head()

Unnamed: 0,sent_no,sent_raw
0,0,The early-morning twitter of the birds sounded...
1,1,Every word from the maids quarters made her ju...
2,2,All this was because we had moved house.
3,3,It is true that the servants in our former hom...
4,4,But now she even made silence the object of he...


## **(2-way) RoBERTa Large 15 Datasets**

* https://huggingface.co/siebert/sentiment-roberta-large-english
* https://huggingface.co/roberta-base 

In [69]:
from transformers import pipeline

sentiment_analysis = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
print(sentiment_analysis("I love this!"))

Downloading:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9988656044006348}]


In [70]:
# Direct Test

sentiment_analysis('I love wonderful good things')
print('\n')
sentiment_analysis('I hate your guts you filthy bastard')
print('\n')
sentiment_analysis('It is')

[{'label': 'POSITIVE', 'score': 0.9985595345497131}]





[{'label': 'NEGATIVE', 'score': 0.9992327690124512}]





[{'label': 'POSITIVE', 'score': 0.9937145113945007}]

In [71]:
# Test

model_adj_score = twoway_probability2sentiment('I love wonderful good things', sentiment_analysis, pol_labels=['NEGATIVE','POSITIVE'])
model_adj_score
print('\n')
model_adj_score = threeway_probability2sentiment('It is not good', sentiment_analysis) # , pol_labels=['NEGATIVE','POSITIVE'])
model_adj_score

0.9985595345497131





-1.999458372592926

In [72]:
# Test

model_adj_score = twoway_probability2sentiment('I hate your guts you bastard', sentiment_analysis, pol_labels=['NEGATIVE','POSITIVE'])
model_adj_score


-0.9986940026283264

In [73]:
# Verify Transformer DataFrame content

corpus_sents_trans_df.head(2)
corpus_sents_trans_df.tail(2)


Unnamed: 0,sent_no,sent_raw
0,0,The early-morning twitter of the birds sounded...
1,1,Every word from the maids quarters made her ju...


Unnamed: 0,sent_no,sent_raw
8386,8386,Youre in strapping shape.
8387,8387,Youll live to see us all in our graves!


In [74]:
%%time

# Score every corpus sentence with the RoBERTa-large pipeline and store the result in column 'roberta15lg'.

# NOTE: ~15-20m
#       28m50s (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       ? m ? s mins (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       6m10s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       8m44s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       24m24s (20210917 at 07:37) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       51m22s (20210915 at 13:26) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# corpus_sents_trans_df['roberta15lg'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], sentiment_analysis, pol_labels=['NEGATIVE','POSITIVE']))
# corpus_sents_trans_df['roberta15lg'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], sentiment_analysis, pol_labels=['Negative','Positive']))
# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
# NOTE(review): this pipeline emits POSITIVE/NEGATIVE labels, yet it is scored with the 3-way mapper,
#   so values land in [-2,-1] or [1,2] (see the -1.99... outputs below) - confirm this is intended
corpus_sents_trans_df['roberta15lg'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threeway_probability2sentiment(x[:510], sentiment_analysis)) # , pol_labels=['Negative','Positive']))
corpus_sents_trans_df.head()


CPU times: user 1h 42min 31s, sys: 33.1 s, total: 1h 43min 4s
Wall time: 51min 22s


In [75]:
corpus_sents_trans_df.head(2)

Unnamed: 0,sent_no,sent_raw,roberta15lg
0,0,The early-morning twitter of the birds sounded...,-1.999154
1,1,Every word from the maids quarters made her ju...,-1.997634


In [76]:
author_abbr_str

'mproust'

In [77]:
title_str

'theguermantesway'

In [78]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(5-way) Yelp Sentiment Finetuned**

* https://huggingface.co/gilf/english-yelp-sentiment

In [79]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("gilf/english-yelp-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("gilf/english-yelp-sentiment")

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [80]:
sentiment_analysis = pipeline("sentiment-analysis",model="gilf/english-yelp-sentiment")


In [81]:
# Test Directly

test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': 'LABEL_4', 'score': 0.7975343465805054}]





[{'label': 'LABEL_0', 'score': 0.33993467688560486}]





[{'label': 'LABEL_2', 'score': 0.28286439180374146}]

In [82]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 4.547934353351593
    Text: I love wonderful good things

Neutral Sentence Polarity: 2.7140876054763794
    Text: It is what it is

Negative Sentence Polarity: 0.9732653498649597
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [83]:
%%time

# Score every corpus sentence with the Yelp 5-star pipeline and store the result in column 'yelp'.

# NOTE: 2 minutes up
#       7m53s (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       4m06s (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       1m39s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       2m15s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       6m49s (20210917 at 07:37) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       14m14s (20210915 at 13:26) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
# sentiment_analysis is whichever pipeline was assigned last (the name is reused for several models
#   in this notebook), so this cell must run right after the Yelp pipeline is created
corpus_sents_trans_df['yelp'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: fiveway_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()


CPU times: user 28min 21s, sys: 13.1 s, total: 28min 34s
Wall time: 14min 14s


In [84]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(5-way) MULTILINGUAL NLPTown BERT**

* https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

In [85]:
from transformers import pipeline

nlptown_sentiment_analysis = pipeline("sentiment-analysis",model="nlptown/bert-base-multilingual-uncased-sentiment")
print(nlptown_sentiment_analysis("I love this!"))

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

[{'label': '5 stars', 'score': 0.9236246943473816}]


In [86]:
sentiment_analysis = pipeline("sentiment-analysis",model="nlptown/bert-base-multilingual-uncased-sentiment")

In [87]:
# Test Directly
test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': '5 stars', 'score': 0.9404123425483704}]





[{'label': '1 star', 'score': 0.3800172507762909}]





[{'label': '1 star', 'score': 0.6309829354286194}]

In [88]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 4.767589509487152
    Text: I love wonderful good things

Neutral Sentence Polarity: 4.464488178491592
    Text: It is what it is

Negative Sentence Polarity: 0.7658354043960571
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [89]:
# Test Directly

nlptown_sentiment_analysis('I love wonderful good things')
print('\n')
test_result = nlptown_sentiment_analysis('I hate your guts you filthy bastard')
test_result[0]['label']

[{'label': '5 stars', 'score': 0.7675895094871521}]





'1 star'

In [90]:
# Test Indirectly

fiveway_probability2sentiment('I love wonderful good things', nlptown_sentiment_analysis)
print('\n')
fiveway_probability2sentiment('I hate your stinking guts you filthy lying stealing cheating bastard.', nlptown_sentiment_analysis)

4.767589509487152





0.7658354043960571

In [91]:
%%time

# Score every corpus sentence with the NLPTown 5-star pipeline and store the result in column 'nlptown'.

# NOTE: ~5-7 minutes runtime
#       28m50s (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       4m13s (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       1m41s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       2m15s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       6m48s (20210917 at 07:37) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       14m25s (20210915 at 13:26) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
# sentiment_analysis is whichever pipeline was assigned last (the name is reused for several models
#   in this notebook), so this cell must run right after the NLPTown pipeline is created
corpus_sents_trans_df['nlptown'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: fiveway_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()


CPU times: user 28min 44s, sys: 13 s, total: 28min 57s
Wall time: 14min 25s


In [None]:
# END

In [None]:
# import torch

In [None]:
"""
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
""";

In [None]:
"""

# Predict Tokens
tokens = tokenizer.encode("It wasn't the worst i've seen, in fact, it was the opposite", return_tensors='pt')
# tokens[0]
# tokenizer.decode(tokens[0])
result = model(tokens)
result

""";

In [None]:
"""

predict_sentiment = int(torch.argmax(result.logits))+1
predict_sentiment

""";

In [None]:
"""

def nlptown_sentiment_score(text):
  '''
  Given a text string (sentence or paragraph)
  Return a floating point sentiment value
  '''

  # tokens = tokenizer.encode(text, return_tensors='pt')
  # result = model(tokens)
  # sentiment_int = int(torch.argmax(result.logits))+1
  # sentiment_fl = sentiment_int + result.logits[sentiment_int-1]
  # return sentiment_fl

  tokens = tokenizer.encode(text, return_tensors='pt')
  result = model(tokens)
  type(result)
  prob_ls = list(result.logits)[0].tolist()
  # print(f'prob_ls: {prob_ls}')
  # prob_ls_sum = sum(prob_ls)
  prob_ls_sum = sum(map(abs, prob_ls))
  prob_norm_ls = [abs(i/prob_ls_sum) for i in prob_ls]
  # prob_ls_min = min(prob_ls)
  # prob_ls_max = max(prob_ls)
  # prob_norm_ls = [(x-prob_ls_min)/(prob_ls_max-prob_ls_min) for x in prob_ls]
  # print(f'prob_norm_ls {prob_norm_ls}')
  prob_int = int(torch.argmax(result.logits))
  # print(f'prob_int {prob_int}')
  prob_frac = abs(float(prob_norm_ls[prob_int]))
  # print(f'prob_frac {prob_frac}')
  
  return prob_int + prob_frac # int(torch.argmax(result.logits))+1
""";

In [None]:
# nlptown_sentiment_score('i love the smell of beautiful flowers, the make me happy')

In [None]:
%time

# NOTE: 10m Long-running process

# Calculate Sentence Sentiment Scores using the NLPTown BERT fine-grained, fine-tuned, multi-lingual model

# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

# This a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews in 
#    six languages: English, Dutch, German, French, Spanish and Italian. 
#    It predicts the sentiment of the review as a number of stars (between 1 and 5).

# corpus_sents_df['nlptown'] = corpus_sents_df['sent_raw'].astype('str').apply(lambda x: nlptown_sentiment_score(x))

In [92]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(2-way) Huggingface DistilBERT SST-2**

* https://www.machinecurve.com/index.php/2020/12/23/easy-sentiment-analysis-with-machine-learning-and-huggingface-transformers/

* https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english 

In [93]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [94]:
results = classifier(["We are very happy to show you the 🤗 Transformers library.",
                      "We hope you don't hate it."])

for result in results:
  print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

print('\n')
type(results[0])

label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309




dict

In [95]:
# Test Directly

test_result_pos = classifier('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = classifier('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = classifier('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': 'POSITIVE', 'score': 0.9998840093612671}]





[{'label': 'NEGATIVE', 'score': 0.9996901750564575}]





[{'label': 'NEGATIVE', 'score': 0.9387207627296448}]

In [96]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = twoway_probability2sentiment(sentence_str, classifier, pol_labels=['NEGATIVE','POSITIVE'])
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'is'
polarity = twoway_probability2sentiment(sentence_str, classifier, pol_labels=['NEGATIVE','POSITIVE'])
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = twoway_probability2sentiment(sentence_str, classifier, pol_labels=['NEGATIVE','POSITIVE'])
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}');


Positive Sentence Polarity: 0.9998846054077148
    Text: I love wonderful good things

Neutral Sentence Polarity: 0.9274749159812927
    Text: is

Negative Sentence Polarity: -0.9986554384231567
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [97]:
%%time

# Score every corpus sentence with the default DistilBERT SST-2 pipeline (`classifier`) and store the result in column 'huggingface'.

# NOTE: ~5 minutes runtime
#       28m50s (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       2m00s (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       0m50s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       1m08s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       3m25s (20210917 at 07:37) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       7m07s (20210915 at 13:26) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
corpus_sents_trans_df['huggingface'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], classifier, pol_labels=['NEGATIVE','POSITIVE']))
corpus_sents_trans_df.head()

CPU times: user 14min 10s, sys: 6.82 s, total: 14min 17s
Wall time: 7min 7s


In [98]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(3-way) BERT Multilingual Mixed Code Hinglish**

* https://huggingface.co/rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment

In [99]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment")

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712M [00:00<?, ?B/s]

In [100]:
sentiment_analysis = pipeline("sentiment-analysis",model="rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment")


In [101]:
# Test Directly

test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': 'LABEL_2', 'score': 0.9715988636016846}]





[{'label': 'LABEL_0', 'score': 0.8095260262489319}]





[{'label': 'LABEL_1', 'score': 0.9319002032279968}]

In [102]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 1.971106469631195
    Text: I love wonderful good things

Neutral Sentence Polarity: 1.7358180284500122
    Text: It is what it is

Negative Sentence Polarity: -1.8960636258125305
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [105]:
corpus_filename

'mproust-mtreharne_3guermantesway.txt'

In [103]:
%%time

# Score every corpus sentence with the 3-way code-mixed BERT pipeline and store the result in column 'hinglish'.

# NOTE: ~6-10 minutes runtime
#       7m17s (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       4m15s (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       1m44s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       6m53s (20210917 at 07:37) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       14m47s (20210917 at 07:37) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
# sentiment_analysis is whichever pipeline was assigned last (the name is reused for several models
#   in this notebook), so this cell must run right after the code-mixed pipeline is created
corpus_sents_trans_df['hinglish'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threeway_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()

CPU times: user 29min 27s, sys: 13 s, total: 29min 40s
Wall time: 14min 47s


In [104]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(2-way) IMDB Sentiment**

* https://huggingface.co/abhishek/autonlp-imdb_sentiment_classification-31154 (metrics)

In [112]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("abhishek/autonlp-imdb_sentiment_classification-31154")

model = AutoModelForSequenceClassification.from_pretrained("abhishek/autonlp-imdb_sentiment_classification-31154")

Downloading:   0%|          | 0.00/283 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/816 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [113]:
sentiment_analysis = pipeline("sentiment-analysis",model="abhishek/autonlp-imdb_sentiment_classification-31154")

In [114]:
# Test Directly

test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': '1', 'score': 0.9978368878364563}]





[{'label': '0', 'score': 0.9467214345932007}]





[{'label': '0', 'score': 0.9924196600914001}]

In [115]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = twoway_probability2sentiment(sentence_str, sentiment_analysis, pol_labels=['0','1'])
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = twoway_probability2sentiment(sentence_str, sentiment_analysis, pol_labels=['0','1'])
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = twoway_probability2sentiment(sentence_str, sentiment_analysis, pol_labels=['0','1'])
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 0.9855077862739563
    Text: I love wonderful good things

Neutral Sentence Polarity: 0.9945908784866333
    Text: It is what it is

Negative Sentence Polarity: -0.7816630601882935
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [116]:
corpus_sents_trans_df.columns

Index(['sent_no', 'sent_raw', 'roberta15lg', 'yelp', 'nlptown', 'huggingface',
       'hinglish'],
      dtype='object')

In [117]:
%%time

# Score every corpus sentence with the 2-way IMDB pipeline (labels '0'/'1') and store the result in column 'imdb2way'.

# NOTE: ~5 minutes runtime
#       3m33s mins (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       1m58s mins (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       0m50s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       3m24s (20210915 at 13:26) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       6m47s (20210915 at 13:26) Colab Pro: GPU+RAM (mproust-mtreharne_3guermantesway)

# x[:510] keeps only the first 510 characters of each sentence (a character slice, not a token count)
#   - presumably to stay within the model's input limit; TODO confirm
# sentiment_analysis is whichever pipeline was assigned last (the name is reused for several models
#   in this notebook), so this cell must run right after the IMDB pipeline is created
corpus_sents_trans_df['imdb2way'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], sentiment_analysis, pol_labels=['0','1']))
corpus_sents_trans_df.head()

CPU times: user 14min 15s, sys: 6.71 s, total: 14min 21s
Wall time: 7min 9s


In [118]:
corpus_sents_trans_df.columns

Index(['sent_no', 'sent_raw', 'roberta15lg', 'yelp', 'nlptown', 'huggingface',
       'hinglish', 'imdb2way'],
      dtype='object')

In [119]:
# Save Preprocessed Corpus Sentences DataFrame
# Checkpoint: write the sentence DataFrame, with the sentiment columns computed so far, to CSV

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
# Title collapsed to lowercase with all whitespace removed
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp (YYYYMMDD_HHMM); it is not part of the filename built below
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

# Relative path: the file is written to the current working directory and replaces any existing file of that name
corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


## **(Binary) T5-Small 50k Finetuned IMDB Sentiment Extraction**

* https://huggingface.co/mrm8488/t5-small-finetuned-imdb-sentiment 
* https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-imdb-sentiment")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-small-finetuned-imdb-sentiment")

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
def get_t5imdb50k_sentiment(text, t5_tokenizer=None, t5_model=None):
  '''
  Classify a plain text string with the T5 IMDB sentiment model.

  Args:
    text: plain text string to classify.
    t5_tokenizer: optional tokenizer exposing encode()/decode(); when None
      the notebook-level `tokenizer` is used.
    t5_model: optional seq2seq model exposing generate(); when None the
      notebook-level `model` is used.

  Returns:
    1 if the decoded label contains 'positive',
    -1 if it contains 'negative',
    0 for any other decoded label.
  '''

  # Other cells in this notebook rebind the global names `tokenizer` and
  # `model` to different checkpoints, so callers may pin the T5 objects
  # explicitly instead of relying on whichever cell ran last.
  tok = tokenizer if t5_tokenizer is None else t5_tokenizer
  net = model if t5_model is None else t5_model

  # The end-of-sequence marker is appended to the text by hand
  input_ids = tok.encode(text + '</s>', return_tensors='pt')

  # Generation is capped at two tokens
  output = net.generate(input_ids=input_ids,
               max_length=2)

  # Only the first generated sequence is used
  label = tok.decode(output[0])

  if 'positive' in label:
    return 1
  if 'negative' in label:
    return -1
  return 0

# Test
# get_sentiment("I dislike a lot that film")

In [None]:
# Test Directly

test_result_pos = get_t5imdb50k_sentiment('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = get_t5imdb50k_sentiment('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = get_t5imdb50k_sentiment('I hate bad evil dislike')
test_result_neg # [0]['label']

1





-1





-1

In [None]:
# Test Indirectly

# Production
sentence_str = 'I love wonderful good things'
polarity = get_t5imdb50k_sentiment(sentence_str)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'This'
polarity = get_t5imdb50k_sentiment(sentence_str)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = get_t5imdb50k_sentiment(sentence_str)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 1
    Text: I love wonderful good things

Neutral Sentence Polarity: 0
    Text: This

Negative Sentence Polarity: -1
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [None]:
%%time

# NOTE: ~5 minutes runtime
#       3m16s mins (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       1m50s mins (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       0m46s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (mtwain huckleberryfinn)
#       3m34s (20210915 at 13:26) Colab Pro: GPU+RAM (roust-mtreharne_3guermanteswa)

corpus_sents_trans_df['t5imdb50k'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: get_t5imdb50k_sentiment(x[:510]))
corpus_sents_trans_df.head()

CPU times: user 7min 6s, sys: 3.15 s, total: 7min 9s
Wall time: 3min 34s


In [None]:
# Save Preprocessed Corpus Sentences DataFrame

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mtwain_huckleberryfinn.csv


## **(3-way: slow) MULTILINGUAL RoBERTa XLM Twitter 8 Languages**

* https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment

In [None]:
# !pip install sentencepiece

In [47]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")

Downloading:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [48]:
sentiment_analysis = pipeline("sentiment-analysis",model="cardiffnlp/twitter-xlm-roberta-base-sentiment")


In [49]:
# Test Directly

test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

[{'label': 'Positive', 'score': 0.9455917477607727}]





[{'label': 'Neutral', 'score': 0.4261215627193451}]





[{'label': 'Negative', 'score': 0.9544662237167358}]

In [50]:
# Test English

sentence_str = 'I love wonderful good things'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')


Positive Sentence Polarity: 1.9224474430084229
    Text: I love wonderful good things

Neutral Sentence Polarity: 0.5882502794265747
    Text: It is what it is

Negative Sentence Polarity: -1.9525753259658813
    Text: I hate your stinking guts you filthy lying stealing cheating bastard.


In [51]:
# Test French

test_result_pos = sentiment_analysis("Je déteste le livre.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Bien.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Mal.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("J'aime et j'apprécie les journées ensoleillées avec des enfants rieurs qui jouent joyeusement pendant l'été insouciant.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Il est.")
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis("Cette phrase est vide.")
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis("Je déteste et méprise le mal affreux qui infecte notre organisation.")
test_result_neg # [0]['label']

[{'label': 'Negative', 'score': 0.961158275604248}]





[{'label': 'Positive', 'score': 0.7114393711090088}]





[{'label': 'Negative', 'score': 0.7386776804924011}]





[{'label': 'Positive', 'score': 0.9386938810348511}]





[{'label': 'Neutral', 'score': 0.5012816190719604}]





[{'label': 'Negative', 'score': 0.9402057528495789}]





[{'label': 'Negative', 'score': 0.9703734517097473}]

In [52]:
%%time

# NOTE: started 9:05
#       28m50s  (20210915 at 13:26) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       ? m ? s (20210915 at 21:05) Colab Pro: GPU+RAM (jausten_prideandprejudice)
#       4m12s (20210915 at 13:26) Colab Pro: GPU+RAM (mshelley_frankenstein)
#       1m44s (20210915 at 13:26) Colab Pro: GPU+RAM (cdickens_achristmascarol)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (fbaum_thewonderfulwizardofoz)
#       ? m ? s (20210915 at 13:26) Colab Pro: GPU+RAM (mtwain_huckleberryfinn)
#       8m38s (20210915 at 13:26) Colab Pro: GPU+RAM (roust-mtreharne_3guermanteswa)

corpus_sents_trans_df['robertaxml8lang'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threeway_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()

CPU times: user 17min 13s, sys: 7.52 s, total: 17min 20s
Wall time: 8min 38s


In [106]:
# Save Preprocessed Corpus Sentences DataFrame

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

corpus_sents_trans_df.to_csv(corpus_sents_filename)

Saving to file: sum_sentiments_sents_trans_mproust_theguermantesway.csv


In [None]:
corpus_sents_trans_df.shape

(5775, 10)

In [107]:
corpus_sents_trans_df.head()

Unnamed: 0,sent_no,sent_raw,roberta15lg,yelp,nlptown,huggingface,hinglish
0,0,The early-morning twitter of the birds sounded...,-1.999154,1.35255,1.457167,-0.998478,0.937866
1,1,Every word from the maids quarters made her ju...,-1.997634,1.553909,4.66003,-0.936327,1.671487
2,2,All this was because we had moved house.,-1.994677,0.33538,4.386611,-0.82905,0.741141
3,3,It is true that the servants in our former hom...,1.998172,3.652816,3.422785,0.996813,0.8347
4,4,But now she even made silence the object of he...,-1.998231,0.7407,0.362371,-0.979988,-1.753888


In [108]:
corpus_sents_trans_df.columns

Index(['sent_no', 'sent_raw', 'roberta15lg', 'yelp', 'nlptown', 'huggingface',
       'hinglish'],
      dtype='object')

In [None]:
corpus_sents_filename

'sum_sentiments_sents_trans_mtwain_huckleberryfinn.csv'

## **END ENGLISH**

In [None]:
"""
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
""";

In [None]:
# import numpy as np
# from scipy.special import softmax

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """
    Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with
    'http' becomes 'http'. Tokens are re-joined with single spaces.
    """
    def _mask(token):
        # A bare '@' is left alone; only '@something' is a mention
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
"""

MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
""";

In [None]:
"""

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
""";

## **(0.0-1.0) FRENCH Multilingual NLPTown**

* https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [None]:
sentiment_analysis = pipeline("sentiment-analysis",model="nlptown/bert-base-multilingual-uncased-sentiment")

In [None]:
# Test Directly
test_result_pos = sentiment_analysis("Je déteste le livre.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Bien.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Mal.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("J'aime et j'apprécie les journées ensoleillées avec des enfants rieurs qui jouent joyeusement pendant l'été insouciant.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Il est.")
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis("Cette phrase est vide.")
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis("Je déteste et méprise le mal affreux qui infecte notre organisation.")
test_result_neg # [0]['label']

In [None]:
corpus_sents_trans_df.columns

In [None]:
win_s1per = int(1/100 * corpus_sents_trans_df.shape[0])

In [None]:
corpus_sents_trans_df['nlptown'][:20]

In [None]:
# Smooth each model's sentence-level scores with a centered rolling mean.
# The window is 10*win_s1per rows, where win_s1per is 1% of the row count.
corpus_sents_trans_df['nlptown_roll10'] = corpus_sents_trans_df['nlptown'].rolling(10*win_s1per, center=True).mean()

corpus_sents_trans_df['robertaxml8lang_roll10'] = corpus_sents_trans_df['robertaxml8lang'].rolling(10*win_s1per, center=True).mean()

# NOTE(review): the 'camembert' column is only created in the CamemBERT
# section further down, so on a fresh top-to-bottom run this line raises
# KeyError unless that section has been executed first.
corpus_sents_trans_df['camembert_roll10'] = corpus_sents_trans_df['camembert'].rolling(10*win_s1per, center=True).mean()

In [None]:
corpus_sents_trans_df['nlptown_roll10'].plot()
plt.title(f'{CORPUS_FULL}\nMultilingual NLPTown SMA 10%')

In [None]:
# Plot the smoothed multilingual XLM-RoBERTa scores.
# The title spells the model family "XLM"; the column key keeps its
# existing 'robertaxml8lang' spelling so other cells still find it.
corpus_sents_trans_df['robertaxml8lang_roll10'].plot()
plt.title(f'{CORPUS_FULL}\nMultilingual RoBERTa XLM 8 Languages SMA 10%')

In [None]:
corpus_sents_trans_df['camembert_roll10'].plot()
plt.title(f'{CORPUS_FULL}\nFrench CamemBERT Transformer SMA 10%')

## **(0.0-1.0) FRENCH CamemBERT (tuned on movie reviews)**

* https://huggingface.co/mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis


Tf/Keras - Not Working (_C not installed)
* https://github.com/TheophileBlard/french-sentiment-analysis-with-bert

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis")

model = AutoModelForSequenceClassification.from_pretrained("mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis")

In [None]:
sentiment_analysis = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

print(sentiment_analysis("Je suis plutôt confiant."))

In [None]:
# Test Directly
test_result_pos = sentiment_analysis("Je déteste le livre.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Bien.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Mal.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("J'aime et j'apprécie les journées ensoleillées avec des enfants rieurs qui jouent joyeusement pendant l'été insouciant.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Il est.")
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis("Cette phrase est vide.")
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis("Je déteste et méprise le mal affreux qui infecte notre organisation.")
test_result_neg # [0]['label']

In [None]:
# Test 

test_str = "Je déteste et méprise le mal affreux qui infecte notre organisation."
# test_str = "J'aime et j'apprécie les journées ensoleillées avec des enfants rieurs qui jouent joyeusement pendant l'été insouciant."

twoway_probability2sentiment(test_str, sentiment_analysis, pol_labels=['LABEL_0','LABEL_1'])

In [None]:
# NOTE: ~5-7 minutes runtime

corpus_sents_trans_df['camembert'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], sentiment_analysis, pol_labels=['LABEL_0','LABEL_1']))
corpus_sents_trans_df.head()


## **(0.0-1.0) (POOR) FRENCH FlauBERT**

* https://huggingface.co/models?search=sentiment
* https://huggingface.co/DemangeJeremy/4-sentiments-with-flaubert?text=Je+t%27appr%C3%A9cie+beaucoup.+Je+t%27aime. 

Returns [0.0 to 1.0]:
* MIXED
* NEGATIVE
* OBJECTIVE
* POSITIVE

In [None]:
# !pip install Cython

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

loaded_tokenizer = AutoTokenizer.from_pretrained('flaubert/flaubert_large_cased')
loaded_model = AutoModelForSequenceClassification.from_pretrained("DemangeJeremy/4-sentiments-with-flaubert")

sentiment_analysis = pipeline('sentiment-analysis', model=loaded_model, tokenizer=loaded_tokenizer)

print(sentiment_analysis("Je suis plutôt confiant."))

In [None]:
sentiment_analysis = pipeline('sentiment-analysis', model=loaded_model, tokenizer=loaded_tokenizer)

print(sentiment_analysis("Je suis plutôt confiant."))

In [None]:
# Test Directly
test_result_pos = sentiment_analysis("Je déteste le livre.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Bien.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Mal.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("J'aime et j'apprécie les journées ensoleillées avec des enfants rieurs qui jouent joyeusement pendant l'été insouciant.")
test_result_pos # [0]['label']
print('\n')
test_result_pos = sentiment_analysis("Il est.")
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis("Cette phrase est vide.")
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis("Je déteste et méprise le mal affreux qui infecte notre organisation.")
test_result_neg # [0]['label']

In [None]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threeway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')

In [None]:
# NOTE: started 9:05

# Score every sentence (first 510 characters) with the FlauBERT pipeline
# built in this section. The results go in their own 'flaubert' column:
# writing them to 'robertaxml8lang' would overwrite the XLM-RoBERTa scores
# computed in the multilingual section.
# NOTE(review): threeway_probability2sentiment is defined outside this
# section - confirm it handles the four labels this model returns
# (MIXED, NEGATIVE, OBJECTIVE, POSITIVE).
corpus_sents_trans_df['flaubert'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threeway_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()

# **Save Results to File**

In [None]:
corpus_sents_trans_df.head(2)

In [None]:
corpus_sents_trans_df.shape

In [None]:
title_str

In [None]:
# Save Preprocessed Corpus Sentences DataFrame

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

corpus_sents_trans_df.to_csv(corpus_sents_filename)

# Paragraphs
# corpus_parags_filename = f'corpus_parags_clean_{author_str}_{title_str}_{datetime_now}.csv'
# print(f'Saving to file: {corpus_parags_filename}')

# corpus_parags_df.to_csv(corpus_parags_filename)

In [None]:
# Save Preprocessed Corpus Sentences DataFrame

# author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'sum_sentiments_sents_trans_{author_abbr_str}_{title_str}.csv'
print(f'Saving to file: {corpus_sents_filename}')

corpus_sents_trans_df.to_csv(corpus_sents_filename)

In [None]:
# Verify

# !head -n 5 sum_sentiments_sents_trans_hsbutler_theodyssey.csv

# **END OF WORKING TRANSFORMERS**

## **(5-way: slow) T5 Small IMDB**

* https://huggingface.co/mrm8488/t5-small-finetuned-imdb-sentiment



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-imdb-sentiment")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-small-finetuned-imdb-sentiment")

In [None]:
def t5smimdb_sentiment(text):
  '''
  Given a plain text string
  Return the label string generated for it by the notebook-level
  `tokenizer` / `model` pair.
  '''
  # The end-of-sequence marker is appended to the text by hand
  encoded_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  # Generation is capped at two tokens
  generated = model.generate(input_ids=encoded_ids, max_length=2)

  decoded_ls = [tokenizer.decode(seq_ids) for seq_ids in generated]
  return decoded_ls[0]

# Smoke test: call the function defined in this cell (the name
# `get_sentiment` is not defined anywhere in this notebook section)
t5smimdb_sentiment("I dislike a lot that film")

In [None]:
# Test Directly
test_result_pos = t5smimdb_sentiment('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = t5smimdb_sentiment('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = t5smimdb_sentiment('I hate bad evil dislike')
test_result_neg # [0]['label']

In [None]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = twoway_probability2sentiment(sentence_str, classifier)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'is'
polarity = twoway_probability2sentiment(sentence_str, classifier)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = twoway_probability2sentiment(sentence_str, classifier)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}');

In [None]:
# NOTE: started 8:42

corpus_sents_trans_df['huggingface'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: twoway_probability2sentiment(x[:510], classifier))
corpus_sents_trans_df.head()

## **(5-way: slow) T5Base Finetuned Span Sentiment Extraction**

* https://huggingface.co/mrm8488/t5-base-finetuned-span-sentiment-extraction



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-span-sentiment-extraction")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-span-sentiment-extraction")

## **SpaCy BERT**

* https://explosion.ai/blog/spacy-transformers

In [None]:
!pip install spacy-transformers
!python -m spacy download en_core_web_trf

In [None]:
import spacy
import torch
import numpy
from numpy.testing import assert_almost_equal

is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_core_web_trf")
doc = nlp("Here is some text to encode.")

## **BERT SST**

* https://huggingface.co/barissayil/bert-sentiment-analysis-sst
* NOTE: Should be fine-tuned, not the same as Huggingface default

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("barissayil/bert-sentiment-analysis-sst")

model = AutoModelForSequenceClassification.from_pretrained("barissayil/bert-sentiment-analysis-sst")

In [None]:
sentiment_analysis = pipeline("sentiment-analysis",model="barissayil/bert-sentiment-analysis-sst")


In [None]:
# Test Directly
test_result_pos = sentiment_analysis('I love wonderful good things')
test_result_pos # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate your guts you filthy bastard')
test_result_neg # [0]['label']

In [None]:
# Test Indirectly

fivestar_probability2sentiment('I love wonderful good things', nlptown_sentiment_analysis)
print('\n')
fivestar_probability2sentiment('I hate your stinking guts you filthy lying stealing cheating bastard.', nlptown_sentiment_analysis)

In [None]:
# NOTE: started 4:13

corpus_sents_trans_df['nlptown'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: fivestar_probability2sentiment(x[:510], sentiment_analysis))
corpus_sents_trans_df.head()


In [None]:
corpus_sents_trans_df.head()

## **(3-way) BERTweet Base Bilingual Pysentimiento**

* https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis

* https://github.com/pysentimiento/pysentimiento

In [None]:
!pip install pysentimiento

In [None]:
!pwd

In [None]:
import torch

In [None]:
from pysentimiento import SentimentAnalyzer
# from pysentimiento import EmotionAnalyzer

# Bilingual es/en for both sentiment/emotions
analyzer = SentimentAnalyzer(lang="en")
# emotion_analyzer = EmotionAnalyzer(lang="en")

In [None]:
def get_pysentimiento_sentiment(text, sentiment_analyzer=None):
  '''
  Given a plain text string
  Return a float sentiment score built from the predicted label and the
  probability the analyzer assigns to that label:

      NEG   ->  -probability - 0.5
      NEU   ->   probability
      other ->   probability + 0.5   (treated as POS)

  Args:
    text: plain text string; only its first 125 whitespace-separated
      words are passed to the analyzer.
    sentiment_analyzer: optional object exposing predict(str); when None
      the notebook-level `analyzer` is used.

  NOTE(review): an earlier description of this function placed neutral
  scores in the band -0.5 to 0.5. With a base of 0 and a positive sign the
  neutral score equals the NEU probability itself - confirm which mapping
  is intended.
  '''

  predictor = analyzer if sentiment_analyzer is None else sentiment_analyzer

  # Cap the input at 125 words before prediction
  text_125_str = ' '.join(text.split()[:125])
  pol_object = predictor.predict(text_125_str)
  pol_str = pol_object.output

  if pol_str == 'NEG':
    sign_fl = -1.0
    score_base = -0.5
  elif pol_str == 'NEU':
    sign_fl = 1.0
    score_base = 0
  else:
    # Polarity is 'POS' by default
    sign_fl = 1.0
    score_base = 0.5

  score_fl = (sign_fl * pol_object.probas[pol_str]) + score_base

  return score_fl


# Smoke test: call the function defined in this cell (the name
# `get_sentiment` is not defined anywhere in this notebook section)
get_pysentimiento_sentiment("I dislike a lot that film")

In [None]:
# Test Directly
test_result_pos = analyzer.predict('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = analyzer.predict('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = analyzer.predict('I hate bad evil dislike')
test_result_neg # [0]['label']
print('\n')

test_result_neg.output
print('\n')
type(test_result_neg.probas)
test_result_neg.probas['NEG']

In [None]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'This'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}');

In [None]:
# Test Indirectly

sentence_str = 'I love wonderful good things'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'It is what it is'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threeway_probability2sentiment(sentence_str, analyzer.predict)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')

In [None]:
# NOTE: started 5:42

# Each sentence is cut to its first 124 whitespace-separated words and
# re-joined into a single string before scoring.
# NOTE(review): threeway_probability2sentiment is defined outside this
# section - confirm it accepts `analyzer.predict`, whose result exposes
# `.output` / `.probas` rather than a pipeline-style list of dicts.
corpus_sents_trans_df['pysentimiento'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threeway_probability2sentiment(' '.join(x.split()[:124]), analyzer.predict))
corpus_sents_trans_df.head()

In [None]:
corpus_sents_trans_df.head()

## **(3-way) FinEstEn 3Multilingual**

* https://huggingface.co/EMBEDDIA/finest-bert

* https://arxiv.org/pdf/2006.07890.pdf


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
  
tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/finest-bert")

model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/finest-bert")

## **(5-way) (No-Tigrinya) RoBERTa Base Sentiment**

* https://huggingface.co/fgaim/roberta-base-ti-sentiment (no metrics)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("fgaim/roberta-base-ti-sentiment")

model = AutoModelForSequenceClassification.from_pretrained("fgaim/roberta-base-ti-sentiment")

In [None]:
sentiment_analysis = pipeline("sentiment-analysis",model="fgaim/roberta-base-ti-sentiment")


In [None]:
# Test Directly
test_result_pos = sentiment_analysis('I absolutely love this great and wonderful opportunity to enjoy lovely and good things')
test_result_pos # [0]['label']
print('\n')
test_result_neu = sentiment_analysis('blank')
test_result_neu # [0]['label']
print('\n')
test_result_neg = sentiment_analysis('I hate bad evil dislike')
test_result_neg # [0]['label']

In [None]:
# Test Indirectly

"""
# Testing
sentence_str = 'I love wonderful good things'
polarity, base_adj = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity} and Adjustment: {base_adj}\n    Text: {sentence_str}')

sentence_str = 'This'
polarity, base_adj = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity} and Adjustment: {base_adj}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity, base_adj = fiveway_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity} and Adjustment: {base_adj}\n    Text: {sentence_str}')
""";

# Run one positive, one neutral and one negative example through the helper.
# Only `polarity` is assigned here, so it is the only value printed; the
# name `base_adj` appears solely in the disabled fiveway test above and is
# never assigned in this cell.
sentence_str = 'I love wonderful good things'
polarity = threepolarity_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nPositive Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'This'
polarity = threepolarity_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNeutral Sentence Polarity: {polarity}\n    Text: {sentence_str}')

sentence_str = 'I hate your stinking guts you filthy lying stealing cheating bastard.'
polarity = threepolarity_probability2sentiment(sentence_str, sentiment_analysis)
print(f'\nNegative Sentence Polarity: {polarity}\n    Text: {sentence_str}')

In [None]:
# NOTE: started 5:10

corpus_sents_trans_df['hinglish'] = corpus_sents_trans_df['sent_raw'].apply(lambda x: threepolarity_probability2sentiment(x, sentiment_analysis))
corpus_sents_trans_df.head()

In [None]:
%whos DataFrame

In [None]:
corpus_sents_trans_df.shape

# **Utility Functions (Auto)**

## **Files**

In [None]:
# Generate full path and timestamp for new filepath/filename

def gen_pathfiletime(file_str, subdir_str=''):
  '''
  Given a filename (e.g. 'hist_paraglen.png') and an optional subdir prefix
  Return '{subdir}{base}_{author}_{title}_{timestamp}.{ext}', where author
  and title come from CORPUS_AUTHOR / CORPUS_TITLE reduced to lowercase
  ASCII letters and digits, and timestamp is the current local time as
  YYYYmmddHHMMSS.

  The extension is whatever follows the LAST dot in file_str, so names with
  several dots keep their full stem; a name with no dot gets no extension.
  '''

  # Generate compressed author and title substrings (no whitespace, lowercase)
  author_raw_str = ''.join(CORPUS_AUTHOR.split()).lower()
  title_raw_str = ''.join(CORPUS_TITLE.split()).lower()

  # Keep only ASCII letters and digits
  author_str = re.sub('[^A-Za-z0-9]+', '', author_raw_str)
  title_str = re.sub('[^A-Za-z0-9]+', '', title_raw_str)

  # Generate current/unique datetime string
  datetime_str = datetime.now().strftime('%Y%m%d%H%M%S')

  # Split on the last dot only; unpacking split('.') raised ValueError for
  # names with zero or more than one dot
  file_base, dot_str, file_ext = file_str.rpartition('.')
  if dot_str:
    ext_str = f'.{file_ext}'
  else:
    # No dot at all: the whole name is the base and there is no extension
    file_base = file_str
    ext_str = ''

  full_filepath_str = f'{subdir_str}{file_base}_{author_str}_{title_str}_{datetime_str}{ext_str}'

  return full_filepath_str

# Test
# pathfilename_str = gen_pathfiletime('hist_paraglen.png')
# print(pathfilename_str)

In [None]:
# Tokenize into Sentences

def parag2sents(corpus_parags_ls):
  '''
  Given a list of paragraph strings,
  Return a list of rows [sent_no, parag_no, sentence_text].

  Each paragraph is split with sent_tokenize. A sentence is kept only if
  its stripped text is longer than MIN_SENT_LEN and is still longer than
  MIN_SENT_LEN after punctuation is removed. sent_no counts kept sentences
  across the whole corpus, starting at 0.
  '''

  rows_ls = []
  for parag_idx, parag_text in enumerate(corpus_parags_ls):
    for raw_sent in sent_tokenize(parag_text):
      clean_sent = raw_sent.strip()

      # Drop sentences that are (nearly) whitespace only
      if len(clean_sent) <= MIN_SENT_LEN:
        continue

      # Drop sentences that are (nearly) punctuation only
      if len(re.sub(r'[^\w\s]','',clean_sent).strip()) <= MIN_SENT_LEN:
        continue

      # TODO: may want to also drop number-only (int or float) sentences

      # The running sentence number equals the count of rows kept so far
      rows_ls.append([len(rows_ls), parag_idx, clean_sent])

  return rows_ls

# Test

'''
print(f'Length {len(corpus_parags_raw_ls)}')
corpus_sents_row_ls = parag2sents(corpus_parags_raw_ls)

print(f'First row {corpus_sents_row_ls[0]}')
print('\n')
print(f'Last row {corpus_sents_row_ls[-1]}')
''';

In [None]:
#This function converts to lower-case, removes square bracket, removes numbers and punctuation
 
def text_clean(text):
    '''
    Normalize a text string for lexicon lookup.

    Steps, in order: lower-case; replace every '[...]' span with a space;
    delete all characters in string.punctuation; delete every token that
    contains a digit; replace newlines with spaces.

    Whitespace is not collapsed, so removed spans can leave runs of spaces.
    '''
    text = text.lower()
    # Raw strings: '\[' and '\w' are invalid escapes in a normal string literal
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[\n]', ' ', text)  # Replace newline with space
    return text

In [None]:
# Read corpus into a single string then split into paragraphs

'''
if len(corpus_filename) == 0:
  # If now file uploaded, use the file in Google gDrive
  corpus_filename = CORPUS_FILENAME
else:
  # The uploaded file has priority over the gDrive Corpus file
  pass
'''

def read_corpus_parags(corpus_filename):
  '''
  Load a corpus text file and split it into lightly filtered paragraphs.

  Given a corpus_filename (opened relative to the current directory, decoded
  with CORPUS_ENCODING),
  Return the list of stripped paragraphs that survive two filters:
    1. stripped length must exceed MIN_PARAG_LEN
    2. punctuation-free length must exceed MIN_PARAG_LEN
  Paragraphs are delimited by exactly one blank line ('\\n\\n').
  The paragraph count is printed after each stage.
  '''

  with open(corpus_filename, "r", encoding=CORPUS_ENCODING) as corpus_fp:
    parags_split_ls = corpus_fp.read().split('\n\n')
  print(f'Corpus Paragraph Raw Count: {len(parags_split_ls)}')

  # Stage 1: trim whitespace, drop paragraphs that are too short
  parags_trimmed_ls = []
  for parag_str in parags_split_ls:
    parag_str = parag_str.strip()
    if len(parag_str) > MIN_PARAG_LEN:
      parags_trimmed_ls.append(parag_str)
  print(f'Corpus Paragraph -(whitespace only) Count: {len(parags_trimmed_ls)}')

  # Stage 2: drop paragraphs made only of punctuation (e.g. '"', '.', '...')
  parags_kept_ls = []
  for parag_str in parags_trimmed_ls:
    no_punct_str = re.sub(r'[^\w\s]', '', parag_str).strip()
    if len(no_punct_str) > MIN_PARAG_LEN:
      parags_kept_ls.append(parag_str)
  print(f'Corpus Paragraph -(punctuation only) Count: {len(parags_kept_ls)}')

  return parags_kept_ls

# Test  
'''
corpus_parags_raw_ls = read_corpus_parags(CORPUS_FILENAME)
print(f'We found #{len(corpus_parags_raw_ls)} lines\n')

print('\nThe first 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[:10]

print('\nThe last 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[-10:]
print('\n')
print(sorted(corpus_parags_raw_ls, key=lambda x: (len(x), x)))
''';

In [None]:
# Verify saved under newest filename

def get_recentfile(file_type='csv'):
  '''
  Given a file extension type (without the dot),
  Return the path of the most recently MODIFIED file of that type
  in the current directory (compared with os.path.getmtime),
  or None when no file with that extension exists.
  '''
  file_pattern = "./*." + file_type
  print(f'file_pattern: {file_pattern}')
  list_of_files = glob.glob(file_pattern) # * means all if need specific format then *.csv

  # default=None avoids "ValueError: max() arg is an empty sequence"
  # when nothing matches the pattern
  latest_file = max(list_of_files, key=os.path.getmtime, default=None)

  return latest_file

# Test

# get_recentfile('txt')

In [None]:
!pip install chardet

In [None]:
# Demo: ask chardet to guess the encoding of a short byte string, then decode with that guess
import chardet
name = b"\x4a\x6f\x73\xe9"  # same bytes are decoded as iso-8859-1 in a later cell
detection = chardet.detect(name)  # result is indexed by "encoding" below
print(detection)
encoding = detection["encoding"]
# NOTE(review): decode() would fail if the detector reported no encoding (None) — confirm for other inputs
print(name.decode(encoding))


In [None]:
!pip install cchardet

In [None]:
# https://dev.to/bowmanjd/character-encodings-and-detection-with-python-chardet-and-cchardet-4hj7

import cchardet as chardet

from pathlib import Path
import sys

def get_file_encoding(filename):
    """Detect a file's text encoding and return (text, encoding, confidence).

    The file is read as bytes and passed to ``chardet.detect`` (bound to
    cchardet by the import in this cell). When the detector reports no
    encoding (for example on an empty file) the function falls back to
    'utf-8', decodes with errors='replace' and reports confidence 0.0,
    instead of crashing in ``blob.decode(None)``.

    Args:
        filename: path of the file to inspect.

    Returns:
        text: the decoded file contents.
        encoding: the detected encoding name, or 'utf-8' on fallback.
        confidence: detector confidence as a float (0.0 when unavailable).

    Raises:
        OSError: if the file cannot be read.
        UnicodeDecodeError / LookupError: if the detected encoding cannot
            decode the bytes or is unknown to Python.
    """
    filepath = Path(filename)

    # We must read as binary (bytes) because we don't yet know encoding
    blob = filepath.read_bytes()

    detection = chardet.detect(blob)
    encoding = detection.get("encoding")
    confidence = detection.get("confidence")

    if not encoding:
        # Detector gave up: use a lossy utf-8 decode and flag zero confidence
        # so that callers' "confidence > threshold" checks still work
        encoding = 'utf-8'
        confidence = 0.0
        text = blob.decode(encoding, errors='replace')
    else:
        if confidence is None:
            confidence = 0.0
        text = blob.decode(encoding)

    return text, encoding, confidence


In [None]:
!ls -altr *.txt

In [None]:
CORPUS_FILENAME

In [None]:
# Try to discover Corpus text Encoding scheme (default to 'utf-8', but often 'iso-8859-1', 'windows-1252', 'cp1252', or 'ascii')
# Default value; it is overwritten on the next line by whatever get_file_encoding() reports
CORPUS_ENCODING = 'utf-8'
corpus_str, CORPUS_ENCODING, encoding_confidence = get_file_encoding(CORPUS_FILENAME)

# print(text)
# print(f"Encoding was detected as {CORPUS_ENCODING}.")
# print(f'             Confidence: {encoding_confidence}')

# Accept the detected encoding only above 60% confidence; otherwise just warn
# (CORPUS_ENCODING keeps the low-confidence guess either way)
if encoding_confidence > 0.6:
  print(f"{encoding_confidence*100:.2f}% confidence Encoding = '{CORPUS_ENCODING}' for '{CORPUS_FILENAME}'")
else:
  print(f"ERROR: Less than 60% confidence estimating Encoding scheme for '{CORPUS_FILENAME}'")
  print(f"       Only {encoding_confidence*100:.2f}% confidence Encoding = '{CORPUS_ENCODING}'")
  print(f"       Manually verify corpus file '{CORPUS_FILENAME}' encoding, set as GLOBAL_CONSTATANT and rerun")

In [None]:
CORPUS_ENCODING

In [None]:
print("\x73\x70\x61\x6d")

In [None]:
b"\x73\x70\x61\x6d".decode("ascii")


In [None]:
b"\x4a\x6f\x73\xe9".decode("iso-8859-1")

**Start of New Files Section**

In [None]:
def corpus2chaps(corpus_filename):
  '''
  Given a corpus_filename (opened relative to the current directory, decoded
  with CORPUS_ENCODING)
  Return (corpus_chaps_ls, corpus_raw_str):
    corpus_chaps_ls: the stripped text found between 'CHAPTER <n>...' heading
      lines (matched case-insensitively), keeping only pieces longer than
      MIN_PARAG_LEN
    corpus_raw_str: the file text after 'SECTION <n>...' lines were removed
  '''

  with open(corpus_filename, "r", encoding=CORPUS_ENCODING) as infp:
    corpus_raw_str = infp.read()

  # Remove every 'SECTION <1-2 digits>...' heading (to end of line) from the raw text
  pattern = r'SECTION [\d]{1,2}[^\n]*'
  corpus_raw_str = re.sub(pattern, '', corpus_raw_str)

  # Splitting on a capturing group yields [text, heading, text, heading, ..., text]
  corpus_split_ls = re.split(r'(CHAPTER [\d]{1,2}[^\n]*)', corpus_raw_str, flags=re.I)

  # Keep only the even positions (chapter bodies). The captured headings sit
  # at the odd positions; dropping them by position also removes headings such
  # as 'Chapter 1', which the case-sensitive startswith() test below misses.
  corpus_chaps_ls = [x.strip() for x in corpus_split_ls[0::2]]

  # Filter out empty lines
  corpus_chaps_ls = [x for x in corpus_chaps_ls if len(x) > MIN_PARAG_LEN]

  # Filter out any remaining pieces that start with 'CHAPTER ' (as before)
  corpus_chaps_ls = [x for x in corpus_chaps_ls if not x.startswith('CHAPTER ')]

  return corpus_chaps_ls, corpus_raw_str

In [None]:
def corpus2sects(corpus_filename):
  '''
  Given a corpus_filename (opened relative to the current directory)
  Return (corpus_sects_ls, corpus_raw_str):
    corpus_sects_ls: the stripped text found between section delimiters
      ('CHAPTER <n>...' lines, 'SECTION <n>...' lines, or '-----'), keeping
      only pieces longer than MIN_PARAG_LEN
    corpus_raw_str: the unmodified file text

  NOTE: the file is always decoded as cp1252 with undecodable bytes dropped
  (errors='ignore'); CORPUS_ENCODING is not used here.
  '''

  with open(corpus_filename, "r", encoding='cp1252', errors='ignore') as infp: # encoding='utf-8', errors='ignore') as infp: # encoding=CORPUS_ENCODING) as infp:
    corpus_raw_str = infp.read()

  # Splitting on a capturing group yields [text, delimiter, text, delimiter, ..., text]
  corpus_split_ls = re.split(r'(CHAPTER [\d]{1,2}[^\n]*|SECTION [\d]{1,2}[^\n]*|-----)', corpus_raw_str)

  # Keep only the even positions (section bodies). The captured delimiters sit
  # at the odd positions; dropping them by position also removes the bare
  # '-----' separator, which the startswith('----- ') test below never matches.
  corpus_sects_ls = [x.strip() for x in corpus_split_ls[0::2]]

  # Filter out empty lines
  corpus_sects_ls = [x for x in corpus_sects_ls if len(x) > MIN_PARAG_LEN]

  # Filter out remaining pieces that still look like separator lines (as before)
  corpus_sects_ls = [x for x in corpus_sects_ls if not x.startswith('----- ')]
  corpus_sects_ls = [x for x in corpus_sects_ls if not x.startswith('SECTION ')]
  corpus_sects_ls = [x for x in corpus_sects_ls if not x.startswith('CHAPTER ')]

  return corpus_sects_ls, corpus_raw_str

In [None]:
CORPUS_FILENAME

In [None]:
!ls -altr *hand_clean.txt

In [None]:
CORPUS_ENCODING.lower()

In [None]:
# Split the corpus into sections and report the size of both results
corpus_sects_ls, corpus_str_raw = corpus2sects(CORPUS_FILENAME)
# Character count of the raw corpus (this line previously printed the section count by mistake)
print(f'Length corpus_str_raw: {len(corpus_str_raw)}')
print(f'Length corpus_sects_ls: {len(corpus_sects_ls)}')

In [None]:
def corpus2parags(corpus_filename):
  '''
  Given a corpus_filename (opened relative to the current directory, decoded
  with CORPUS_ENCODING)
  Return (corpus_parags_ls, corpus_raw_str):
    corpus_parags_ls: stripped paragraphs (split on runs of 2+ newlines) that
      are longer than MIN_PARAG_LEN both before and after punctuation is
      removed, and that do not start with '----- ', 'SECTION ' or 'CHAPTER '
    corpus_raw_str: the unmodified file text
  Prints the raw paragraph count and the number of punctuation-only
  paragraphs that were dropped.
  '''

  with open(corpus_filename, "r", encoding=CORPUS_ENCODING) as infp:
    corpus_raw_str = infp.read()

  corpus_parags_ls = re.split(r'[\n]{2,}', corpus_raw_str)
  print(f'Corpus Paragraph Raw Count: {len(corpus_parags_ls)}')

  # Strip off whitespace
  corpus_parags_ls = [x.strip() for x in corpus_parags_ls]

  # Filter out empty/too-short paragraphs. Strict '>' matches the other
  # filters in this notebook; the final list is unchanged because the
  # punctuation filter below already enforces '>'.
  corpus_parags_ls = [x for x in corpus_parags_ls if len(x) > MIN_PARAG_LEN]

  # Filter out lines containing only punctuation (e.g. '"', '.', '...', etc)
  parag_before_punctstrip_ct = len(corpus_parags_ls)
  corpus_parags_ls = [x for x in corpus_parags_ls if len((re.sub(r'[^\w\s]','',x)).strip()) > MIN_PARAG_LEN]
  # before - after = number dropped (the old after - before printed a negative count)
  print(f'Punctuation only Paragraph Count: {parag_before_punctstrip_ct - len(corpus_parags_ls)}')

  # Filter out the Section separator '-----' lines
  corpus_parags_ls = [x for x in corpus_parags_ls if not x.startswith('----- ')]

  # Filter out the Section separator 'SECTION ' lines
  corpus_parags_ls = [x for x in corpus_parags_ls if not x.startswith('SECTION ')]

  # Filter out the Chapter separator 'CHAPTER ' lines
  corpus_parags_ls = [x for x in corpus_parags_ls if not x.startswith('CHAPTER ')]

  return corpus_parags_ls, corpus_raw_str


In [None]:
def parag2sents(corpus_parags_ls):
  '''
  Given a list of paragraphs,
  Return a list of lists of Sentences [sent_no, parag_no, asent(text)]

  sent_no is a running counter over all kept sentences; parag_no is the
  paragraph's position in corpus_parags_ls. A sentence is dropped when it is
  (a) not longer than MIN_SENT_LEN after stripping,
  (b) not longer than MIN_SENT_LEN once punctuation is removed, or
  (c) nothing but a number (e.g. '42', '42.', '3.14', '1,000').

  NOTE: this definition replaces the earlier parag2sents cell in this notebook.
  '''

  sent_no = 0
  corpus_sents_ls = []
  for parag_no,aparag in enumerate(corpus_parags_ls):
    sents_ls = sent_tokenize(aparag)
    # Delete (whitespace only) sentences
    sents_ls = [x.strip() for x in sents_ls if len(x.strip()) > MIN_SENT_LEN]
    # Delete (punctuation only) sentences
    sents_ls = [x for x in sents_ls if len((re.sub(r'[^\w\s]','',x)).strip()) > MIN_SENT_LEN]
    # Delete numbers (int or float) sentences.
    # Punctuation and whitespace are removed before the isnumeric() test;
    # a bare x.isnumeric() is False for '42.', '3.14' or '1,000'.
    # TODO: may want to keep
    sents_ls = [x for x in sents_ls if not ''.join(re.sub(r'[^\w\s]','',x).split()).isnumeric()]
    for asent in sents_ls:
      corpus_sents_ls.append([sent_no, parag_no, asent])
      sent_no += 1

  return corpus_sents_ls


**End of Files Section**

In [None]:
def _add_standardized_cols(unit_df, model_base):
  '''
  Add the four scaled variants of unit_df[model_base] to unit_df in place:
    {model_base}_meanstd, {model_base}_medianiqr          (raw sentiment)
    {model_base}_lnorm_meanstd, {model_base}_lnorm_medianiqr
                                         (sentiment divided by 'token_len')
  Uses the notebook-level mean_std_scaler and median_iqr_scaler objects
  through their fit_transform() method.
  '''

  def _as_column(values):
    # The scalers are fed a single-column 2-D array, as in the original code
    return np.array(values).reshape(-1, 1)

  # Length-normalize: sentiment / token_len, row by row
  # (a token_len of 0 raises ZeroDivisionError, as before)
  sentiment_norm_ls = [asentiment/alen for asentiment, alen
                       in zip(list(unit_df[model_base]), list(unit_df['token_len']))]

  unit_df[f'{model_base}_meanstd'] = mean_std_scaler.fit_transform(_as_column(unit_df[model_base]))
  unit_df[f'{model_base}_medianiqr'] = median_iqr_scaler.fit_transform(_as_column(unit_df[model_base]))
  unit_df[f'{model_base}_lnorm_meanstd'] = mean_std_scaler.fit_transform(_as_column(pd.Series(sentiment_norm_ls)))
  unit_df[f'{model_base}_lnorm_medianiqr'] = median_iqr_scaler.fit_transform(_as_column(pd.Series(sentiment_norm_ls)))


def get_sentiments(model_base, sentiment_fn, sentiment_type='lexicon'):
  '''
  Given a model_base name and sentiment evaluation function
  Calculate all the Sentence, Paragraph, Section and Chapter Sentiment Scores and Standardized variants

  Works in place on the notebook-level DataFrames corpus_sents_df,
  corpus_parags_df, corpus_sects_df and corpus_chaps_df.

  Args:
    model_base: name of the column that receives the raw sentiment; also the
      prefix of the four derived columns (see _add_standardized_cols).
    sentiment_fn: callable applied to str(text) of each row's clean-text column.
    sentiment_type:
      'lexicon' or 'function' - sentiment_fn returns the sentiment value itself
      'compound' - sentiment_fn returns a dict; it is stored in a 'scores'
                   column and its 'compound' entry becomes the sentiment
                   (marked "VADER" in the original code)

  Returns:
    None. For an unknown sentiment_type an error is printed and nothing is changed.

  Compared with the earlier copy-pasted version, Paragraphs and Sentences now
  also get the '{model_base}_meanstd' column that only Chapters and Sections had.
  '''

  if sentiment_type not in ('lexicon', 'compound', 'function'):
    print(f'ERROR: sentiment_type={sentiment_type} but must be one of (lexicon, compound, function)')
    return

  # (progress label, DataFrame, clean-text column), in the original processing order
  text_units_ls = [('Sentences', corpus_sents_df, 'sent_clean'),
                   ('Paragraphs', corpus_parags_df, 'parag_clean'),
                   ('Sections', corpus_sects_df, 'sect_clean'),
                   ('Chapters', corpus_chaps_df, 'chap_clean')]

  # Calculate Sentiment Polarities
  for unit_label, unit_df, text_col in text_units_ls:
    print(f'Processing Lexicon/{unit_label}...')
    result_ser = unit_df[text_col].apply(lambda text: sentiment_fn(str(text)))
    if sentiment_type == 'compound':
      # Keep the full score dict and extract the Compound Sentiment
      unit_df['scores'] = result_ser
      unit_df[model_base] = unit_df['scores'].apply(lambda score_dict: score_dict['compound'])
    else:
      unit_df[model_base] = result_ser

  # Standardized and length-normalized variants for every text unit
  for unit_df in (corpus_chaps_df, corpus_sects_df, corpus_parags_df, corpus_sents_df):
    _add_standardized_cols(unit_df, model_base)

  return

In [None]:
# Read in lexicon at given path into Dict[word]=polarity

def get_lexicon(lexicon_name, lexicon_format=2):
    """
    Read a sentiment lexicon CSV file into a DataFrame.

    Args:
        lexicon_name (str): path of the CSV file, passed to pd.read_csv.
        lexicon_format (int, optional): currently unused. Defaults to 2.

    Returns:
        The DataFrame read from the file (its .info() summary is printed),
        or the integer -1 if the file could not be read.

    NOTE: a later cell in this notebook defines get_lexicon again.
    """

    try:
      lexicon_df = pd.read_csv(lexicon_name)
      lexicon_df.info()
      return lexicon_df
    except Exception as err:
      # 'Exception' rather than a bare except: Ctrl-C / SystemExit still
      # propagate, and the underlying cause is reported
      print(f'ERROR: Cannot read lexicon.csv at {lexicon_name} ({err})')
      return -1

'''
    print
    if (sa_lexicon == 'default'):
        lexicon_df = pd.read_csv(LEXICON_PATH)
        lexicon_df.columns = ['index_no', 'word', 'polarity']
        lexicon_df.drop(['index_no'], axis=1, inplace=True)
        lexicon_df.dropna(inplace=True)
        lexicon_dt = lexicon_df.set_index('word').T.to_dict('list')
        # unlist the polarity to type: float
        for key in lexicon_dt:
            lexicon_dt[key] = float(lexicon_dt[key][0])
        
    ### print(f"Exit get_sa_lex() with {len(lexicon_dt.keys())} entries in syuzhet_dt")
    return lexicon_dt
''';

In [None]:
# Sentence to Sentiment Polarity according to passed in Lexicon Dictionary

def text2sentiment(text_str, lexicon_dt):
  '''
  Given a text_str and lexicon_dt, calculate
  the sentiment polarity.

  The text is stripped of every character that is neither a word character
  nor whitespace, lower-cased and split on whitespace. Each word is looked up
  with lexicon_dt[word] and converted with float(); the values are summed.

  Args:
    text_str: text to score.
    lexicon_dt: mapping indexed by word (anything supporting lexicon_dt[word]).

  Returns:
    float sum of the word polarities. Words that are missing from the lexicon
    or whose value cannot be converted to float contribute 0. Empty text
    returns 0.0 and prints an ERROR line.
  '''

  # Remove all not alphanumeric and whitespace characters
  text_str = re.sub(r'[^\w\s]', '', text_str)

  text_str = text_str.strip().lower()
  if (len(text_str) < 1):
      print(f"ERROR: text2sentiment() given empty/null/invalid string: {text_str}")

  # Accumulated Total Sentiment Polarity for entire Sentence
  text_sa_tot = 0.0

  for aword in text_str.split():
      try:
          text_sa_tot += float(lexicon_dt[aword])
      except Exception:
          # Best effort, as before: a missing word (KeyError), a None value
          # (TypeError), a non-numeric value (ValueError) or any other lookup
          # problem adds 0 to the sum. Unlike the former bare 'except:',
          # KeyboardInterrupt / SystemExit are no longer swallowed.
          continue

  return text_sa_tot


# Test

# text2sentiment('I hate and despise and abhor and dislike and am disgusted by Mondays.', lexicon_jockersrinker_dt)
# text2sentiment('hate Mondays.', lexicon_jockersrinker_dt)

In [None]:
def plot_smas(section_view=True, model_name='vader', text_unit='sentence', wins_ls=[20], alpha=0.5, subtitle_str='', y_height=0, save2file=False):
  '''
  Given a model, text_unit
  Plot a SMA using default values and wrapping the function get_smas()

  Args:
    section_view: True plots the current Section's data (section_sents_df /
      section_parags_df); False plots the whole corpus and draws vertical
      lines at Section and Chapter starts.
    model_name: sentiment column to smooth.
    text_unit: 'sentence', 'paragraph' or 'section' ('section' only with
      section_view=False).
    wins_ls: accepted for compatibility but ignored - the windows are fixed
      to [5,10,20] for sentences/paragraphs and [20] for sections.
    alpha, subtitle_str, save2file: forwarded to get_smas().
    y_height: y position of the 'Sec#<i>' labels.

  Returns:
    -99 if the text_unit / section_view combination is invalid, else None.

  Raises:
    KeyError: if model_name is not a column of the selected DataFrame.
  '''

  if (section_view == True) and not any(x == text_unit for x in ['sentence', 'paragraph']):
    print(f'ERROR: You can only plot SMA within a Section with Sentence or Paragraph text units')
    return -99

  if text_unit == 'sentence':
    if section_view == False:
      ts_df = corpus_sents_df
    else:
      ts_df = section_sents_df
    wins_ls = [5,10,20]
  elif text_unit == 'paragraph':
    if section_view == False:
      ts_df = corpus_parags_df
    else:
      ts_df = section_parags_df
    wins_ls = [5,10,20]
  elif text_unit == 'section':
    ts_df = corpus_sects_df
    wins_ls=[20]
  else:
    print(f'ERROR: {text_unit} must be sentence, paragraph or section')
    # Stop here: ts_df is undefined, continuing raised UnboundLocalError
    return -99

  # Fail before anything is drawn if the model column is missing
  # (replaces the unused 'sectno_loc = ts_df[model_name].min()' lookup)
  if model_name not in ts_df.columns:
    raise KeyError(model_name)

  if section_view ==False:
    # At Section boundaries draw labelled blue vertical lines
    section_boundries_ls = list(corpus_sects_df['sent_no_start'])
    for i, sent_no in enumerate(section_boundries_ls):
      plt.text(sent_no, y_height, f'Sec#{i}', alpha=0.2, rotation=90)
      plt.axvline(sent_no, color='blue', alpha=0.1)

    # At Chapter boundaries draw navy vertical lines
    chapter_boundries_ls = list(corpus_chaps_df['sent_no_start'])
    for sent_no in chapter_boundries_ls:
      plt.axvline(sent_no, color='navy', alpha=0.1)

  get_smas(ts_df, model_name=model_name, text_unit=text_unit, wins_ls=wins_ls, alpha=alpha, subtitle_str=subtitle_str, save2file=save2file)

  if (save2file == True):
    # Save graph to file.
    # NOTE: get_smas() has already saved the figure once under its own name
    plot_filename = f'plot_sma_sents_{model_name}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}');

  return

In [None]:
# SMA 5% Sentiment of Sentence Sentiment

def get_smas(ts_df, model_name, text_unit='sentence', wins_ls=[5,10], alpha=0.5, scale_factor=1., subtitle_str='', mean_adj=0., do_plot=True, save2file=False):
  '''
  Given a model_name and time series DataFrame and list of win_rolls in percentages
  Return the rolling means of the time series using the window sizes in win_rolls

  Args:
    ts_df: DataFrame with a model_name column and the x-axis column for the
      text unit ('sent_no', 'parag_no' or 'sect_no'). It is modified in
      place: one '{model_name}_mean_roll{NN}0' column per window is added,
      plus 'y_scaled' when plotting.
    model_name: column holding the sentiment values.
    text_unit: 'sentence', 'paragraph' or 'section'. For 'section' the passed
      wins_ls is replaced by a single window of 10% of corpus_sects_df's rows.
    wins_ls: window sizes as percentages of the number of rows in ts_df.
    alpha: line transparency.
    scale_factor, mean_adj: plotted value = rolling mean * scale_factor + mean_adj.
    subtitle_str: extra text for the plot title.
    do_plot: draw one seaborn line per window.
    save2file: save the current figure under a gen_pathfiletime() name.

  Returns:
    DataFrame with one rolling-mean column per window (empty if text_unit is
    invalid).
  '''

  temp_roll_df = pd.DataFrame()

  # 1% of the rows, but never 0: with fewer than 100 rows int() truncated to
  # 0 and every rolling window ended up with size 0
  win_1per = max(1, int(ts_df.shape[0]*0.01))
  if text_unit ==  'sentence':
    x_idx = 'sent_no'
    fname_abbr = 'sents'
  elif text_unit == 'paragraph':
    x_idx = 'parag_no'
    fname_abbr = 'parags'
  elif text_unit == 'section':
    win_1per = 1
    wins_ls = [int(0.1 * corpus_sects_df.shape[0])]  # Edge case to deal with very few Section data points
    x_idx = 'sect_no'
    fname_abbr = 'sects'
  else:
    print(f'ERROR: text_unit={text_unit} but must be either sentence, paragraph or section')
    # x_idx / fname_abbr are undefined, continuing raised UnboundLocalError
    return temp_roll_df

  for awin_size in wins_ls:
    # Column suffix: 5 -> '050', 10 -> '100', 20 -> '200'
    if len(str(awin_size)) == 1:
      awin_str = '0'+str(awin_size)+'0'
    else:
      awin_str = str(awin_size)+ '0'
    col_roll_str = f'{model_name}_mean_roll{awin_str}'
    # Keep the window at >= 1 row (the Section window is 0 for < 10 Sections)
    win_size = max(1, awin_size*win_1per)
    ts_df[col_roll_str] = ts_df[model_name].rolling(window=win_size, center=True).mean()
    temp_roll_df[col_roll_str] = ts_df[col_roll_str]

    if do_plot == True:
      alabel = f'{model_name} (win={awin_size})'
      ts_df['y_scaled'] = ts_df[col_roll_str]*scale_factor + mean_adj
      sns.lineplot(data=ts_df, x=x_idx, y='y_scaled', legend='brief', label=alabel, alpha=alpha)

  plt.title(f'{CORPUS_FULL} (Model: {model_name}: {subtitle_str}) \nSMA Smoothed {text_unit} Sentiment Plot (windows={wins_ls})')

  if save2file == True:
    # Save graph to file.
    plot_filename = f'plot_{fname_abbr}_sa_mean_050100sma.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}');

  return temp_roll_df

In [None]:
def get_lexstats(ts_df, model_name, text_unit='sentence'):
  '''
  Given a time series DataFrame and a model name
  calculate and store the min/max of ts_df[model_name].

  The result {'sentiment_min': ..., 'sentiment_max': ...} is stored in the
  notebook-level dict corpus_lexicons_stats_dt under the key
  '{model_name}_sents', '_parags', '_sects' or '_chaps', chosen by text_unit
  ('sentence', 'paragraph', 'section' or 'chapter').

  Returns None. For an unknown text_unit an error is printed and nothing is
  stored.
  '''

  global corpus_lexicons_stats_dt

  unit_suffix_dt = {'sentence': 'sents',
                    'paragraph': 'parags',
                    'section': 'sects',
                    'chapter': 'chaps'}

  if text_unit not in unit_suffix_dt:
    print(f'ERROR: {text_unit} must either be sentence, paragraph, section or chapter')
    # stat_idx would be undefined, continuing raised UnboundLocalError
    return

  stat_idx = f'{model_name}_{unit_suffix_dt[text_unit]}'

  corpus_lexicons_stats_dt[stat_idx] = {'sentiment_min' : ts_df[model_name].min(),
                                        'sentiment_max' : ts_df[model_name].max()}

  return

# Test
# get_lexstats('afinn')
# corpus_lexicons_stats_dt

In [None]:
def lex_discrete2continous_sentiment(text, lexicon):
  '''
  Given a plain text string and a lexicon (passed through to text2sentiment)
  Return a length-damped sentiment value for the whole text:

      sum of word sentiments / (ln(number of words) + 0.01)

  where the number of words is len(text.split()). Text without any word
  returns 0.0. The original notes state that the result is meant to be
  Normalized/Standardized afterwards, so absolute precision is not required.
  '''
  text_len = len(text.split())
  if text_len == 0:
    # Nothing to score; also avoids np.log(0) (-inf plus a RuntimeWarning)
    return 0.0

  # One call for the whole text gives the same sum as scoring word by word,
  # without text2sentiment printing an ERROR for every punctuation-only token
  text_sentiment_tot = text2sentiment(str(text), lexicon)

  text_sentiment_norm = text_sentiment_tot/(np.log(text_len)+0.01)

  return text_sentiment_norm

In [None]:
def clip_outliers(floats_ser, n_mads=2.5):
  '''
  Given a pd.Series of float values
  Return the values with outliers clipped to
  median +/- n_mads * (median absolute deviation from robust.mad)

  Args:
    floats_ser: pd.Series of numeric values.
    n_mads: clipping half-width in MADs. Defaults to 2.5, the value that was
      previously hard-coded.

  Returns:
    np.ndarray of clipped values, or - when the MAD is 0 - a plain list of
    the original values, unchanged.

  Prints the median, MAD, old min/max and the clipping limits.
  '''
  # https://www.statsmodels.org/stable/generated/statsmodels.robust.scale.mad.html#statsmodels.robust.scale.mad

  floats_np = np.array(floats_ser)
  ser_median = floats_ser.median()
  ser_mad = robust.mad(floats_np)
  print(f'ser_median = {ser_median}')
  print(f'ser_mad = {ser_mad}')

  if ser_mad == 0:
    # for TS with small ranges (e.g. -1.0 to +1.0) Median Abs Deviation = 0
    #   so pass back the original time series
    return list(floats_ser)

  print(f'ser_max = {floats_ser.max()}')
  print(f'ser_min = {floats_ser.min()}')

  ser_upperlim = ser_median + n_mads*ser_mad
  ser_lowerlim = ser_median - n_mads*ser_mad
  print(f'ser_upperlim = {ser_upperlim}')
  print(f'ser_lowerlim = {ser_lowerlim}')

  # Clip outliers to max or min values
  floats_clip_ls = np.clip(floats_np, ser_lowerlim, ser_upperlim)

  return floats_clip_ls

# Test
# Will not work on first run as corpus_sents_df is not defined yet
'''
data = np.array([1, 4, 4, 7, 12, 13, 16, 19, 22, 24])
test_ls = clip_outliers(corpus_sents_df['vader'])
print(f'new min is {min(test_ls)}')
print(f'new max is {max(test_ls)}')
''';

## **Sentiment**

In [None]:
# Read in lexicon at given path into Dict[word]=polarity

def get_lexicon(lexicon_name, lexicon_format=2):
    """
    Copy a sentiment lexicon CSV from '../sa_lexicons/' into the current
    directory (when it exists there) and read it into a DataFrame.

    Args:
        lexicon_name (str): file name of the lexicon CSV.
        lexicon_format (int, optional): currently unused. Defaults to 2.

    Returns:
        The DataFrame read from ./<lexicon_name> (its .info() summary is
        printed), or the integer -1 if the file could not be read.

    NOTE: this definition replaces the earlier get_lexicon cell in this notebook.
    """
    import shutil

    # Portable copy instead of os.system('copy ...') + os.system('cp ...'):
    # no shell string is built from lexicon_name, it works on every OS, and a
    # missing '../sa_lexicons/' directory no longer crashes os.listdir()
    src_path = os.path.join('../sa_lexicons/', lexicon_name.strip())
    if os.path.isfile(src_path):
      print(f'Found {lexicon_name} in lexicon_directory)')
      try:
        shutil.copy(src_path, './')
      except OSError as err:
        # Best effort, like the old shell commands: keep going and let
        # read_csv decide whether a usable local copy exists
        print(f'WARNING: Cannot copy {src_path} to the current directory ({err})')

    try:
      lexicon_df = pd.read_csv(lexicon_name)
      lexicon_df.info()
      return lexicon_df
    except Exception as err:
      # 'Exception' rather than a bare except: Ctrl-C / SystemExit still propagate
      print(f'ERROR: Cannot read lexicon.csv at {lexicon_name} ({err})')
      return -1

'''
    print
    if (sa_lexicon == 'default'):
        lexicon_df = pd.read_csv(LEXICON_PATH)
        lexicon_df.columns = ['index_no', 'word', 'polarity']
        lexicon_df.drop(['index_no'], axis=1, inplace=True)
        lexicon_df.dropna(inplace=True)
        lexicon_dt = lexicon_df.set_index('word').T.to_dict('list')
        # unlist the polarity to type: float
        for key in lexicon_dt:
            lexicon_dt[key] = float(lexicon_dt[key][0])
        
    ### print(f"Exit get_sa_lex() with {len(lexicon_dt.keys())} entries in syuzhet_dt")
    return lexicon_dt
''';

In [None]:
def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
  '''
  Given a DataFrame and a model_name column containing float sentiment values (one row per Sentence)
  Return a list of model paragraph sentiment values by aggregated/summing sentence sentiment values

  Args:
    ts_df: sentence-level DataFrame with a 'parag_no' column and a model_name
      column. Rows are expected in paragraph order, numbered from 0.
    model_name: name of the sentiment column to sum.

  Returns:
    List of per-paragraph sums. Paragraph numbers that are skipped in ts_df
    get an entry of 0, so for ordered input the list position equals parag_no.
    An empty ts_df yields [0].

  Changes from the previous version: the ts_df argument is now actually used
  (the loop used to read the global corpus_sents_df), and gaps in parag_no no
  longer desynchronize the paragraph pointer.
  '''

  parags_sentiment_ls = []
  parag_ptr = 0
  parag_sentiment_tot = 0

  for this_parag_no, this_sentiment in zip(ts_df['parag_no'], ts_df[model_name]):
    if this_parag_no == parag_ptr:
      parag_sentiment_tot += this_sentiment
      continue

    # A new Paragraph starts: close the current one ...
    parags_sentiment_ls.append(parag_sentiment_tot)
    # ... zero-fill any Paragraph numbers without a single Sentence ...
    skipped_ct = int(this_parag_no) - int(parag_ptr) - 1
    parags_sentiment_ls.extend([0] * max(0, skipped_ct))
    # ... and jump to the new Paragraph number (was: parag_ptr += 1)
    parag_sentiment_tot = this_sentiment
    parag_ptr = this_parag_no

  parags_sentiment_ls.append(parag_sentiment_tot) # Add the last remaining Paragraph Sentiment Value

  return parags_sentiment_ls

# Test
"""
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df)
len(parags_sentiment_ls)

corpus_parags_df['parag_no'].duplicated().any()
corpus_parags_df['parag_raw'].astype('str').apply(lambda x: len(x)==0).any()
""";

In [None]:
def polprob2sentiment(pol_str, prob_fl):
  '''
  Turn a (polarity label, probability) pair into one signed sentiment value.

  Given a Polarity string and a Probability float
  Return prob_fl multiplied by
    -1.0 if the label starts with 'neg' (case-insensitive)
    +1.0 if the label starts with 'pos' (case-insensitive)
     0.0 for any other label (an ERROR line is printed)
  '''
  label_lower = pol_str.lower()

  if label_lower.startswith('neg'):
    return -1.0 * prob_fl

  if label_lower.startswith('pos'):
    return 1.0 * prob_fl

  # Unrecognised label: report it and neutralise the value
  print(f'ERROR: pol_str: {pol_str} is neither Negative nor Positive')
  return 0.0 * prob_fl

# Test
polprob2sentiment('Positive', 0.91)

In [None]:
# Sentence to Sentiment Polarity according to passed in Lexicon Dictionary

def text2sentiment(text_str, lexicon_dt):
  '''
  Given a text_str and a lexicon_dt mapping lowercase words to sentiment values,
  Return the sentiment polarity of the text as the sum of the values of its words.

  text_str:   text to score; punctuation is stripped and the text is lowercased first
  lexicon_dt: mapping (dict or defaultdict) word -> value convertible to float

  Words that are missing from the lexicon, or whose value cannot be converted
  to float, contribute 0. Empty text prints an error and returns 0.0.
  '''

  # Remove all not alphanumeric and whitespace characters
  text_str = re.sub(r'[^\w\s]', '', text_str)

  text_str = text_str.strip().lower()
  if len(text_str) < 1:
    print(f"ERROR: text2sentiment() given empty/null/invalid string: {text_str}")
    return 0.0

  # Accumulated Total Sentiment Polarity for the entire text
  text_sa_tot = 0.0

  for aword in text_str.split():
    try:
      text_sa_tot += float(lexicon_dt[aword])
    except (KeyError, TypeError, ValueError):
      # KeyError: word not in lexicon; TypeError/ValueError: value is not numeric.
      # Only these expected lookup failures are skipped, so unrelated errors
      # (and KeyboardInterrupt) are no longer silently swallowed.
      continue

  return text_sa_tot


# Test

# sent2sentiment('I hate and despise and abhor and dislike and am disgusted by Mondays.', lexicon_jockersrinker_dt)
# sent2sentiment('hate Mondays.', lexicon_jockersrinker_dt)

In [None]:
def lex_discrete2continous_sentiment(text, lexicon):
  '''
  Given a plain text string and a lexicon (word -> sentiment value),
  Return a length-damped sentiment value for the whole text:
    sum of the word sentiments (via text2sentiment) / (log(number of words) + 0.01)

  Dividing by the log of the word count gives a more fine grained, continuous
  measure out of discrete per-word lexicon values. The 0.01 offset keeps the
  divisor non-zero for one-word texts (log(1) == 0), which means one-word texts
  are effectively multiplied by 100.
  Values are expected to be Normalized/Standardized later, so absolute precision is not required.

  Text without any words returns 0.0.
  '''
  text_ls = text.split()
  text_len = len(text_ls)

  if text_len == 0:
    # np.log(0) is -inf (with a divide-by-zero RuntimeWarning); nothing to score anyway
    return 0.0

  text_sentiment_tot = 0.
  for aword in text_ls:
    text_sentiment_tot += text2sentiment(str(aword), lexicon)

  return text_sentiment_tot/(np.log(text_len)+0.01)

In [None]:
def get_lexstats(ts_df, model_name, text_unit='sentence'):
  '''
  Given a time series DataFrame, a model_name column and a text_unit ('sentence' or 'paragraph')
  Calculate the min and max of that column and store them in the global
  corpus_lexicons_stats_dt under the key '<model_name>-sents' or '<model_name>-parags'

  Return None. An unsupported text_unit prints an error and stores nothing.
  '''

  global corpus_lexicons_stats_dt

  if text_unit == 'sentence':
    stat_idx = f'{model_name}-sents'
  elif text_unit == 'paragraph':
    stat_idx = f'{model_name}-parags'
  else:
    print(f'ERROR: {text_unit} must either be sentence or paragraph')
    # Without a valid key there is nothing to store (falling through used to
    # fail with an UnboundLocalError on stat_idx)
    return

  corpus_lexicons_stats_dt[stat_idx] = {'sentiment_min' : ts_df[model_name].min(),
                                        'sentiment_max' : ts_df[model_name].max()}

  return

# Test
# get_lexstats('afinn')
# corpus_lexicons_stats_dt

## **Normalize, Standardize and Outliers**

In [None]:
def norm2negpos1(data_ser):
  '''
  Given a Series or single/multi-column DataFrame of floating point numbers
  Return a 2-D numpy array of the same values rescaled per column to the range -1.0 to +1.0
  (the output of MinMaxScaler.fit_transform)

  A 1-D input (e.g. a pd.Series) is treated as one column of samples and
  comes back with shape (n, 1).
  '''
  # np.asarray instead of np.matrix: np.matrix input is no longer accepted by
  # current scikit-learn, and it turned a Series into a single row (1, n) so
  # every "column" held one sample and scaled to -1.0
  data_np = np.asarray(data_ser)
  if data_np.ndim == 1:
    data_np = data_np.reshape(-1, 1)

  scaler = MinMaxScaler(feature_range=(-1.0, 1.0))

  return scaler.fit_transform(data_np)

# Test
'''
temp_np = norm2negpos1(corpus_all_df[['xlnet_sst5']])
print(type(temp_np))
temp_np.shape
''';

In [None]:
def standardize_ts(data_ser):
  '''
  Given a Series or single/multi-column DataFrame of floating point numbers
  Return a 2-D numpy array of the same values standardized per column
  (the output of StandardScaler.fit_transform with its default settings)

  A 1-D input (e.g. a pd.Series) is treated as one column of samples and
  comes back with shape (n, 1).
  '''
  data_np = np.asarray(data_ser)
  if data_np.ndim == 1:
    # StandardScaler only accepts 2-D input; make a Series a single column
    data_np = data_np.reshape(-1, 1)

  std_scaler = StandardScaler()

  return std_scaler.fit_transform(data_np)

# Test
'''
temp_np = norm2negpos1(corpus_all_df[['xlnet_sst5']])
print(type(temp_np))
temp_np.shape
temp_np
''';

In [None]:
def clip_outliers(floats_ser, mad_mult=2.5):
  '''
  Given a pd.Series of float values
  Return the values with outliers clipped to median +/- mad_mult * MAD,
  where MAD is the median absolute deviation computed by robust.mad

  floats_ser: pd.Series of floats
  mad_mult:   half-width of the allowed band in MADs (default 2.5)

  Returns a numpy array of clipped values, or, when the MAD is 0 (clipping
  would collapse every value onto the median), a plain list holding the
  original values unchanged.
  Prints the median, MAD, old min/max and the clip limits as a side effect.
  '''
  # https://www.statsmodels.org/stable/generated/statsmodels.robust.scale.mad.html#statsmodels.robust.scale.mad

  floats_np = np.array(floats_ser)
  ser_median = floats_ser.median()
  ser_mad = robust.mad(floats_np)
  print(f'ser_median = {ser_median}')
  print(f'ser_mad = {ser_mad}')

  if ser_mad == 0:
    # for TS with small ranges (e.g. -1.0 to +1.0) Median Abs Deviation = 0
    #   so pass back the original time series
    return list(floats_ser)

  print(f'ser_max = {floats_ser.max()}')
  print(f'ser_min = {floats_ser.min()}')

  ser_upperlim = ser_median + mad_mult*ser_mad
  ser_lowerlim = ser_median - mad_mult*ser_mad
  print(f'ser_upperlim = {ser_upperlim}')
  print(f'ser_lowerlim = {ser_lowerlim}')

  # Clip outliers to the upper/lower limits
  return np.clip(floats_np, ser_lowerlim, ser_upperlim)

# Test
# Will not work on first run as corpus_sents_df is not defined yet
'''
data = np.array([1, 4, 4, 7, 12, 13, 16, 19, 22, 24])
test_ls = clip_outliers(corpus_sents_df['vader'])
print(f'new min is {min(test_ls)}')
print(f'new max is {max(test_ls)}')
''';

## **Smoothing**

In [None]:
# Simple Moving Average (SMA) smoothing of Sentence/Paragraph Sentiment (window sizes given in % of corpus length)

def get_smas(ts_df, model_name, text_unit='sentence', win_ls=[5,10], scale_factor=1., mean_adj=0., do_plot=True, save2file=False):
  '''
  Given a time series DataFrame, a model_name column and a list of window sizes win_ls
  Add one centered rolling-mean column '<model_name>_mean_roll<NN>0' per window to ts_df,
  optionally plot and save them, and return the rolling means as a new DataFrame

  ts_df:        DataFrame with the model_name column and a 'sent_no'/'parag_no' column
  text_unit:    'sentence' or 'paragraph'; selects the global win_s1per/win_p1per that
                each window size is multiplied by (presumably rows per 1% of the corpus
                -- defined elsewhere in the notebook, confirm there)
  win_ls:       window sizes; the default list is only read, never modified
  scale_factor, mean_adj: linear adjustment applied to the plotted values only
  do_plot:      draw one line per window (also writes a helper column 'y_scaled' into ts_df)
  save2file:    save the current figure under a name built from the unit, windows and model

  An unsupported text_unit prints an error and returns an empty DataFrame.
  '''

  if text_unit ==  'sentence':
    win_1per = win_s1per
    x_idx = 'sent_no'
    fname_abbr = 'sents'
  elif text_unit == 'paragraph':
    win_1per = win_p1per
    x_idx = 'parag_no'
    fname_abbr = 'parags'
  else:
    print(f'ERROR: text_unit={text_unit} but must be either sentence or paragraph')
    # Nothing sensible can be computed (falling through used to fail with a NameError)
    return pd.DataFrame()

  roll_cols_ls = []
  win_strs_ls = []

  for awin_size in win_ls:
    # 5 -> '050', 10 -> '100': zero-pad to two digits, then append '0'
    awin_str = str(awin_size).zfill(2) + '0'
    win_strs_ls.append(awin_str)

    col_roll_str = f'{model_name}_mean_roll{awin_str}'
    roll_cols_ls.append(col_roll_str)
    ts_df[col_roll_str] = ts_df[model_name].rolling(awin_size*win_1per, center=True).mean()

    if do_plot == True:
      alabel = f'{model_name} (win={awin_size})'
      ts_df['y_scaled'] = ts_df[col_roll_str]*scale_factor + mean_adj
      sns.lineplot(data=ts_df, x=x_idx, y='y_scaled', legend='brief', label=alabel)

  plt.title(f'{CORPUS_FULL} (Model: {model_name}) \nSMA Smoothed {text_unit} Sentiment Plot (windows={win_ls})')

  if save2file == True:
    # Save graph to file. The name reflects the actual windows and model
    # (it used to be hard-coded to '050100mean_afinn', so every model overwrote the same file).
    plot_filename = f"plot_{fname_abbr}_sa_{''.join(win_strs_ls)}mean_{model_name}.png"
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}')

  # Hand back just the rolling-mean columns as an independent copy
  return ts_df[roll_cols_ls].copy()

In [None]:
def plot_lowess(ts_df, df_cols_ls, do_plot=True, afrac=1./10, ait=5):
  '''
  Given a DataFrame, list of columns to smooth, LOWESS params fraction (afrac) and iterations (ait),
  Return a DataFrame with one '<column>_lowess' column of LOWESS values per input column
  If do_plot=True, also plot every smoothed curve on the current figure

  The DataFrame index is used as the x values (exog) of the smoother.
  '''

  lowess_df = pd.DataFrame()

  for acol in df_cols_ls:
    sm_x, sm_y = sm_lowess(endog=ts_df[acol].values, exog=ts_df.index.values,  frac=afrac, it=ait, return_sorted = True).T
    lowess_df[f'{acol}_lowess'] = pd.Series(sm_y)
    if do_plot:
      plt.plot(sm_x, sm_y, label=f'{afrac:.3f}', alpha=0.5, linewidth=2)

  if do_plot and len(df_cols_ls) > 0:
    # Title and legend are set once, and name the columns actually plotted
    # (the title used to read the unrelated global sa_model on every pass)
    frac_str = str(round(100*afrac))
    cols_str = ', '.join(str(acol) for acol in df_cols_ls)
    plt.title(f'{CORPUS_FULL} \n LOWESS (frac={frac_str}) Sentence Sentiment ({cols_str}, frac={afrac})')
    plt.legend(title='LOWESS fraction')

  return lowess_df

# Test
"""
new_lowess_col = f'{sa_model}_lowess'
my_frac = 1./10
my_frac_per = round(100*my_frac)
new_lowess_col = f'{sa_model}_lowess_{my_frac_per}'
corpus_sents_df[new_lowess_col] = plot_lowess(corpus_sents_df, [sa_model], afrac=my_frac)
corpus_sents_df.head()
""";

## **Classifiers**

## **Pandas**

In [None]:
def rename_cols(ts_df, col_old_ls, suffix_str='_raw'):
  '''
  Build a column-renaming map: every name in col_old_ls is mapped to the
  same name with suffix_str appended.

  ts_df is accepted but neither read nor modified; apply the returned dict
  yourself, e.g. with DataFrame.rename(columns=...).
  '''

  # New names in the same order as the old ones
  suffixed_ls = [old_name + suffix_str for old_name in col_old_ls]

  # keys=old col names, values=new col names
  return dict(zip(col_old_ls, suffixed_ls))

# test_ls = [col for col in corpus_sents_df.columns if not(renaming_fun(col) is None)]
# print(f'test_ls: {test_ls}')

# Test
# col_rename_dt = rename_cols(corpus_sents_df, sentiment_only_cols_ls)
# col_rename_dt

In [None]:
def get_cols_regex(ts_df, find_regex, ignore_regex, strict_match=False):
  '''
  Given a DataFrame and 2 regex strings: one to match/find and one to ignore/discard
  Return the list of column names that match find_regex and do not match ignore_regex

  find_regex:   pattern searched for anywhere in the column name
  ignore_regex: columns containing this pattern are dropped; '' or None ignores nothing
  strict_match: if True the whole column name must match find_regex
                (not just a substring)
  '''

  if strict_match:
    # Group before anchoring: '^a|b$' would mean "starts with a OR ends with b"
    find_regex = f'^(?:{find_regex})$'

  find_comp = re.compile(find_regex)
  # An empty pattern matches every string, which would discard every column
  ignore_comp = re.compile(ignore_regex) if ignore_regex else None

  cols_match = []
  for acol in ts_df.columns:
    if not find_comp.search(acol):
      continue
    if ignore_comp is not None and ignore_comp.search(acol):
      continue
    cols_match.append(acol)

  return cols_match



## **Stanford ASAP**

In [None]:
# ASAP smooth_simple and support functions

def moving_average(data, _range):
    """Return the means of every run of _range consecutive values of data.

    Works on a float cumulative sum: the sum of a window is the running total
    at its end minus the running total just before its start.
    The result has len(data) - _range + 1 entries.
    """
    running = np.cumsum(data, dtype=float)
    lagged, leading = running[:-_range], running[_range:]
    running[_range:] = leading - lagged
    return running[_range - 1:] / _range

def SMA(data, _range, slide):
    """Simple moving average of data with window _range, keeping every slide-th value.

    Returns a list.
    """
    averaged = moving_average(data, _range)
    return list(averaged[::slide])

def kurtosis(values):
    """Return scipy.stats.kurtosis of values, using SciPy's default arguments."""
    kurt = scipy.stats.kurtosis(values)
    return kurt

def roughness(vals):
    """Return the standard deviation of the differences between consecutive values."""
    steps = np.diff(vals)
    return np.std(steps)

def smooth_simple(data, max_window=5, resolution=None):
    """Brute-force search for an ASAP smoothing window.

    Tries every window w from 2 up to len(data)/max_window and keeps the one
    whose SMA-smoothed series has the lowest roughness while its kurtosis is
    at least that of the unsmoothed data.

    data:       sequence of numbers
    max_window: the largest window tried is len(data) / max_window
    resolution: if given, data is first pre-aggregated with
                slide_size = int(len(data) / resolution) when that is > 1

    Returns (window_size, slide_size); both are at least 1.
    """
    data = np.array(data)
    # Preaggregate according to resolution
    window_size = 1
    slide_size = 1
    if resolution:
        # Clamp to 1: with fewer points than `resolution` the quotient is 0, and a
        # slide/window of 0 breaks later SMA(data, slide_size, slide_size) calls
        slide_size = max(1, int(len(data) / resolution))
        if slide_size > 1:
            data = SMA(data, slide_size, slide_size)
    orig_kurt   = kurtosis(data)
    min_obj     = roughness(data)
    range_lim = int(len(data) / max_window + 1)  # 20210621 insert: Fix JChun
    for w in range(2, range_lim):
        smoothed = SMA(data, w, 1)
        if kurtosis(smoothed) >= orig_kurt:
            r = roughness(smoothed)
            if r < min_obj:
                min_obj = r
                window_size = w
    return window_size, slide_size

In [None]:
# ASAP asap_smooth and utility functions

class Metrics(object):
    """Kurtosis and roughness of a series, each computed on first access and cached."""

    def __init__(self, values):
        self.set_values(values)

    def set_values(self, values):
        """Replace the series and drop both cached metrics."""
        self.values = values
        self.k = None
        self.r = None

    @property
    def kurtosis(self):
        """scipy.stats.kurtosis of the values (cached in self.k)."""
        if self.k is not None:
            return self.k
        self.k = scipy.stats.kurtosis(self.values)
        return self.k

    @property
    def roughness(self):
        """Standard deviation of the consecutive differences (cached in self.r)."""
        if self.r is not None:
            return self.r
        self.r = np.std(np.diff(self.values))
        return self.r

class ACF(Metrics):
    """Autocorrelation of a series (computed via FFT) plus its peaks.

    Attributes set by __init__:
      max_lag      -- number of lags kept (default len(values) / 5), as int
      correlations -- autocorrelation for lags 0 .. max_lag-1, normalized by lag 0
      peaks        -- list of lags of local autocorrelation maxima above CORR_THRESH;
                      if at most one such peak exists, every lag from 2 upward
      max_acf      -- largest correlation among the detected peaks (0.0 if none)
    """
    CORR_THRESH = 0.2

    def __init__(self, values, max_lag=None):
        super(ACF, self).__init__(values)
        if max_lag is None:
            max_lag = len(values) / 5
        self.max_lag = int(max_lag)
        self.max_acf = 0.0

        # Calculate autocorrelation via FFT
        # Demean
        demeaned = values - np.mean(values)
        # Zero-pad the data to a power of 2 greater than its length
        l = int(2.0 ** (int(math.log(len(demeaned),2.0)) + 1))
        padded = np.append(demeaned, ([0.0] * (l - len(demeaned))))
        # FFT and inverse FFT (np.fft, so the block relies on the `np` alias only)
        F_f = np.fft.fft( padded )
        R_t = np.fft.ifft( F_f * np.conjugate(F_f) )
        self.correlations = R_t[:self.max_lag].real / R_t[0].real

        # Find autocorrelation peaks
        self.peaks = []
        if len(self.correlations) > 1:
            # positive: the curve is currently rising; peak_idx: best lag of the current rise
            positive = self.correlations[1] > self.correlations[0]
            peak_idx = 1
            for i in range(2, len(self.correlations)):
                if not positive and self.correlations[i] > self.correlations[i-1]:
                    # Curve turns upward: start tracking a new candidate peak
                    peak_idx = i
                    positive = not positive
                elif positive and self.correlations[i] > self.correlations[peak_idx]:
                    peak_idx = i
                elif positive and self.correlations[i] < self.correlations[i-1]:
                    # Curve turns downward: record the candidate if it is high enough
                    if peak_idx > 1 and self.correlations[peak_idx] > self.CORR_THRESH:
                        self.peaks.append(peak_idx)
                        if self.correlations[peak_idx] > self.max_acf:
                            self.max_acf = self.correlations[peak_idx]
                    positive = not positive
        # If there is no autocorrelation peak within the MAX_WINDOW boundary,
        # try windows from the largest to the smallest
        if len(self.peaks) <= 1:
            # A list (not a range object) so that peaks always has the same type
            self.peaks = list(range(2, len(self.correlations)))
                    
def moving_average(data, _range):
    """Return the means of every run of _range consecutive values of data.

    Uses a cumulative sum; the result has len(data) - _range + 1 entries.
    The sum is accumulated as float (float64), matching the earlier definition
    of this helper in the notebook, so float32 or narrow integer input does
    not lose precision in the running total.
    """
    ret = np.cumsum(data, dtype=float)
    ret[_range:] = ret[_range:] - ret[:-_range]
    return ret[_range - 1:] / _range

def SMA(data, _range, slide):
    """Simple moving average with window _range, sampled every slide-th value, as a list."""
    return [avg for avg in moving_average(data, _range)[::slide]]
                    
def binary_search(head,tail,data,min_obj,orig_kurt,window_size):
    """Binary search over window sizes head..tail for a smoother SMA window.

    A midpoint window whose smoothed kurtosis is at least orig_kurt moves the
    search to larger windows (and replaces the best window if its roughness is
    below the best so far); otherwise the search moves to smaller windows.
    Returns the best window found, or the window_size passed in if none improved.
    """
    lo, hi = head, tail
    best_window, best_roughness = window_size, min_obj

    while lo <= hi:
        mid = int(round((lo + hi) / 2.0))
        candidate = Metrics(SMA(data, mid, 1))

        if not candidate.kurtosis >= orig_kurt:
            # Kurtosis not preserved: only smaller windows remain
            hi = mid - 1
            continue

        if candidate.roughness < best_roughness:
            best_window = mid
            best_roughness = candidate.roughness
        lo = mid + 1

    return best_window

def smooth_ASAP(data, max_window=5, resolution=None):
    """Search for an ASAP smoothing window using autocorrelation peaks.

    data:       sequence of numbers
    max_window: len(data) / max_window bounds both the ACF max_lag and the
                upper end (tail) of the final binary search
    resolution: if given and len(data) >= 2 * resolution, data is first
                pre-aggregated with SMA using slide_size = int(len(data) / resolution)

    Returns (window_size, slide_size).
    """
    data = np.array(data)
    # Preaggregate according to resolution
    slide_size = 1
    window_size = 1
    if resolution and len(data) >= 2 * resolution:
        slide_size = int(len(data) / resolution)  # 20210621 JChun
        data = SMA(data, slide_size, slide_size)
    acf         = ACF(data, max_lag=len(data) / max_window)
    peaks       = acf.peaks
    # Baseline metrics of the unsmoothed (pre-aggregated) series
    orig_kurt   = acf.kurtosis
    min_obj     = acf.roughness
    lb          = 1
    largest_feasible = -1
    tail = int(len(data) / max_window)  # 20210621 JChun
    # Walk the candidate windows (ACF peaks) from the last one to the first
    for i in range(len(peaks) - 1, -1, -1):
        w = peaks[i]

        if w < lb or w == 1:
            # Remaining candidates are below the lower bound
            break
        elif math.sqrt(1 - acf.correlations[w]) * window_size > math.sqrt(1 - acf.correlations[window_size]) * w:
            # Skip w without smoothing, based on the correlations at w and at the current best window
            continue

        smoothed = SMA(data, w, 1)
        metrics = Metrics(smoothed)
        # Accept w only if it is smoother and keeps at least the original kurtosis
        if metrics.roughness < min_obj and metrics.kurtosis >= orig_kurt:
            min_obj = metrics.roughness
            window_size = w
            lb = round( max(w*math.sqrt( (acf.max_acf -1) / (acf.correlations[w]-1) ), lb) )
    # NOTE(review): largest_feasible is never reassigned after its initial -1 in this
    # function, so this branch does not run as written -- confirm against the upstream ASAP code
    if largest_feasible > 0:
        if largest_feasible < len(peaks) - 2:
            tail = peaks[largest_feasible + 1]
        lb = max(lb, peaks[largest_feasible] + 1)

    # Refine between the lower bound and tail
    window_size = binary_search(lb, tail, data, min_obj, orig_kurt, window_size)
    return window_size, slide_size

In [None]:
# ASAP utility function to read data from CSV (not used)

def load_csv(fname, input_column=1):
    '''
    Given the name of a CSV file with one header row
    Return the values of column input_column (0-based) as a list of floats

    An empty file returns an empty list.
    Raises ValueError (after printing the first data row as a hint) if a value
    in that column cannot be converted to float. It no longer calls exit(),
    which would shut down the notebook kernel.
    '''
    import csv

    # newline='' is what the csv module expects for files it reads
    with open(fname, 'r', newline='') as ifh:
        icsv = csv.reader(ifh)
        next(icsv, None)  # skip the header row (no error if the file is empty)
        rows = list(icsv)

    try:
        data = [ float(x[input_column]) for x in rows ]
    except ValueError:
        print("couldn't convert input-column={0} float".format(input_column))
        if rows:
            print("first row:")
            for idx,x in enumerate(rows[0]):
                print('  column {:3d}: {}'.format(idx,x))
        raise
    return data

In [None]:
# ASAP Simple (Brute Force)
"""
def moving_average(data, _range):
    ret = np.cumsum(data, dtype=float)
    ret[_range:] = ret[_range:] - ret[:-_range]
    return ret[_range - 1:] / _range

def SMA(data, _range, slide):
    ret = moving_average(data, _range)[::slide]
    return list(ret)

def kurtosis(values):
    return scipy.stats.kurtosis(values)

def roughness(vals):
    return np.std(np.diff(vals))

def smooth_simple(data, max_window=5, resolution=None):
    data = np.array(data)
    # Preaggregate according to resolution
    window_size = 1
    slide_size = 1
    if resolution:
        slide_size = int(len(data) / resolution)
        if slide_size > 1:
            data = SMA(data, slide_size, slide_size)
    orig_kurt   = kurtosis(data)
    min_obj     = roughness(data)
    for w in range(2, int(len(data) / max_window + 1)):
        smoothed = SMA(data, w, 1)
        if kurtosis(smoothed) >= orig_kurt:
            r = roughness(smoothed)
            if r < min_obj:
                min_obj = r
                window_size = w
    return window_size, slide_size
""";

In [None]:
# ASAP Plot time series before and after smoothing

def plot(data, window_size, slide_size, plot_title):
    '''
    Given a time series, an ASAP window_size and slide_size and a plot title
    Plot the pre-aggregated series (top) and its SMA-smoothed version (bottom)
    Return (smoothed_range, smoothed): the x positions and values of the smoothed series
    '''
    plt.clf()
    plt.figure()
    # Pre-aggregate with the slide size chosen by the ASAP search
    data = SMA(data, slide_size, slide_size)
    method_names = ["SMA Smoothed", "ASAP Smoothed"]
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    # Smooth with the chosen window. (This used to call smooth_simple(), which
    # returns the (window_size, slide_size) pair rather than a smoothed series.)
    smoothed = SMA(data, window_size, 1)
    # Shift x by half a window so the smoothed curve lines up with the data
    smoothed_range = range(int(window_size/2), int(window_size/2) + len(smoothed))
    ax1.set_xlim(0, len(data))
    ax1.plot(data, linestyle='-', linewidth=1.5)
    ax2.plot(smoothed_range, smoothed, linestyle='-', linewidth=1.5)
    axes = [ax1, ax2]
    for i in range(2):
        axes[i].get_xaxis().set_visible(False)
        axes[i].text(0.02, 0.8, "%s" %(method_names[i]),
            verticalalignment='center', horizontalalignment='left',
            transform=axes[i].transAxes, fontsize=25)

    fig.set_size_inches(16, 12)
    plt.tight_layout(w_pad=1)
    plt.title(plot_title)
    plt.show()

    return smoothed_range, smoothed

In [None]:
# Plot both SMA and ASAP Smoothed Sentiment Analysis Time Series

def plot_asap(model_name, data, window_size, slide_size, do_plot=True, save2file=False):
    '''
    Given a model name, its sentiment time series and an ASAP window_size and slide_size
    Plot the pre-aggregated series (top) and its SMA-smoothed version (bottom)
    Return (smoothed_range, smoothed): the x positions and values of the smoothed series

    do_plot:   show the figure; if False the figure is closed without being shown
    save2file: save the figure as 'plot_sent_asap_<model>_<author_str>_<title_str>.png',
               where author_str and title_str are the notebook's global corpus strings
    '''
    plt.clf()
    plt.figure()
    # Pre-aggregate with the slide size chosen by the ASAP search
    data = SMA(data, slide_size, slide_size)
    method_names = ["SMA Smoothed", "ASAP Smoothed"]
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    smoothed = SMA(data, window_size, 1)
    # Shift x by half a window so the smoothed curve lines up with the data
    smoothed_range = range(int(window_size/2), int(window_size/2) + len(smoothed))
    ax1.set_xlim(0, len(data))
    ax1.plot(data, linestyle='-', linewidth=1.5)
    # Kept in its own variable: naming it title_str shadowed the global title_str
    # and put the whole plot title (spaces, parentheses) into the file name below
    plot_title_str = f'Stanford ASAP Smoothing of {model_name} (win={window_size}, slide={slide_size})'
    ax2.title.set_text(plot_title_str)
    ax2.plot(smoothed_range, smoothed, linestyle='-', linewidth=1.5)
    axes = [ax1, ax2]
    for i in range(2):
        axes[i].get_xaxis().set_visible(False)
        axes[i].text(0.02, 0.8, "%s" %(method_names[i]),
            verticalalignment='center', horizontalalignment='left',
            transform=axes[i].transAxes, fontsize=25)

    fig.set_size_inches(16, 6)
    plt.tight_layout(w_pad=1)

    if save2file:
        # Save Plot to file.
        plot_filename = f'plot_sent_asap_{model_name}_{author_str}_{title_str}.png'
        plt.savefig(plot_filename, format='png', dpi=300)
        print(f'Plot saved: {plot_filename}')

    if do_plot:
        plt.show()
    else:
        plt.close(fig)

    return smoothed_range, smoothed

## **Time Series**

In [None]:
def norm2negpos1(data_ser):
  '''
  Given a Series or single/multi-column DataFrame of floating point numbers
  Return a 2-D numpy array of the same values rescaled per column to the range -1.0 to +1.0
  (the output of MinMaxScaler.fit_transform)

  A 1-D input (e.g. a pd.Series) is treated as one column of samples and
  comes back with shape (n, 1).
  '''
  # np.asarray instead of np.matrix: np.matrix input is no longer accepted by
  # current scikit-learn, and it turned a Series into a single row (1, n) so
  # every "column" held one sample and scaled to -1.0
  data_np = np.asarray(data_ser)
  if data_np.ndim == 1:
    data_np = data_np.reshape(-1, 1)

  scaler = MinMaxScaler(feature_range=(-1.0, 1.0))

  return scaler.fit_transform(data_np)

# Test
'''
temp_np = norm2negpos1(corpus_all_df[['xlnet_sst5']])
print(type(temp_np))
temp_np.shape
''';

In [None]:
def standardize_ts(data_ser):
  '''
  Given a Series or single/multi-column DataFrame of floating point numbers
  Return a 2-D numpy array of the same values standardized per column
  (the output of StandardScaler.fit_transform with its default settings)

  A 1-D input (e.g. a pd.Series) is treated as one column of samples and
  comes back with shape (n, 1).
  '''
  data_np = np.asarray(data_ser)
  if data_np.ndim == 1:
    # StandardScaler only accepts 2-D input; make a Series a single column
    data_np = data_np.reshape(-1, 1)

  std_scaler = StandardScaler()

  return std_scaler.fit_transform(data_np)

# Test
'''
temp_np = norm2negpos1(corpus_all_df[['xlnet_sst5']])
print(type(temp_np))
temp_np.shape
temp_np
''';

In [None]:
# This must be defined AFTER the corpus_sects_df DataFrame is created in the Preprocessing Step below

# Raw Plot of Section Sentiments (Adjusted for (x-axis) mid-Section Sentence No and (y-axis) Sentiment weighted by Section length )

# corpus_sects_df = pd.DataFrame()  # Create empty early as required by some utility functions

def plot_crux_sections(model_names_ls, semantic_type='section', subtitle_str='', label_token_ct=0, title_xpos = 0.8, title_ypos=0.2, sec_y_height=0, save2file=False):
  '''
  Given a list of model (column) names and a semantic type ('section' or 'chapter'),
  Plot the Section/Chapter sentiment of every model against the mid-point Sentence No
  and mark the Section and Chapter boundaries with vertical lines.
  Reads the global DataFrames corpus_sects_df, corpus_chaps_df and corpus_sents_df.

  model_names_ls: column names; the part before the first '_' must be in MODELS_LS
  subtitle_str:   extra line appended to the plot title
  label_token_ct: (single model only) <0: no point labels, 0: label with Sentence No,
                  >0: also add that many leading words of the sentence
  title_xpos, title_ypos: position of the title
  sec_y_height:   y position of the 'Sec#' boundary labels
  save2file:      save the figure via gen_pathfiletime()

  Return a Dictionary {sent_no_mid: [sentiment, sentence text]} of the plotted points
  when exactly one model is given, an empty Dictionary for several models,
  or -1 if semantic_type or a model name is invalid.
  '''

  crux_points_dt = {}

  # Validate first so that a bad call does not draw boundary lines on the current figure
  if semantic_type not in ('section', 'chapter'):
    print(f"ERROR: semantic_type={semantic_type} must be either 'section' or 'chapter'")
    return -1

  # At Section boundaries draw blue vertical lines
  section_boundries_ls = list(corpus_sects_df['sent_no_start'])
  for i, sent_no in enumerate(section_boundries_ls):
    plt.text(sent_no, sec_y_height, f'Sec#{i}', alpha=0.2, rotation=90)
    plt.axvline(sent_no, color='blue', alpha=0.1)

  # At Chapter boundaries draw navy vertical lines
  chapter_boundries_ls = list(corpus_chaps_df['sent_no_start'])
  for sent_no in chapter_boundries_ls:
    plt.axvline(sent_no, color='navy', alpha=0.1)

  # Assign the DataFrame associated with the semantic_type and get its midpoints
  if semantic_type == 'section':
    ts_df = corpus_sects_df
  else:
    ts_df = corpus_chaps_df
  midpoints_ls = list(ts_df['sent_no_mid'])

  # How many sentiment time series are we plotting?
  if len(model_names_ls) == 1:

    # Plotting only one model
    model_name_full = str(model_names_ls[0])
    model_name_root = model_name_full.split('_')[0]
    print(f'model_name_full: {model_name_full} and model_name_root: {model_name_root}')
    if model_name_root in MODELS_LS:
      # Plot
      print(f'about to sns.lineplot model: ')
      sns.lineplot(data=ts_df, x='sent_no_mid', y=model_name_full, markers=['o'], alpha=0.5, label=model_name_full)
    else:
      print(f'ERROR: model_names_ls[0]={model_name_root} is invalid,\n    must be one of {MODELS_LS}')
      return -1

    # If plotting only one model, add labels
    midpoints_sentiment_ls = list(ts_df[model_name_full])
    label_token_int = int(label_token_ct)
    for sect_ct, (x, y) in enumerate(zip(midpoints_ls, midpoints_sentiment_ls)):
      if label_token_int < 0:
        label = ''
      elif label_token_int == 0:
        # if arg label_token_ct == 0, just print sent_no
        label = f"#{x}({sect_ct})"
      else:
        # if arg label_token_ct > 0, print the first label_token_ct words of sentence at crux point
        # NOTE(review): the label reads row x-1 while the crux text below reads row x --
        #   confirm which offset matches the sent_no numbering
        label = f"#{x}({sect_ct}) {' '.join(corpus_sents_df.iloc[x-1]['sent_raw'].split()[:label_token_int])}"

      # Save Crux point in crux_points_dt Dictionary if plotting Cruxes for a single/specific Model
      crux_full_str = ' '.join(corpus_sents_df.iloc[x]['sent_raw'].split())
      crux_points_dt[x] = [y, crux_full_str]

      plt.annotate(label,
                   (x,y),
                   textcoords='offset points',
                   xytext=(0,10),
                   ha='center',
                   rotation=90)

    plt.title(f'{CORPUS_FULL} \n Plot {semantic_type.capitalize()} Sentiment ({model_name_full.capitalize()})\n{subtitle_str}', x=title_xpos, y=title_ypos)
    # Plot the points again with markers
    plt.plot(midpoints_ls, midpoints_sentiment_ls, marker="o", ms=6)

  else:
    # If plotting multiple models
    for model_name_full in model_names_ls:
      # Error check the model name
      model_name_root = model_name_full.split('_')[0]
      if model_name_root not in MODELS_LS:
        print(f'ERROR: model_names_ls[]={model_name_root} is invalid,\n    must be one of {MODELS_LS}')
        return -1

      # Plot each model exactly once (it used to be drawn twice, duplicating legend entries)
      sns.lineplot(data=ts_df, x='sent_no_mid', y=model_name_full, markers=['o'], alpha=0.5, label=model_name_full)

    plt.title(f'{CORPUS_FULL} \n Plot {semantic_type.capitalize()} Sentiment (Standardized Models)\n{subtitle_str}', x=title_xpos, y=title_ypos)

  if (save2file == True):
    # Save graph to file; models are abbreviated to their first two characters
    models_names_ls = [x[:2] for x in model_names_ls]
    models_names_str = ''.join(models_names_ls)
    plot_filename = f'plot_cruxes_{semantic_type}_{models_names_str}_{models_names_str}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}')

  return crux_points_dt

In [None]:
def plot_histogram(model_name='vader', text_unit='sentence', save2file=False):
  '''
  Given a model name and a text_unit ('sentence', 'paragraph', 'section' or 'chapter')
  Plot a Histogram (with KDE) of that model's sentiment values, taken from the
  global corpus DataFrame of that text unit
  Optionally save the plot to file

  Return None. An unsupported text_unit prints an error and plots nothing.
  '''

  # The globals are looked up lazily so only the requested DataFrame has to exist
  if text_unit == 'sentence':
    ts_df = corpus_sents_df
  elif text_unit == 'paragraph':
    ts_df = corpus_parags_df
  elif text_unit == 'section':
    ts_df = corpus_sects_df
  elif text_unit == 'chapter':
    ts_df = corpus_chaps_df
  else:
    print(f'ERROR: {text_unit} must be sentence, paragraph, section or chapter')
    # Falling through used to fail with an UnboundLocalError on ts_df
    return

  sns.histplot(ts_df[model_name], kde=True).set_title(f'{CORPUS_FULL} \n Histogram {text_unit.capitalize()} Sentiment (Model {model_name.capitalize()})')

  if (save2file == True):
    # Save graph to file.
    plot_filename = f'plot_hist_{text_unit}_{model_name}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}')

  return

In [None]:
# Raw Plot of Section Sentiments (Not scaled by mid-Section Sentence No to match Sentence/Paragraph x-axes)

def plot_raw_sections(ts_df='corpus_sents_df', model_name='vader', semantic_type='sentence', save2file=False):
  '''
  Given a DataFrame (or the name of a global DataFrame), a model_name column and a semantic_type
  Plot the raw sentiment values of that column against the 'sect_no' column
  Options to save2file

  ts_df must contain the columns 'sect_no' and model_name; semantic_type is only
  used in the plot title.
  Return None. A DataFrame name that is not defined prints an error and plots nothing.
  '''

  if isinstance(ts_df, str):
    # The default is a DataFrame *name*; resolve it instead of handing the
    # string itself to seaborn as data
    df_name = ts_df
    ts_df = globals().get(df_name)
    if ts_df is None:
      print(f'ERROR: no DataFrame named {df_name} is defined')
      return

  sns.lineplot(data=ts_df, x='sect_no', y=model_name, alpha=0.5).set_title(f'{CORPUS_FULL} \n Plot {semantic_type} Sentiment (Raw {model_name.capitalize()})')

  if save2file == True:
    # Save graph to file.
    plot_filename = f'plot_nostand_sects_{model_name}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}')

  return

# Test
# plot_raw_sections(ts_df=corpus_sects_df, model_name='pattern', semantic_type='section', save2file=False);

In [None]:
# Raw Plot of Sentiments for any text unit (sentence, paragraph, section or chapter), using the matching global DataFrame

def plot_raw_sentiments(model_name='vader', semantic_type='sentence', save2file=False):
  '''
  Given a model_name column and a semantic_type
  ('sentence', 'paragraph', 'section'/'section_stand' or 'chapter'/'chapter_stand')
  Plot the raw sentiment values from the global corpus DataFrame of that type
  against its sent_no/parag_no/sect_no/chap_no column
  Options to save2file

  Return None. An unsupported semantic_type prints an error and plots nothing.
  '''

  # The globals are looked up lazily so only the requested DataFrame has to exist
  if semantic_type == 'sentence':
    ts_df = corpus_sents_df
    x_units = 'sent_no'
  elif semantic_type == 'paragraph':
    ts_df = corpus_parags_df
    x_units = 'parag_no'
  elif semantic_type in ('section', 'section_stand'):
    ts_df = corpus_sects_df
    x_units = 'sect_no'
  elif semantic_type in ('chapter', 'chapter_stand'):
    ts_df = corpus_chaps_df
    x_units = 'chap_no'
  else:
    print(f'ERROR: {semantic_type} must be sentence, paragraph, section or chapter')
    # Falling through used to fail with an UnboundLocalError on ts_df
    return

  sns.lineplot(data=ts_df, x=x_units, y=model_name, alpha=0.5, label=model_name).set_title(f'{CORPUS_FULL} \n Plot {semantic_type} Sentiment (Raw {model_name.capitalize()})')

  plt.legend(loc='best')

  if save2file == True:
    # Save graph to file.
    plot_filename = f'plot_raw_sentiments_{semantic_type}_{model_name}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}')

  return

# Test
# plot_raw_sentiments(model_name='pattern', semantic_type='section', save2file=False);

In [None]:
# TODO: must plot in order to save, cannot save without first plotting

def get_lowess(ts_df='corpus_parags_df', models_ls=MODELS_LS, text_unit='paragraph', plot_subtitle='', alabel='', afrac=1./10, ait=5, alpha=0.5, do_plot=True, save2file=False):
  '''
  Given a DataFrame, list of columns to smooth, LOWESS params fraction and iterations,
  Return a DataFrame with LOWESS values
  If 'do_plot=True', also output plot

  Args:
    ts_df: DataFrame holding one column per model in models_ls.
      NOTE(review): the default is a string placeholder; indexing it by column fails,
      so callers presumably always pass a DataFrame - confirm.
    models_ls: names of the ts_df columns to smooth
    text_unit: text unit name used in the plot title and the saved filename
    plot_subtitle: free text for the plot title; its first word (lower-cased) goes into the filename
    alabel: legend label applied to every curve; when '' each curve is labelled with its column name
    afrac, ait: passed to sm_lowess as frac and it
    alpha: line transparency of the plotted curves
    do_plot: plot every smoothed curve and add title/legend
    save2file: save the current figure as a 300 dpi PNG in the working directory

  Returns:
    DataFrame with one '<col>_lowess' column per model plus a 'median' column
    holding the row-wise median of those smoothed columns.
  '''

  # global corpus_all_df

  lowess_df = pd.DataFrame()

  # Step 1: Calculate LOWESS smoothed values
  for acol in models_ls:
    sm_x, sm_y = sm_lowess(endog=ts_df[acol].values, exog=ts_df.index.values, frac=afrac, it=ait, return_sorted=True).T
    lowess_df[f'{acol}_lowess'] = pd.Series(sm_y)
    # Optionally plot LOWESS for all models
    if do_plot:
      # Resolve the label per curve; the caller's alabel is left untouched so a blank
      # alabel keeps falling back to each column's own name.
      curve_label = alabel if alabel != '' else acol
      plt.plot(sm_x, sm_y, label=curve_label, alpha=alpha, linewidth=2)

  lowess_df['median'] = lowess_df.median(axis=1) # sm_y # corpus_all_df[df_cols_ls].median(axis=1)

  # Step 2: Optionally add title and legend
  if do_plot:
    # sm_x, sm_y = sm_lowess(endog=lowess_df.median, exog=lowess_df.index.values,  frac=afrac, it=ait, return_sorted = True).T
    # plt.plot(sm_x, sm_y, label='median', alpha=0.9, linewidth=2, color='black')

    frac_str = str(round(100*afrac))
    plt.title(f'{CORPUS_FULL} \n {plot_subtitle} {text_unit} Standardized Sentiment Smoothed with LOWESS (frac={frac_str})')
    plt.legend(title='Sentiment Model')

  # Step 3: Optionally save to file
  if save2file:
    # Save Plot to file.
    # An empty/blank subtitle has no first word; use '' rather than raising IndexError.
    subtitle_words_ls = plot_subtitle.split()
    subtitle_tag = subtitle_words_ls[0].lower() if subtitle_words_ls else ''
    plot_filename = f'plot_{text_unit}_lowess_{subtitle_tag}_{author_str}_{title_str}.png'
    # plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plot_filename, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}');


  return lowess_df

# Test
'''
new_lowess_col = f'{sa_model}_lowess'
my_frac = 1./10
my_frac_per = round(100*my_frac)
new_lowess_col = f'{sa_model}_lowess_{my_frac_per}'
corpus_all_df[new_lowess_col] = plot_lowess(corpus_all_df, [sa_model], afrac=my_frac)
corpus_all_df.head()
''';

In [None]:
def get_sent2dets(sent_no):
  '''
  Given a Sentence Number
  Return the corresponding Paragraph, Section and Chapter Numbers that contain it

  Reads the global DataFrames corpus_sents_df ('sent_no', 'parag_no'),
  corpus_sects_df ('sect_no', 'sent_no_start') and corpus_chaps_df ('chap_no', 'sent_no_start').

  Returns:
    Tuple (sent_parag_no, sent_sect_no, sent_chap_no). The Section / Chapter entry is
    None when sent_no lies before the first Section / Chapter start.

  Raises:
    ValueError: if sent_no is not present in corpus_sents_df
  '''

  def _containing_unit(units_df, unit_col):
    # Walk the units in DataFrame order and keep the last one whose first Sentence
    # is not past sent_no (assumes rows are ordered by 'sent_no_start' - TODO confirm)
    unit_no = None
    for aunit_no, asent_start in zip(units_df[unit_col], units_df['sent_no_start']):
      if int(asent_start) > sent_no:
        break
      unit_no = aunit_no
    return unit_no

  # Get Paragraph No containing given Sentence No
  parag_matches = corpus_sents_df.loc[corpus_sents_df['sent_no'] == sent_no, 'parag_no']
  if parag_matches.empty:
    raise ValueError(f'Sentence #{sent_no} not found in corpus_sents_df')
  # .iloc[0] instead of int(Series): converting a Series directly to int is deprecated
  sent_parag_no = int(parag_matches.iloc[0])

  # Get Section No containing given Sentence No.
  sent_sect_no = _containing_unit(corpus_sects_df, 'sect_no')

  # Get Chapter No containing given Sentence No.
  sent_chap_no = _containing_unit(corpus_chaps_df, 'chap_no')

  return sent_parag_no, sent_sect_no, sent_chap_no

# Test
# sent_parag_no, sent_sect_no, sent_chap_no = get_sent2dets(1408)
# print(f'sent_parag_no={sent_parag_no}\nsent_sect_no={sent_sect_no}\nsent_chap_no={sent_chap_no}')

In [None]:
# get_sentnocontext_report(the_sent_no=sent_no, the_n_sideparags=n_sideparags, the_sent_highlight=sentence_highlight)

In [None]:
def get_sentnocontext(sent_no=1, n_sideparags=1, sent_highlight=True):
  '''
  Given a sentence number in the Corpus
  Return the containing paragraph and n-paragraphs on either side
  (e.g. if n=2, return 2+1+2=5 paragraphs)

  Reads the global DataFrames corpus_sents_df ('sent_no', 'parag_no', 'sent_raw')
  and corpus_parags_df ('parag_no', 'parag_raw').

  Args:
    sent_no: Sentence No to look up
    n_sideparags: number of paragraphs to include on each side of the containing paragraph
    sent_highlight: if truthy, the sentence is UPPER-CASED inside its paragraph

  Returns:
    List of raw paragraph strings. Fewer than 2*n+1 are returned when the window
    would extend past the start or end of the Corpus.
  '''

  sent_rows_df = corpus_sents_df[corpus_sents_df['sent_no'] == sent_no]
  # .iloc[0] instead of int(Series): converting a Series directly to int is deprecated
  parag_target_no = int(sent_rows_df['parag_no'].iloc[0])
  # print(f'parag_target_no = {parag_target_no} and type: {type(parag_target_no)}')

  if n_sideparags == 0:
    parags_context_ls = list(corpus_parags_df[corpus_parags_df['parag_no'] == parag_target_no]['parag_raw'])
    target_idx = 0

  else:
    # The window is sliced by position, i.e. parag_no is used as a row position
    # (assumes parag_no matches the row order of corpus_parags_df - TODO confirm).
    # Clamp at 0: a negative start would make iloc count from the END of the DataFrame.
    parag_start = max(0, parag_target_no - n_sideparags)
    parag_end = parag_target_no + n_sideparags + 1
    parags_context_ls = list(corpus_parags_df.iloc[parag_start:parag_end]['parag_raw'])
    # Position of the containing paragraph inside the (possibly clamped) window
    target_idx = parag_target_no - parag_start


  if sent_highlight and target_idx < len(parags_context_ls):
    parag_match_str = str(parags_context_ls[target_idx])
    # print(f'parag_match_str:\n  {parag_match_str}')
    sent_str = sent_rows_df['sent_raw'].values[0]
    # print(f'sent_str:\n  {sent_str}')
    parags_context_ls[target_idx] = parag_match_str.replace(sent_str, sent_str.upper())

  return parags_context_ls

# Test
# context_highlighted = get_sentnocontext(sent_no=1051, n_sideparags=1)
# print(context_highlighted)

In [None]:
def get_sentnocontext_report(the_sent_no=7, the_n_sideparags=1, the_sent_highlight=True):
  '''
  Wrapper function around  get_sentnocontext()
  Prints a nicely formatted context report

  Args:
    the_sent_no: Sentence No of the Crux Point to report on
    the_n_sideparags: number of context paragraphs requested on each side
    the_sent_highlight: passed through to get_sentnocontext() as sent_highlight

  Prints the raw Crux sentence, the Paragraph/Section/Chapter numbers returned by
  get_sent2dets() and the context paragraphs; the paragraph holding the Crux
  sentence is prefixed with '<*>'. Returns None.
  '''

  context_noparags = the_n_sideparags*2+1

  # print('-------------------------------------------------------------')
  print(f'The {context_noparags} Paragraph(s) Context around the Sentence #{the_sent_no} Crux Point:')
  print('-------------------------------------------------------------')
  crux_sent_raw = str(corpus_sents_df[corpus_sents_df['sent_no'] == the_sent_no]['sent_raw'].values[0])
  print(f"\nCrux Sentence #{the_sent_no} Raw Text: -------------------------------\n\n    {crux_sent_raw}\n") # iloc[the_sent_no]['sent_raw']}")

  sent_parag_no, sent_sect_no, sent_chap_no = get_sent2dets(the_sent_no)
  print(f"\nCrux Sentence #{the_sent_no} is Contained in: ---------------------------\n\n    Paragraph #{sent_parag_no}\n      Section #{sent_sect_no}\n      Chapter #{sent_chap_no}\n")

  print(f"\n{context_noparags} Paragraph(s) Context: ------------------------------")
  context_parags_ls = get_sentnocontext(sent_no=the_sent_no, n_sideparags=the_n_sideparags, sent_highlight=the_sent_highlight)

  # The Crux paragraph sits the_n_sideparags entries into the window unless the window
  # was cut short at the start of the Corpus; the list midpoint is wrong whenever the
  # window is truncated at either end.
  # (assumes parag_no is the paragraph's row position, as get_sentnocontext() does - TODO confirm)
  crux_idx = min(the_n_sideparags, sent_parag_no)
  for i, aparag in enumerate(context_parags_ls):
    if i == crux_idx:
      # print(f'\n>>> Paragraph #{i}: <<< Crux Point Sentence CAPITALIZED within this Paragraph\n\n    {aparag}')
      print(f'\n<*> {aparag}')
    else:
      # print(f'\n    Paragraph #{i}:\n\n    {aparag}')
      print(f'\n    {aparag}')

  return

# Test
# get_sentnocontext_report(the_sent_no=1051, the_n_sideparags=1, the_sent_highlight=True)

In [None]:
def get_section_timeseries(sect_no):
  '''
  Given a Section No in the current Corpus
  Return the Sentence and Paragraph DataFrames covering that Section

  Reads the global DataFrames corpus_sects_df ('sect_no', 'sent_no_start'),
  corpus_sents_df ('sent_no', 'parag_no') and corpus_parags_df. Rows are selected
  with iloc, i.e. Sentence/Paragraph numbers are used as row positions
  (assumes sent_no / parag_no match the row order - TODO confirm).

  Args:
    sect_no: Section No to extract, expected between 0 and (number of Sections - 1)

  Returns:
    Tuple (section_sents_df, section_parags_df), or -1 (after printing an error)
    when sect_no is out of range.
  '''

  section_count = corpus_sects_df.shape[0]

  # Validate the requested Section (the parameter, not the notebook-level form value)
  if (sect_no < 0) or (sect_no >= section_count):
    print(f'ERROR: You picked Section #{sect_no}.\n  Section for this Corpus must be between 0 and {section_count-1}')
    return -1

  def _sect_sent_start(asect_no):
    # First Sentence No of a Section; .iloc[0] avoids the deprecated int(array) conversion
    return int(corpus_sects_df.loc[corpus_sects_df['sect_no'] == asect_no, 'sent_no_start'].iloc[0])

  def _sent_parag_no(asent_no):
    # Paragraph No that contains a Sentence
    return int(corpus_sents_df.loc[corpus_sents_df['sent_no'] == asent_no, 'parag_no'].iloc[0])

  # Compute the start and (exclusive) end Sentence numbers for the selected Section
  sect_sent_start = _sect_sent_start(sect_no)
  # sect_sent_mid = int(corpus_sects_df[corpus_sects_df['sect_no'] == sect_no]['sent_no_mid'].values)

  if sect_no == (section_count-1):
    print(f'You selected the last Section of this Corpus')
    # Exclusive end one past the final Sentence, so the last Sentence is kept by the slice below
    sect_sent_end = corpus_sents_df.shape[0]
  else:
    # The next Section's first Sentence is this Section's exclusive end
    sect_sent_end = _sect_sent_start(sect_no+1)

  print(f'Section #{sect_no}:----------')
  print(f'\nsect_sent_start: {sect_sent_start}')
  # print(f'sect_sent_mid: {sect_sent_mid}')
  print(f'sect_sent_end: {sect_sent_end}')


  # Compute the start and (exclusive) end Paragraph numbers for the selected Section
  sect_parag_start = _sent_parag_no(sect_sent_start)
  # One past the Paragraph of the Section's last Sentence, so that Paragraph is included
  sect_parag_end = _sent_parag_no(sect_sent_end - 1) + 1

  print(f'\nsect_parag_start: {sect_parag_start}')
  print(f'sect_parag_end: {sect_parag_end}')


  # Extract and Return both a Sentence and Paragraph DataFrame for this Section

  section_sents_df = corpus_sents_df.iloc[sect_sent_start:sect_sent_end]

  section_parags_df = corpus_parags_df.iloc[sect_parag_start:sect_parag_end]


  return section_sents_df, section_parags_df

# Test

# section_sents_df, section_parags_df = get_section_timeseries(Select_Section_No)

# section_sents_df.head()

# print(f'\nsection_sents_df.shape: {section_sents_df.shape}')
# print(f'section_parags_df.shape: {section_parags_df.shape}')

In [None]:
"""


def get_crux_points(col_series, semantic_type='sentence', win_lowess=5, do_plot=True, save2file=False):
  '''
  Given a DataFrame and a Time Series Column within it and a LOWESS window
  Return a list of Min/Max Crux Point (x,y) coordinate tuples for that Column Time Series
  '''

  crux_ls = []

  if semantic_type == 'sentence':
    ts_df = corpus_sents_df
    x_units = 'sent_no'
  elif semantic_type == 'paragraph':
    ts_df = corpus_parags_df
    x_units = 'parag_no'
  elif (semantic_type == 'section') | (semantic_type == 'section_stand'):
    ts_df = corpus_sects_df
    x_units = 'sect_no'
  elif (semantic_type == 'chapter') | (semantic_type == 'chapter_stand'):
    ts_df = corpus_chaps_df
    x_units = 'chap_no'
    
  else:
    print(f'ERROR: {semantic_type} must be sentence, paragraph or section')



  series_len = ts_df.shape[0]

  series_no_min = ts_df[x_units].min()
  seires_no_max = ts_df[x_units].max()

  sm_x = ts_df.index.values
  sm_y = ts_df[col_series].values

  half_win = int((win_lowess/100)*series_len)

  # Find peaks(max).
  # peak_indexes = signal.argrelextrema(sm_y, np.greater, order=half_win, mode='wrap') argrelextrema will not detect flat peaks
  peak_indexes = signal.find_peaks(sm_y, distance=half_win) # np.greater, order=half_win, mode='wrap')
  peak_indexes = peak_indexes[0]

  peak_x_ls = list(peak_indexes)
  peak_y_ls = list(sm_y[peak_indexes])

  # Find valleys(min).
  # valley_indexes = signal.argrelextrema(sm_y, np.less, order=half_win, mode='clip')
  valley_indexes = signal.find_peaks(-sm_y, distance=half_win) # np.less, order=half_win, mode='clip')
  valley_indexes = valley_indexes[0]
  
  valley_x_ls = list(valley_indexes)
  valley_y_ls = list(sm_y[valley_indexes])

  # Save all peaks/valleys as list of (x,y) coordinate tuples
  print(f'type peak_x_ls is: {type(peak_x_ls)}')
  x_all_ls = peak_x_ls + valley_x_ls
  y_all_ls = peak_y_ls + valley_y_ls
  crux_coord_ls = tuple(zip(x_all_ls, y_all_ls)) 

  print(f'Original Series length={series_len} vs LOWESS Series length={len(x_all_ls)}')


  if do_plot == True:
    # Plot main graph.
    (fig, ax) = plt.subplots()
    ax.plot(sm_x, sm_y)

    win_half = 0 # 2500

    # Plot peaks.
    # ax.plot(peak_x + win_half, peak_y, marker='o', linestyle='none', color='green', label="Peaks")
    ax.scatter(peak_x_ls, peak_y_ls)
    for i, txt in enumerate(list(peak_x_ls)):
        ax.annotate(f'  Sent #{txt}', (peak_x_ls[i], peak_y_ls[i]), rotation=90, annotation_clip=True)

    # Plot valleys.
    # ax.plot(valley_x + win_half, valley_y, marker='o', linestyle='none', color='red', label="Valleys")
    ax.scatter(valley_x_ls, valley_y_ls)
    for i, txt in enumerate(list(valley_x_ls)):
        ax.annotate(f'Sent #{txt}', (valley_x_ls[i], valley_y_ls[i]), rotation=270, xytext=(valley_x_ls[i], valley_y_ls[i]-4))

    # for i, txt in enumerate(list(valley_x_ls)):
    #     ax.annotate(f'\n\n\nSent No.\n   {txt}', (valley_x_ls[i], valley_y_ls[i]))
    # plt.plot(x, y, 'bo')
    # texts = [plt.text(valley_x_ls[i], valley_y_ls[i], 'Sent No.\n   %s' %valley_x_ls[i], ha='right', va='top') for i in range(len(valley_x_ls))]
    # adjust_text(texts)

    # Confidence Interval (Min/Max Range)
    # plt.fill_between(sentiment_lowess_df['x_value'], sentiment_lowess_df['min'], sentiment_lowess_df['max'], alpha=.3, color='lightskyblue')

    plt.title(f'{CORPUS_FULL}\nRaw Sentence Sentiments with selected Section #{Select_Section_No}')
    plt.xlabel(f'Sentence No within selected Section #{Select_Section_No}')

    # locs, labels = xticks()  # Get the current locations and labels.
    plt.xticks(np.arange(sent_no_min, sent_no_max, step=10))  # Set label locations.

    plt.ylabel(f'Sentiment Value')
    plt.legend(loc='best');
  
  if save2file == True:
    # Save graph to file.
    plt.title(f'{BOOK_TITLE_FULL} \n LOWESS Smoothed Median Sentiment Curve with Crux Points via SciPy.argrelextrema')
    plt.legend(loc='best')
    plt.savefig('argrelextrema.png')

  return crux_coord_ls


  # if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.lineplot(data=ts_df, x=x_units, y=model_name, alpha=0.5, label=model_name).set_title(f'{CORPUS_FULL} \n Plot {semantic_type} Sentiment (Raw {model_name.capitalize()})')
  
  plt.legend(loc='best')

  if save2file == True:
    # Save graph to file.
    plot_filename = f'plot_raw_sentiments_{semantic_type}_{model_name}.png'
    plotpathfilename_str = gen_pathfiletime(plot_filename)
    plt.savefig(plotpathfilename_str, format='png', dpi=300)
    print(f'Plot saved: {plot_filename}');

  return



""";

In [None]:
"""
def get_lowess_cruxes(ts_df, col_series, text_type='sentence', win_lowess=5, sec_y_height=0, subtitle_str=' ', do_plot=True, save2file=False):
  '''
  Given a DataFrame and a Time Series Column within it and a LOWESS window
  Return a list of Min/Max Crux Point (x,y) coordinate tuples for that Column Time Series
  '''

  crux_ls = []

  series_len = ts_df.shape[0]

  sent_no_min = ts_df.sent_no.min()
  sent_no_max = ts_df.sent_no.max()
  # print(f'sent_no_min {sent_no_min}')

  sm_x = ts_df.index.values
  sm_y = ts_df[col_series].values

  half_win = int((win_lowess/100)*series_len)

  # Find peaks(max).
  # peak_indexes = signal.argrelextrema(sm_y, np.greater, order=half_win, mode='wrap') argrelextrema will not detect flat peaks
  peak_indexes = signal.find_peaks(sm_y, distance=half_win) # np.greater, order=half_win, mode='wrap')
  # peak_indexes = peak_indexes + sent_no_min
  # print(f'peak_indexes[0]: {peak_indexes_np[0]}')
  # print(f'peak_indexes type: {type(peak_indexes_np[0])}')
  # peak_indexes_np = peak_indexes_np + sent_no_min
  peak_indexes = peak_indexes[0]

  peak_x_ls = list(peak_indexes)
  peak_y_ls = list(sm_y[peak_indexes])

  # Find valleys(min).
  # valley_indexes = signal.argrelextrema(sm_y, np.less, order=half_win, mode='clip')
  valley_indexes = signal.find_peaks(-sm_y, distance=half_win)
  valley_indexes = valley_indexes[0]
  
  valley_x_ls = list(valley_indexes)
  valley_y_ls = list(sm_y[valley_indexes])

  # Save all peaks/valleys as list of (x,y) coordinate tuples
  # print(f'type peak_x_ls is: {type(peak_x_ls)}')
  x_all_ls = peak_x_ls + valley_x_ls
  # readjust starting Sentence No to start with first sentence in segement window
  x_all_ls = [x+sent_no_min for x in x_all_ls]
  y_all_ls = peak_y_ls + valley_y_ls
  crux_coord_ls = tuple(zip(x_all_ls, y_all_ls)) 

  # print(f'Original Series length={series_len} vs LOWESS Series length={len(x_all_ls)}')


  if do_plot == True:
    # Plot main graph.
    (fig, ax) = plt.subplots()
    ax.plot(sm_x, sm_y)

    if text_type == 'sentence':
      paragraph_boundries_ls = list(section_sents_df['parag_no'].unique())
      for i, aparag in enumerate(paragraph_boundries_ls):
        if i%5 == 0:
          # Plot every 5th paragraph
          sent_no = section_sents_df[section_sents_df['parag_no'] == aparag]['sent_no'].min()
          plt.text(sent_no, sec_y_height, f'Paragraph #{aparag}', alpha=0.2, rotation=90)
          plt.axvline(sent_no, color='blue', alpha=0.1)
    elif text_type == 'paragraph':
      paragraph_boundries_ls = list(section_sents_df['parag_no'].unique())
      for i, aparag_no in enumerate(paragraph_boundries_ls):
        if i%5 == 0:
          # Plot every 5th paragraph
          sent_no = section_sents_df[section_sents_df['parag_no'] == aparag]['sent_no'].min()
          plt.text(aparag_no, sec_y_height, f'Paragraph #{aparag_no}', alpha=0.2, rotation=90)
          plt.axvline(aparag_no, color='blue', alpha=0.1)    
    else:
      print(f"ERROR: text_type is {text_type} but must be either 'sentence' or 'paragarph'")

    win_half = 0 # 2500

    # Plot peaks.
    # ax.plot(peak_x + win_half, peak_y, marker='o', linestyle='none', color='green', label="Peaks")

    # readjust starting Sentence No to start with first sentence in segement window
    peak_x_ls = [x+sent_no_min for x in peak_x_ls]
    ax.scatter(peak_x_ls, peak_y_ls)
    for i, txt in enumerate(list(peak_x_ls)):
        ax.annotate(f'  Sent #{txt}', (peak_x_ls[i], peak_y_ls[i]), rotation=90, annotation_clip=True)

    # Plot valleys.
    # ax.plot(valley_x + win_half, valley_y, marker='o', linestyle='none', color='red', label="Valleys")
    # readjust starting Sentence No to start with first sentence in segement window
    valley_x_ls = [x+sent_no_min for x in valley_x_ls]
    ax.scatter(valley_x_ls, valley_y_ls)
    for i, txt in enumerate(list(valley_x_ls)):
        ax.annotate(f'Sent #{txt}', (valley_x_ls[i], valley_y_ls[i]), rotation=270, xytext=(valley_x_ls[i], valley_y_ls[i]-4))

    # for i, txt in enumerate(list(valley_x_ls)):
    #     ax.annotate(f'\n\n\nSent No.\n   {txt}', (valley_x_ls[i], valley_y_ls[i]))
    # plt.plot(x, y, 'bo')
    # texts = [plt.text(valley_x_ls[i], valley_y_ls[i], 'Sent No.\n   %s' %valley_x_ls[i], ha='right', va='top') for i in range(len(valley_x_ls))]
    # adjust_text(texts)

    # Confidence Interval (Min/Max Range)
    # plt.fill_between(sentiment_lowess_df['x_value'], sentiment_lowess_df['min'], sentiment_lowess_df['max'], alpha=.3, color='lightskyblue')

    plt.title(f'{CORPUS_FULL} Raw Sentence Crux Detection in Section #{Select_Section_No}\nLOWESS Smoothed {subtitle_str} and SciPy find_peaks')
    plt.xlabel(f'Sentence No within selected Section #{Select_Section_No}')

    # locs, labels = xticks()  # Get the current locations and labels.
    # plt.xticks(np.arange(sent_no_min, sent_no_max, step=10))  # Set label locations.

    plt.ylabel(f'Sentiment Value')
    plt.legend(loc='best');
  
  if save2file == True:
    # Save graph to file.
    plt.title(f'{BOOK_TITLE_FULL} \n LOWESS Smoothed Median Sentiment Curve with Crux Points via SciPy.argrelextrema')
    plt.legend(loc='best')
    plt.savefig('argrelextrema.png')

  return crux_coord_ls
  """

In [None]:
"""
def get_crux_points(ts_df, col_series, text_type='sentence', win_per=5, sec_y_height=0, subtitle_str=' ', do_plot=True, save2file=False):
  '''
  Given a DataFrame and a Time Series Column within it and a LOWESS window
  Return a list of Min/Max Crux Point (x,y) coordinate tuples for that Column Time Series
  '''

  crux_ls = []

  series_len = ts_df.shape[0]

  sent_no_min = ts_df.sent_no.min()
  sent_no_max = ts_df.sent_no.max()
  # print(f'sent_no_min {sent_no_min}')

  sm_x = ts_df.index.values
  sm_y = ts_df[col_series].values

  half_win = int((win_per/100)*series_len)

  # Find peaks(max).
  # peak_indexes = signal.argrelextrema(sm_y, np.greater, order=half_win, mode='wrap') argrelextrema will not detect flat peaks
  peak_indexes = signal.find_peaks(sm_y, distance=half_win) # np.greater, order=half_win, mode='wrap')
  # peak_indexes = peak_indexes + sent_no_min
  # print(f'peak_indexes[0]: {peak_indexes_np[0]}')
  # print(f'peak_indexes type: {type(peak_indexes_np[0])}')
  # peak_indexes_np = peak_indexes_np + sent_no_min
  peak_indexes = peak_indexes[0]

  peak_x_ls = list(peak_indexes)
  peak_y_ls = list(sm_y[peak_indexes])

  # Find valleys(min).
  # valley_indexes = signal.argrelextrema(sm_y, np.less, order=half_win, mode='clip')
  valley_indexes = signal.find_peaks(-sm_y, distance=half_win)
  valley_indexes = valley_indexes[0]
  
  valley_x_ls = list(valley_indexes)
  valley_y_ls = list(sm_y[valley_indexes])

  # Save all peaks/valleys as list of (x,y) coordinate tuples
  # print(f'type peak_x_ls is: {type(peak_x_ls)}')
  x_all_ls = peak_x_ls + valley_x_ls
  # readjust starting Sentence No to start with first sentence in segement window
  x_all_ls = [x+sent_no_min for x in x_all_ls]
  y_all_ls = peak_y_ls + valley_y_ls
  crux_coord_ls = tuple(zip(x_all_ls, y_all_ls)) 

  # print(f'Original Series length={series_len} vs LOWESS Series length={len(x_all_ls)}')


  if do_plot == True:
    # Plot main graph.
    (fig, ax) = plt.subplots()
    ax.plot(sm_x, sm_y)

    if text_type == 'sentence':
      paragraph_boundries_ls = list(section_sents_df['parag_no'].unique())
      for i, aparag in enumerate(paragraph_boundries_ls):
        if i%5 == 0:
          # Plot every 5th paragraph
          sent_no = section_sents_df[section_sents_df['parag_no'] == aparag]['sent_no'].min()
          plt.text(sent_no, sec_y_height, f'Paragraph #{aparag}', alpha=0.2, rotation=90)
          plt.axvline(sent_no, color='blue', alpha=0.1)
    elif text_type == 'paragraph':
      paragraph_boundries_ls = list(section_sents_df['parag_no'].unique())
      for i, aparag_no in enumerate(paragraph_boundries_ls):
        if i%5 == 0:
          # Plot every 5th paragraph
          sent_no = section_sents_df[section_sents_df['parag_no'] == aparag]['sent_no'].min()
          plt.text(aparag_no, sec_y_height, f'Paragraph #{aparag_no}', alpha=0.2, rotation=90)
          plt.axvline(aparag_no, color='blue', alpha=0.1)    
    else:
      print(f"ERROR: text_type is {text_type} but must be either 'sentence' or 'paragarph'")

    win_half = 0 # 2500

    # Plot peaks.
    # ax.plot(peak_x + win_half, peak_y, marker='o', linestyle='none', color='green', label="Peaks")

    # readjust starting Sentence No to start with first sentence in segement window
    peak_x_ls = [x+sent_no_min for x in peak_x_ls]
    ax.scatter(peak_x_ls, peak_y_ls)
    for i, txt in enumerate(list(peak_x_ls)):
        ax.annotate(f'  Sent #{txt}', (peak_x_ls[i], peak_y_ls[i]), rotation=90, annotation_clip=True)

    # Plot valleys.
    # ax.plot(valley_x + win_half, valley_y, marker='o', linestyle='none', color='red', label="Valleys")
    # readjust starting Sentence No to start with first sentence in segement window
    valley_x_ls = [x+sent_no_min for x in valley_x_ls]
    ax.scatter(valley_x_ls, valley_y_ls)
    for i, txt in enumerate(list(valley_x_ls)):
        ax.annotate(f'Sent #{txt}', (valley_x_ls[i], valley_y_ls[i]), rotation=270, xytext=(valley_x_ls[i], valley_y_ls[i]-4))

    # for i, txt in enumerate(list(valley_x_ls)):
    #     ax.annotate(f'\n\n\nSent No.\n   {txt}', (valley_x_ls[i], valley_y_ls[i]))
    # plt.plot(x, y, 'bo')
    # texts = [plt.text(valley_x_ls[i], valley_y_ls[i], 'Sent No.\n   %s' %valley_x_ls[i], ha='right', va='top') for i in range(len(valley_x_ls))]
    # adjust_text(texts)

    # Confidence Interval (Min/Max Range)
    # plt.fill_between(sentiment_lowess_df['x_value'], sentiment_lowess_df['min'], sentiment_lowess_df['max'], alpha=.3, color='lightskyblue')

    plt.title(f'{CORPUS_FULL} Raw Sentence Crux Detection in Section #{Select_Section_No}\nLOWESS Smoothed {subtitle_str} and SciPy find_peaks')
    plt.xlabel(f'Sentence No within selected Section #{Select_Section_No}')

    # locs, labels = xticks()  # Get the current locations and labels.
    # plt.xticks(np.arange(sent_no_min, sent_no_max, step=10))  # Set label locations.

    plt.ylabel(f'Sentiment Value')
    plt.legend(loc='best');
  
  if save2file == True:
    # Save graph to file.
    plt.title(f'{BOOK_TITLE_FULL} \n LOWESS Smoothed Median Sentiment Curve with Crux Points via SciPy.argrelextrema')
    plt.legend(loc='best')
    plt.savefig('argrelextrema.png')

  return crux_coord_ls
""";

In [None]:
def crux_sortsents(crux_ls, atop_n=3, get_peaks=True, sort_key='sentiment_val'):
  '''
  Rank Crux Points and attach the raw Sentence text to each one kept.

  Args:
    crux_ls: iterable of (sent_no, sentiment value) pairs
    atop_n: number of leading entries to keep after sorting
    get_peaks: when sorting by sentiment value, True sorts descending, False ascending
    sort_key: 'sent_no' sorts ascending by Sentence No (get_peaks is then ignored);
      any other value sorts by sentiment value

  Returns:
    List of tuples (sent_no as int, sentiment value formatted to 3 decimals as float,
    raw Sentence text looked up in the global corpus_sents_df)
  '''
  # print(f'Entered crux_sortsents with crux_ls={crux_ls}\natop_n={atop_n}')

  def _with_raw_text(sent_id, score):
    # Fetch the raw text of the first Sentence row matching sent_id, then build the record
    raw_hits = corpus_sents_df[corpus_sents_df['sent_no'] == sent_id]['sent_raw'].values
    raw_txt = str(raw_hits[0])
    return (int(sent_id), float(f'{score:.3f}'), str(raw_txt),)

  # Order the pairs by the requested key
  if sort_key == 'sent_no':
    ranked_ls = sorted(crux_ls, key=lambda pair: pair[0])
  else:
    ranked_ls = sorted(crux_ls, key=lambda pair: pair[1], reverse=get_peaks)

  # Keep the leading atop_n entries only when at least that many were supplied
  kept_ls = ranked_ls[:atop_n] if len(ranked_ls) >= atop_n else ranked_ls

  return [_with_raw_text(sent_id, score) for sent_id, score in kept_ls]

# Test
# crux_n_top_ls = crux_sortsents(section_crux_ls, atop_n=3, get_peaks=True)

In [None]:
def crux_sortsents_report(crux_ls, library_type='baseline', top_n=3, get_peaks=True, sort_by='sentiment_val', n_sideparags=1, sentence_highlight=True):
  '''
  Wrapper function to produce a report based upon crux_sortsents()

  Args:
    crux_ls: iterable of (crux_type, sent_no, sentiment value) tuples, where crux_type
      is 'peak' or 'valley' (compared case-insensitively)
    library_type: when one of 'baseline'/'baselines', 'sentimentr', 'syuzhetr' or
      'transformers' the header names the Library; any other value makes the header
      name the Section held in the global Select_Section_No
    top_n: number of Cruxes requested
    get_peaks: True reports Peaks, False reports Valleys
    sort_by: passed to crux_sortsents() as sort_key ('sent_no' or sentiment value)
    n_sideparags: context paragraphs on each side, passed to get_sentnocontext_report()
    sentence_highlight: passed to get_sentnocontext_report() as the_sent_highlight

  Prints a warning if fewer Cruxes than requested exist, a header, a one-line summary
  per Crux and a full context report per Crux. Returns None.
  '''

  crux_label = 'Peak' if get_peaks else 'Valley'

  # Filter and keep only the desired crux type as (sent_no, sentiment value) pairs
  crux_subset_ls = [(crux_x_coord, crux_y_coord)
                    for crux_type, crux_x_coord, crux_y_coord in crux_ls
                    if crux_type.lower() == crux_label.lower()]

  # Check to see if asked for more Cruxes than were found
  top_n_found = len(crux_subset_ls)
  flag_2few_cruxes = top_n_found < top_n
  if flag_2few_cruxes:
    print(f'\n\nWARNING: You asked for {top_n} {crux_label}s\n         but there only {top_n_found} were found above.\n')
    print(f'             Displaying as many {crux_label}s as possible,')
    print(f'             to retrieve more, go back to the previous code cells and re-run with wider Crux Window.\n\n')


  # Get Sentence no and raw text for appropriate Crux subset
  # print(f'Calling crux_n_top_ls with crux_subset_ls={crux_subset_ls}\ntop_n={top_n}\nget_peaks={get_peaks}')
  crux_n_top_ls = crux_sortsents(crux_ls=crux_subset_ls, atop_n=top_n, get_peaks=get_peaks, sort_key=sort_by)
  # print(f'Returning crux_n_top_ls = {crux_n_top_ls}')

  # Number of Cruxes actually listed below (can be smaller than the number found)
  top_n_shown = len(crux_n_top_ls)
  all_requested_shown = (sort_by != 'sent_no') and (not flag_2few_cruxes)

  # Print appropriate header
  print('------------------------------')
  # print(f'library_type: {library_type}')
  # 'baseline' is this function's default spelling; 'baselines' is kept for existing callers
  if library_type in ['baseline','baselines','sentimentr','syuzhetr','transformers']:
    if all_requested_shown:
      print(f'Library: {library_type.capitalize()} ALL Top {top_n} {crux_label}s Found\n')
    else:
      print(f'Library #{library_type.capitalize()} ONLY Top {top_n_shown} {crux_label}s Found\n')
  else:
    if all_requested_shown:
      print(f'Section #{Select_Section_No} ALL Top {top_n} {crux_label}s Found\n')
    else:
      print(f'Section #{Select_Section_No} ONLY Top {top_n_shown} {crux_label}s Found\n')

  # Print summary of subset Cruxes
  for i, (crux_x_coord, crux_y_coord, sent_txt) in enumerate(crux_n_top_ls):
    print(f'   {crux_label} #{i} at Sentence #{crux_x_coord} with Sentiment Value {crux_y_coord}')
  # print('------------------------------\n')
  # print('Sent_No  Sentiment   Sentence (Raw Text)\n')

  # Print details of each Crux in subset
  for sent_no, sent_pol, sent_txt in crux_n_top_ls:
    sent_no = int(sent_no)
    print('\n\n-------------------------------------------------------------')
    print(f'Sentence #{sent_no}   Sentiment: {sent_pol:.3f}\n') #     {sent_txt}\n')
    # print('------------------------------')
    get_sentnocontext_report(the_sent_no=sent_no, the_n_sideparags=n_sideparags, the_sent_highlight=sentence_highlight)


In [None]:
# Scratch check of the library-name membership test used by the crux report header
library_type = 'syuzhetr'
print("BOO" if library_type not in ('baseline', 'sentimentr', 'syuzhetr', 'transformers') else "It is IN")

In [None]:
# For the selected Section, create an expanded Paragraph DataFrame to match the number of Sentences in the Section

def expand_parags2sents(parags_df='corpus_parags_df', sents_df='corpus_sents_df', model_name='vader_lnorm_medianiqr'):
  '''
  Repeat each Paragraph's sentiment once per Sentence it contains, for the currently
  selected Section, so Paragraph sentiment can be plotted along a Sentence x-axis.

  NOTE: parags_df and sents_df are not used by the body. The Paragraph range comes from
  the global section_parags_df, and values/counts from the globals corpus_parags_df and
  corpus_sents_df; section_sents_df is only read for the printed length comparison.

  Args:
    parags_df: unused (kept for call compatibility)
    sents_df: unused (kept for call compatibility)
    model_name: column of corpus_parags_df holding the Paragraph sentiment values

  Returns:
    Tuple of two lists:
      parag_sentiment_expanded_ls - each Paragraph's sentiment repeated once per Sentence
      parags_midpoint_ls - for each Paragraph, the offset of its middle Sentence counted
        from the first Sentence of the first Paragraph in the range
  '''

  parag_sentiment_expanded_ls = []
  parags_midpoint_ls = []
  sent_sum = 0
  parag_start = section_parags_df.parag_no.min()
  print(f'parag_start: {parag_start}')
  parag_end = section_parags_df.parag_no.max() + 1 # shape[0] + 3
  print(f'parag_end: {parag_end}')
  parags_range_ls = list(range(parag_start, parag_end, 1))
  print(f'parags_range_ls: {parags_range_ls}')
  for i, aparag_no in enumerate(parags_range_ls):
    # .iloc[0] instead of float(Series): converting a Series directly to float is deprecated
    aparag_sentiment_fl = float(corpus_parags_df.loc[corpus_parags_df['parag_no'] == aparag_no, model_name].iloc[0])
    sent_ct = len(corpus_sents_df[corpus_sents_df.parag_no == aparag_no])
    # Midpoint = Sentences in all previous Paragraphs + half of this Paragraph's Sentences
    parags_midpoint_ls.append(int(sent_ct//2 + sent_sum))
    parag_sentiment_expanded_ls.extend([aparag_sentiment_fl] * sent_ct)
    sent_sum += sent_ct
    print(f'#{i}: Paragraph #{aparag_no} has {sent_ct} Sentences and Avg Sentiment: {aparag_sentiment_fl:.3f}')

  print(f'\nSentence Total: {sent_sum} vs Original section_sents_df: {section_sents_df.shape[0]}')
  print(f'  Paragraph Sentiment length: {len(parag_sentiment_expanded_ls)}')

  return parag_sentiment_expanded_ls, parags_midpoint_ls

# Test
# parag_sentiment_expanded_ls, parags_midpoint_ls = expand_parags2sents(parags_df='corpus_parags_df', sents_df='corpus_sents_df')


In [None]:
def get_crux_points(ts_df, col_series, text_type='sentence', win_per=5, sec_y_height=0, subtitle_str=' ', do_plot=True, save2file=False):
  '''
  Given a DataFrame and a Time Series Column within it and a minimum peak spacing (win_per, in percent of the series length)
  Return a tuple of Min/Max Crux Points as (label, x_coord, y_coord) tuples for that Column Time Series

  Args:
    ts_df: DataFrame with a 'sent_no' column and the `col_series` column
    col_series: name of the column holding the (smoothed) sentiment values
    text_type: currently unused
    win_per: minimum distance between neighbouring peaks/valleys, as a percentage of the series length
    sec_y_height: y position of the Section labels drawn on the plot
    subtitle_str: text inserted in the second line of the plot title
    do_plot: when truthy, draw the series, Section boundaries and Crux Points
    save2file: when True, retitle the current figure and save it as a PNG

  Returns:
    tuple of ('peak'|'valley', sent_no, value) tuples: all peaks first, then all valleys

  NOTE: plotting reads the globals corpus_sects_df and CORPUS_FULL; saving reads
        BOOK_TITLE_FULL and CORPUS_FILENAME (none of them are defined in this function)
  '''

  series_len = ts_df.shape[0]
  sent_no_min = ts_df.sent_no.min()

  sm_x = ts_df.index.values
  sm_y = ts_df[col_series].values.flatten()

  # find_peaks() rejects distance < 1, which the percentage yields for short series
  half_win = max(1, int((win_per/100)*series_len))

  # Find peaks(max). find_peaks is used because argrelextrema will not detect flat peaks
  peak_indexes = signal.find_peaks(sm_y, distance=half_win)[0]
  # find_peaks returns positions within the series; shift them to Sentence numbers
  peak_x_ls = [x+sent_no_min for x in peak_indexes]
  peak_y_ls = list(sm_y[peak_indexes])
  peak_coord_ls = tuple(zip(['peak'] * len(peak_y_ls), peak_x_ls, peak_y_ls))

  # Find valleys(min) as the peaks of the negated series
  valley_indexes = signal.find_peaks(-sm_y, distance=half_win)[0]
  valley_x_ls = [x+sent_no_min for x in valley_indexes]
  valley_y_ls = list(sm_y[valley_indexes])
  valley_coord_ls = tuple(zip(['valley'] * len(valley_y_ls), valley_x_ls, valley_y_ls))

  # Combine Peaks and Valley Coordinates into one tuple of (label, x_coord, y_coord)
  crux_coord_ls = peak_coord_ls + valley_coord_ls

  if do_plot:
    # Plot main graph.
    (fig, ax) = plt.subplots()
    ax.plot(sm_x, sm_y)

    section_sent_no_boundries_ls = list(corpus_sects_df['sent_no_start'])
    section_no_ls = list(corpus_sects_df['sect_no'])
    for i, asect_no in enumerate(section_sent_no_boundries_ls):
      # Plot vertical lines for section boundries
      plt.text(asect_no, sec_y_height, f'Section #{section_no_ls[i]}', alpha=0.2, rotation=90)
      plt.axvline(asect_no, color='blue', alpha=0.1)

    # Plot peaks (labelled so that plt.legend() below has entries to show)
    ax.scatter(peak_x_ls, peak_y_ls, label='Peaks')
    for i, txt in enumerate(peak_x_ls):
      ax.annotate(f'  Sent #{txt}', (peak_x_ls[i], peak_y_ls[i]), rotation=90, annotation_clip=True)

    # Plot valleys.
    ax.scatter(valley_x_ls, valley_y_ls, label='Valleys')
    for i, txt in enumerate(valley_x_ls):
      ax.annotate(f'Sent #{txt}', (valley_x_ls[i], valley_y_ls[i]), rotation=270, annotation_clip=True)

    plt.title(f'{CORPUS_FULL} SMA Smoothed Sentence Sentiment Arcs Crux Detection\n{subtitle_str} Models: {col_series}')
    plt.xlabel(f'Sentence No')
    plt.ylabel(f'Sentiment Value')
    plt.legend(loc='best');

  if save2file == True:
    # Save graph to file.
    # NOTE(review): this acts on the current pyplot figure, so with do_plot=False it
    # saves whatever figure happens to be active - confirm that is intended
    plt.title(f'{BOOK_TITLE_FULL} \n SMA Smoothed Sentence Sentiment Arcs Crux Points')
    plt.savefig(f"{CORPUS_FILENAME.split('.')[0]}_find_peaks.png")

  return crux_coord_ls

In [None]:
def get_standardscaler(series_name, values_ser):
  '''
  Standardize a Series of values with a StandardScaler

  Prints the fitted mean and standard deviation labelled with series_name
  Returns the transformed values as a flat Python list
  '''

  # The scaler is fitted on a single (n, 1) column, so lift the values into that shape
  column_np = np.array(values_ser)
  column_np = column_np.reshape((len(column_np), 1))

  fitted_scaler = StandardScaler().fit(column_np)
  print(f'Model: {series_name}\n       Mean: {fitted_scaler.mean_}, StandardDeviation: {np.sqrt(fitted_scaler.var_)}')

  return fitted_scaler.transform(column_np).flatten().tolist()

# Test
# stdscaler_series_ls = get_standardscaler('vader_lnorm_medianiqr_roll100', corpus_sents_df['vader_lnorm_medianiqr_roll100'])
# corpus_sents_df['vader_roll100_stdscaler'] = pd.Series(stdscaler_series_ls)


# **Preprocess and Review Corpus Text (Auto)**

### **Get Corpus by Sections, Chapters, Paragraphs and Sentences**

#### **Get Sections**

In [None]:
corpus_sects_ls, corpus_str_raw = corpus2sects(CORPUS_FILENAME)

print('\n\nAFTER ----------')
print(f'len(corpus_sects_ls): {len(corpus_sects_ls)}')

# Preview the first three and the last two Sections
for sect_idx in (0, 1, 2, -2, -1):
  print("\n\n-----")
  print(f'corpus_sects_ls[{sect_idx}]:\n\n    {corpus_sects_ls[sect_idx]}')

In [None]:
len(corpus_str_raw)

In [None]:
# Verify no CHAPTER headings remain
for i,aline in enumerate(corpus_sects_ls):
  if aline.strip().startswith('CHAPTER '):
    print(f'CHAPTER aline: {aline}')

In [None]:
print(f'len(corpus_sects_ls): {len(corpus_sects_ls)}')
print(corpus_sects_ls[0])

#### **Get Chapters**

In [None]:
!ls -altr *.txt
!head -n 10 $corpus_filename 

In [None]:
# Split the Corpus into Chapters. Reads CORPUS_FILENAME like the Section and
# Paragraph cells do (this cell previously used the lower-case `corpus_filename`).
# NOTE: this rebinds corpus_str_raw, which the Sections cell also assigns.
corpus_chaps_ls, corpus_str_raw = corpus2chaps(CORPUS_FILENAME)

print('\n\nAFTER ----------')
print(f'len(corpus_chaps_ls): {len(corpus_chaps_ls)}')

# Preview the first three Chapters (the last two are deliberately not printed)
for chap_idx in (0, 1, 2):
  print("\n\n-----")
  print(f'corpus_chaps_ls[{chap_idx}]:\n\n    {corpus_chaps_ls[chap_idx]}')

In [None]:
# Verify Chapter and Section Counts

print(f'CHAPTER Count: len(corpus_chaps_ls): {len(corpus_chaps_ls)}')
print(f'SECTION Count: len(corpus_sects_ls): {len(corpus_sects_ls)}')

#### **Get Paragraphs**

In [None]:
# Read corpus into a single string then split into paragraphs

corpus_parags_ls, corpus_raw_str = corpus2parags(CORPUS_FILENAME)
print(f'Found #{len(corpus_parags_ls)} paragraphs\n')

print('\nThe first 10 Paragraphs of the Corpus:')
print('-----------------------------------\n')
corpus_parags_ls[:10]

print('\nThe last 10 Paragraphs of the Corpus:')
print('-----------------------------------\n')
corpus_parags_ls[-10:]
print('\n')

# Shortest Paragraphs first (ties broken alphabetically) to spot leftover headings/noise
n_shortest = 10
print(f'The {n_shortest} shortest Paragraphs in the Corpus are:')
print('--------------------------------------------')
temp_ls = sorted(corpus_parags_ls, key=lambda x: (len(x), x))
for i, aparag in enumerate(temp_ls[:n_shortest]):
  print(f'Shortest Paragraph #{i}: {aparag}')

#### **Get Sentences**

In [None]:
corpus_sents_ls = parag2sents(corpus_parags_ls)

print(f'Found {len(corpus_sents_ls)} Sentences in Corpus\n')

print(f'    First List Object in Sentence List {corpus_sents_ls[0]}\n')

print(f'    Last List Object in Sentence List {corpus_sents_ls[-1]}\n');

print(f"List Object format: ['sent_no', 'parag_no', 'sent_raw']\n")

### **Create DataFrames**

**Create Sentence DataFrame: [corpus_sents_df]**

In [None]:
# Create Corpus Sentence DataFrame
# Each row of corpus_sents_ls becomes one DataFrame row: sent_no, parag_no, sent_raw

corpus_sents_df = pd.DataFrame(corpus_sents_ls)
corpus_sents_df.columns = ['sent_no', 'parag_no', 'sent_raw']
corpus_sents_df['sent_raw'] = corpus_sents_df['sent_raw'].astype('string')
# Double check to drop any rows where the raw Sentence is missing (NA).
# NOTE: dropna() does not remove empty strings ''; those rows are kept by this call
corpus_sents_df.dropna(subset=['sent_raw'], inplace=True)


print(f'First 10 Sentences of {CORPUS_FULL}')
corpus_sents_df.head(10)
corpus_sents_df.info()

**Create Paragraph DataFrame: [corpus_parags_df]**

In [None]:
# Create Corpus Paragraph DataFrame

# Number the Paragraphs 0..n-1 in reading order
parag_raw_ls = list(corpus_parags_ls)
parag_no_ls = list(range(len(parag_raw_ls)))

corpus_parags_df = pd.DataFrame({'parag_no': parag_no_ls, 'parag_raw': parag_raw_ls})

# Double check to drop any rows where the raw Paragraph is missing (NaN)
corpus_parags_df.dropna(subset=['parag_raw'], inplace=True)

# Test 
print(f'First 10 Paragraphs of {CORPUS_FULL}')
corpus_parags_df.head(10)
corpus_parags_df.info()

**Create Section DataFrame: [corpus_sects_df]**

In [None]:
MIN_SECT_LEN=25

In [None]:
# Create Corpus Section DataFrame

# Filter out every Section that contains a 'CHAPTER <1-2 digits>' heading
pattern = r'CHAPTER [\d]{1,2}[^\n]*'
corpus_sects_noheaders_ls = [asect for asect in corpus_sects_ls if re.search(pattern, asect) is None]
corpus_sects_ls = corpus_sects_noheaders_ls

sect_no_ls = list(range(len(corpus_sects_ls)))
sect_raw_ls = list(corpus_sects_ls)

corpus_sects_df = pd.DataFrame(
    {'sect_no': sect_no_ls,
     'sect_raw': sect_raw_ls,
    })


# Calculate the sentence number at the start and at the mid-point of each Section.
# Exactly one entry is appended per Section (NaN when nothing can be computed) so the
# values stay on their own rows. Skipping short Sections without a placeholder
# used to shift every later value up onto the wrong Section.

sect_mid_sentno_ls = []
sect_start_sentno_ls = []
sect_sentno_base = 0
for i, sect_text in enumerate(corpus_sects_df.sect_raw):
  # Create list of Sentences by sent_tokenizing Section raw text string (too-short Sections are not tokenized)
  sect_sents_ls = sent_tokenize(sect_text) if len(sect_text) > MIN_SECT_LEN else []
  if not sect_sents_ls:
    sect_start_sentno_ls.append(float('nan'))
    sect_mid_sentno_ls.append(float('nan'))
    continue

  # Calc and save the sent_no that begins each Section
  sect_first_sent = sect_sents_ls[0].strip()[:30]  # Match on the first 30 chars

  # Remove leading/trailing parentheses and brackets before matching
  sect_first_sent = sect_first_sent.strip('()[]')

  # Find Sentence No for the starting Sentence of each Section (first literal substring match)
  print(f'For Section #{i} seeking first sentence: {sect_first_sent}')
  sect_match_sentno_ls = list(corpus_sents_df[corpus_sents_df['sent_raw'].str.contains(sect_first_sent, regex=False)]['sent_no'])
  if sect_match_sentno_ls:
    sect_start_sentno_ls.append(int(sect_match_sentno_ls[0]))
  else:
    # No Sentence contains the search text: record NaN instead of raising IndexError
    print(f'  WARNING: no matching Sentence found for Section #{i}')
    sect_start_sentno_ls.append(float('nan'))

  # Find the Sentence No for the middle Sentence of each Section
  sect_sents_len = len(sect_sents_ls)
  sect_mid_sentno = int(sect_sents_len/2 + sect_sentno_base)
  sect_mid_sentno_ls.append(sect_mid_sentno)
  sect_sentno_base += sect_sents_len

# Both lists now have one entry per DataFrame row, so they can be assigned directly
corpus_sects_df['sent_no_start'] = sect_start_sentno_ls
corpus_sects_df['sent_no_mid'] = sect_mid_sentno_ls

# Test 
print(f'First 2 Sections of {CORPUS_FULL}')
# corpus_sects_df.head(2)
corpus_sects_df.info()

In [None]:
corpus_sects_df.tail()

In [None]:
# Test

corpus_sents_df[corpus_sents_df['sent_raw'].str.contains('No going to the Lighthouse')]['sent_no']

**Create Chapter DataFrame: [corpus_chaps_df]**

In [None]:
# Create Corpus Chapter DataFrame

chap_no_ls = list(range(len(corpus_chaps_ls)))
chap_raw_ls = list(corpus_chaps_ls)

corpus_chaps_df = pd.DataFrame(
    {'chap_no': chap_no_ls,
     'chap_raw': chap_raw_ls,
    })


# Calculate the sentence number at the start and at the mid-point of each Chapter.
# Exactly one entry is appended per Chapter (NaN when nothing can be computed) so the
# values stay on their own rows. Skipping short Chapters without a placeholder
# used to shift every later value up onto the wrong Chapter.
# NOTE(review): MIN_CHAP_LEN is not assigned in this part of the notebook - confirm it is defined earlier

chap_mid_sentnos_ls = []
chap_start_sentnos_ls = []
chap_sentno_base = 0
for i, chap_text in enumerate(corpus_chaps_df.chap_raw):
  chap_sents_ls = sent_tokenize(chap_text) if len(chap_text) > MIN_CHAP_LEN else []
  if not chap_sents_ls:
    chap_start_sentnos_ls.append(float('nan'))
    chap_mid_sentnos_ls.append(float('nan'))
    continue

  # Calc and save the sent_no that begins each Chapter
  chap_first_sent = chap_sents_ls[0].strip()
  # Take the first literal substring match; int(<Series>) only worked when exactly one Sentence matched
  chap_match_sentno_ls = list(corpus_sents_df[corpus_sents_df['sent_raw'].str.contains(chap_first_sent, regex=False)]['sent_no'])
  if chap_match_sentno_ls:
    chap_start_sentnos_ls.append(int(chap_match_sentno_ls[0]))
  else:
    print(f'WARNING: no matching Sentence found for Chapter #{i}')
    chap_start_sentnos_ls.append(float('nan'))

  # Calc and save the sent_no in the middle of each Chapter
  chap_sents_len = len(chap_sents_ls)
  chap_mid_sentno = int(chap_sents_len/2) + chap_sentno_base
  chap_mid_sentnos_ls.append(chap_mid_sentno)
  chap_sentno_base += chap_sents_len

# Both lists now have one entry per DataFrame row, so they can be assigned directly
corpus_chaps_df['sent_no_start'] = chap_start_sentnos_ls
corpus_chaps_df['sent_no_mid'] = chap_mid_sentnos_ls

# Test 
print(f'First 2 Chapters of {CORPUS_FULL}')
# corpus_chaps_df.head(2)
corpus_chaps_df.info()

In [None]:
print(corpus_sents_df.iloc[237]['sent_raw'])

In [None]:
# TODO: More General Cleanup

In [None]:
# TODO: Normalize Paragraphs by Lengths (Smart Aggregate/Split)

In [None]:
corpus_sects_df.columns

# **Preprocess and Review Corpus Text (Auto)**

In [None]:
# Read corpus into a single string then split into paragraphs

corpus_parags_raw_ls = read_corpus_parags(CORPUS_FILENAME)
print(f'We found #{len(corpus_parags_raw_ls)} lines\n')

print('\nThe first 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[:10]

print('\nThe last 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[-10:]
print('\n')

n_shortest = 10
print(f'The {n_shortest} Sentences in the Corpus are:')
print('--------------------------------------------')
temp_ls = sorted(corpus_parags_raw_ls, key=lambda x: (len(x), x))
for i, asent in enumerate(temp_ls[:n_shortest]):
  print(f'Shortest #{i}: {asent}')

In [None]:
# Tokenize Paragraphs into Sentences
# (the hand-written tokenizing loop that used to sit here, disabled inside a string literal, is gone;
#  parag2sents() is what actually builds the rows)

corpus_sents_row_ls = parag2sents(corpus_parags_raw_ls)
print(f'{len(corpus_sents_row_ls)}')

# Show the first and last row as a quick sanity check
first_row, last_row = corpus_sents_row_ls[0], corpus_sents_row_ls[-1]
print(f'First row {first_row}')
print('\n')
print(f'Last row {last_row}')

In [None]:
# Create Corpus Sentence DataFrame

corpus_sents_df = pd.DataFrame(corpus_sents_row_ls)
corpus_sents_df.columns = ['sent_no', 'parag_no', 'sent_raw']
corpus_sents_df['sent_raw'] = corpus_sents_df['sent_raw'].astype('string')
# Double check to drop any rows where raw Sentence is NaN or empty string ''
corpus_sents_df.dropna(subset=['sent_raw'], inplace=True)


print(f'First 10 Sentences of {CORPUS_FULL}')
corpus_sents_df.head(10)
corpus_sents_df.info()

In [None]:
# Create Corpus Paragraph DataFrame

parag_no_ls = []
parag_raw_ls = []

corpus_parags_df = pd.DataFrame()

for i, aparag in enumerate(corpus_parags_raw_ls):
  parag_no_ls.append(i)
  parag_raw_ls.append(aparag)

corpus_parags_df = pd.DataFrame(
    {'parag_no': parag_no_ls,
     'parag_raw': parag_raw_ls,
    })

# Test 
print(f'First 10 Paragraphs of {CORPUS_FULL}')
corpus_parags_df.head(10)
corpus_parags_df.info()

In [None]:
# Smallest rolling windows: 1% of the Corpus length, bumped to the next odd number when even

# Sentences
corpus_sents_len = corpus_sents_df.shape[0]
win_raw_s1per = int(corpus_sents_len * 0.01)
win_s1per = win_raw_s1per if win_raw_s1per % 2 else win_raw_s1per + 1

# Paragraphs
corpus_parags_len = corpus_parags_df.shape[0]
win_raw_p1per = int(corpus_parags_len * 0.01)
win_p1per = win_raw_p1per if win_raw_p1per % 2 else win_raw_p1per + 1

print(f'Sentence 1 Percent window: {win_s1per}')
print(f'Paragraph 1 Percent window: {win_p1per}')

In [None]:
# TODO: More General Cleanup

In [None]:
# TODO: Normalize Paragraphs by Lengths (Smart Aggregate/Split)

In [None]:
# Calculate some char/token metrics and do some EDA on them

corpus_sents_df['char_len'] = corpus_sents_df['sent_raw'].apply(lambda x: len(x))
corpus_sents_df['token_len'] = corpus_sents_df['sent_raw'].apply(lambda x: len(x.split())) 

corpus_parags_df['char_len'] = corpus_parags_df['parag_raw'].apply(lambda x: len(x))
corpus_parags_df['token_len'] = corpus_parags_df['parag_raw'].apply(lambda x: len(x.split())) 

# corpus_sents_df.head()

In [None]:
# Default cleaned raw text

# Sentences
# Let's take a look at the updated text
corpus_sents_df['sent_clean'] = corpus_sents_df['sent_raw'].apply(lambda x: text_clean(x))
# Ensure to drop all Sentences with NaN or '' Raw Text
corpus_sents_df.replace("", np.nan, regex=True, inplace=True)
corpus_sents_df.dropna(how='any', axis=0, subset=['sent_raw'], inplace=True)

print('\nCompare Raw and Cleaned Sentences:')
print('--------------------------------------')
corpus_sents_df

# Paragraphs
# Let's take a look at the updated text
corpus_parags_df['parag_clean'] = corpus_parags_df['parag_raw'].apply(lambda x: text_clean(x))
# Ensure to drop all Sentences with NaN or '' Raw Text
corpus_parags_df.replace("", np.nan, regex=True, inplace=True)
corpus_parags_df.dropna(how='any', axis=0, subset=['parag_raw'], inplace=True)

print('\nCompare Raw and Cleaned Paragraphs:')
print('--------------------------------------')
corpus_parags_df

In [None]:
corpus_sents_df.shape
print('\n')
corpus_parags_df.shape

## **Save Preprocessed Corpus DataFrames**

In [None]:
# Save Preprocessed Corpus Sentences DataFrame

author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Sentences
corpus_sents_filename = f'corpus_sents_clean_{author_str}_{title_str}_{datetime_now}.csv'
print(f'Saving to file: {corpus_sents_filename}')

corpus_sents_df.to_csv(corpus_sents_filename)

# Paragraphs
corpus_parags_filename = f'corpus_parags_clean_{author_str}_{title_str}_{datetime_now}.csv'
print(f'Saving to file: {corpus_parags_filename}')

corpus_parags_df.to_csv(corpus_parags_filename)


# (ARCHIVED - BEGINNING) Configuration (Manual)

In [None]:
# Verify subdirectory change

!pwd
!ls *.txt

# TODO: Intelligently automate the filling of form based upon directory

In [None]:
CORPUS_TITLE = 'Machines Like Me' #@param {type:"string"}
CORPUS_AUTHOR = "Ian McEwan" #@param {type:"string"}
CORPUS_FILENAME = "mlm_final_hand.txt" #@param {type:"string"}
CORPUS_SUBDIR = "./research/2021/sa_book_code/books_sa/imcewan_machineslikeme" #@param {type:"string"}

CORPUS_FULL = f'{CORPUS_TITLE} by: {CORPUS_AUTHOR}'

PLOT_OUTPUT = "None" #@param ["None", "Major", "All"]

FILE_OUTPUT = "None" #@param ["None", "Major", "All"]

gdrive_subdir = CORPUS_SUBDIR
corpus_filename = ''
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()

print(f'\nWorking Corpus Datafile: {CORPUS_SUBDIR}')
print(f'\nFull Corpus Title/Author: {CORPUS_FULL}')

# Verify contents of Corpus File is Correctly Formatted
#   
# TODO: ./utils/verify_format.py


# Setup Google gDrive

In [None]:
# Connect to Google gDrive

from google.colab import drive, files
drive.mount('/gdrive')
%cd /gdrive/MyDrive/

In [None]:
# Select the Corpus subdirectory on your Google gDrive

gdrive_subdir = "./research/2021/sa_book_code/books_sa/imcewan_machineslikeme" #@param {type:"string"}
CORPUS_SUBDIR = gdrive_subdir
%cd $gdrive_subdir


In [None]:
# Verify subdirectory change

!pwd

In [None]:
!ls *.txt

In [None]:
# Verify contents of Corpus File is Correctly Formatted
#   
# TODO: ./utils/verify_format.py

# Configuration (Auto)

**Global Configuration Constants**

In [None]:
# Minimum lengths for Sentences and Paragraphs
#   (Shorter Sents/Parags will be deleted)

MIN_PARAG_LEN = 2
MIN_SENT_LEN = 2

**Install Libraries**

In [None]:
# INSTALL LIBRARIES

!pip install sklearn

In [None]:
!pip install transformers

**Import Libraries**

In [None]:
import os
import sys
import io
import glob
import contextlib

In [None]:
# IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import re
import string

In [None]:
import collections
from collections import OrderedDict

In [None]:
# Import libraries for logging

import logging
from datetime import datetime
import time                     # (TODO: check no dependencies and delete)
from time import gmtime, strftime

In [None]:
import nltk

# Download for sentence tokenization
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Download for nltk/VADER sentiment analysis
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
from scipy import interpolate
from scipy.interpolate import CubicSpline
from scipy import signal
from scipy.signal import argrelextrema
import scipy.stats


In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess as sm_lowess

In [None]:
import transformers

**Configure Jupyter Notebook**

In [None]:
# Configure Jupyter

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 8]
plt.rcParams['figure.dpi'] = 100

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
from ipywidgets import widgets, interactive

# Configure Google Colab

%load_ext google.colab.data_table

**Configuration Details Snapshot**

In [None]:
# Snap Shot of Time, Machine, Data and Library/Version Blueprint
# TODO:

# Upload Plain Text Corpus (Interactive)

NOTE: Paragraphs separated by '\n\n'

In [None]:
# Create a working subdirectory

CORPUS_SUBDIR = './hfmodels/'

!mkdir $CORPUS_SUBDIR
%cd $CORPUS_SUBDIR

In [None]:
# Ask for an upload only when no Corpus file has been configured yet.
# (The probe used to be misspelled, so the upload prompt always ran.)
try:
  CORPUS_FILENAME
except NameError:
  uploaded_fileinfo = files.upload()
  # Use the first uploaded file as the Corpus file
  CORPUS_FILENAME = list(uploaded_fileinfo.keys())[0]

In [None]:
# If old version of file exists (e.g. endswith '(n).txt'), delete and rename the newly uploaded version
# TODO:

# if CORPUS_FILENAME.endswith('\).txt'):
#   print('At least one previous verion of this file already existed in this upload directory')

# !rm $CORPUS_FILENAME

In [None]:
!pwd
!ls -al $CORPUS_FILENAME

# (ARCHIVED - END) Clean, Preprocess, Convert and Review Corpus (Auto)

In [None]:
CORPUS_TITLE = "Machines Like Me" #@param {type:"string"}
CORPUS_AUTHOR = "Ian McEwan" #@param {type:"string"}
CORPUS_FILENAME = "mlm_final_hand.txt" #@param {type:"string"}

CORPUS_FULL = f'{CORPUS_TITLE} by: {CORPUS_AUTHOR}'

print(f'\nWorking Corpus Datafile: {CORPUS_SUBDIR}')
print(f'\nFull Corpus Title/Author: {CORPUS_FULL}')

In [None]:
# Read corpus into a single string then split into paragraphs

# with open(uploaded_filename, "r", encoding='utf-8', errors='ignore') as infp:

'''
# Uploading a corpus file overrides the earlier corpus form assignments
try:
    uploaded_filename
except NameError:
  pass
else:
  CORPUS_FILENAME = uploaded_filename
'''

with open(CORPUS_FILENAME, "r", encoding=CORPUS_ENCODING) as infp:
  corpus_raw_str = infp.read()

corpus_parags_raw_ls = corpus_raw_str.split('\n\n')

# Strip excess whitespace and drop empty lines
corpus_parags_raw_ls = [x.strip() for x in corpus_parags_raw_ls if len(x.strip()) > MIN_PARAG_LEN]

print(f'We found #{len(corpus_parags_raw_ls)} lines\n')

print('\nThe first 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[:10]

print('\nThe last 10 lines of the Corpus:')
print('-----------------------------------\n')
corpus_parags_raw_ls[-10:]
print('\n')
print(sorted(corpus_parags_raw_ls, key=lambda x: (len(x), x)))

In [None]:
# Create Paragraph DataFrame

parag_no_ls = []
parag_raw_ls = []
for i,aparag in enumerate(corpus_parags_raw_ls):
  parag_no_ls.append(i)
  parag_raw_ls.append(aparag)

corpus_parags_df = pd.DataFrame({'parag_no':parag_no_ls, 'parag_raw':parag_raw_ls})
corpus_parags_df.head(2)

In [None]:
# Tokenize into Sentences

sent_no = 0
# sent_base = 0
corpus_sents_row_ls = []
for parag_no,aparag in enumerate(corpus_parags_raw_ls):
  sents_ls = sent_tokenize(aparag)
  sents_ls = [x.strip() for x in sents_ls if len(x.strip()) > MIN_SENT_LEN]
  # sent_no = sent_base
  for s,asent in enumerate(sents_ls):
    corpus_sents_row_ls.append([sent_no, parag_no, asent])
    sent_no += 1
  # sent_base = sent_no 


print(f'{len(corpus_sents_row_ls)}')

print(f'First row {corpus_sents_row_ls[0]}')
print('\n')
print(f'Last row {corpus_sents_row_ls[-1]}')

In [None]:
# Create Corpus DataFrame

corpus_sents_df = pd.DataFrame(corpus_sents_row_ls)
corpus_sents_df.columns = ['sent_no', 'parag_no', 'sent_raw']
corpus_sents_df['sent_raw'] = corpus_sents_df['sent_raw'].astype('string')
print(f'First 10 Sentences of {CORPUS_FULL}')
corpus_sents_df.head(10)
corpus_sents_df.info()

In [None]:
# TODO: More General Cleanup

In [None]:
# TODO: Normalize Paragraphs by Lengths (Smart Aggregate/Split)

In [None]:
# Generate full path and timestamp for new filepath/filename

def gen_pathfiletime(file_str, subdir_str=''):
  '''
  Given a filename and an optional subdirectory prefix
  Return '<subdir><base>_<author>_<title>_<YYYYmmddHHMMSS>.<ext>'

  author/title come from the globals CORPUS_AUTHOR and CORPUS_TITLE, with whitespace
  removed, lower-cased and reduced to the characters [A-Za-z0-9].
  subdir_str is prepended as-is, so it must already end with a path separator.
  The extension is whatever follows the LAST dot; a name without a dot gets no extension.
  '''

  # Generate compressed author and title substrings
  author_raw_str = ''.join(CORPUS_AUTHOR.split()).lower()
  title_raw_str = ''.join(CORPUS_TITLE.split()).lower()
  author_str = re.sub('[^A-Za-z0-9]+', '', author_raw_str)
  title_str = re.sub('[^A-Za-z0-9]+', '', title_raw_str)

  # Generate current datetime string (local time, one-second resolution)
  datetime_str = datetime.now().strftime('%Y%m%d%H%M%S')

  # Split on the last dot only: split('.') raised ValueError for names such as
  # 'plot.v2.png' (too many parts) or 'README' (too few)
  file_base, dot_str, file_ext = file_str.rpartition('.')
  if not dot_str:
    # No dot at all: rpartition() leaves the whole name in the last slot
    file_base, file_ext = file_ext, ''

  full_filepath_str = f'{subdir_str}{file_base}_{author_str}_{title_str}_{datetime_str}{dot_str}{file_ext}'

  return full_filepath_str

# Test
# pathfilename_str = gen_pathfiletime('hist_paraglen.png')
# print(pathfilename_str)

In [None]:
# Calculate some char/token metrics and do some EDA on them

corpus_sents_df['char_len'] = corpus_sents_df['sent_raw'].apply(lambda x: len(x))
corpus_sents_df['token_len'] = corpus_sents_df['sent_raw'].apply(lambda x: len(x.split())) 
# corpus_sents_df.head()

In [None]:
PLOT_OUTPUT='All'

In [None]:
# Default clean Sentence raw text

#This function converts to lower-case, removes square bracket, removes numbers and punctuation
 
def text_clean(text):
    '''
    Given a raw text string
    Return it lower-cased with [bracketed] spans replaced by a space, punctuation removed,
    every word containing a digit removed, and newlines replaced by spaces
    '''
    # Raw string patterns: '\[' , '\w' and '\d' are invalid escape sequences in a normal
    # string literal and trigger warnings on current Python versions
    text = text.lower()
    text = re.sub(r'\[.*?\]', ' ', text)  # Replace [bracketed] spans with a space
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Drop punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Drop whole words that contain a digit
    text = re.sub('[\n]', ' ', text)  # Replace newline with space
    return text

# Let's take a look at the updated text
corpus_sents_df['sent_clean'] = corpus_sents_df['sent_raw'].apply(lambda x: text_clean(x))
corpus_parags_df['parag_clean'] = corpus_parags_df['parag_raw'].apply(lambda x: text_clean(x))


if (PLOT_OUTPUT == 'All'):
  corpus_sents_df.head(2)
  corpus_sents_df.info()
  corpus_parags_df.head(2)
  corpus_parags_df.info()

In [None]:
# Verify saved under newest filename

def get_recentfile(file_type='csv'):
  '''
  Find the newest file with a given extension in the current directory

  file_type is the extension without the dot (e.g. 'csv')
  Returns the './name.ext' path with the latest modification time
  '''
  glob_pattern = "./*." + file_type
  print(f'file_pattern: {glob_pattern}')

  # Rank every match by its modification time and keep the newest
  return max(glob.glob(glob_pattern), key=os.path.getmtime)

# Test

# get_recentfile('txt')

### **Immediately Save Long-Running Robertlg15-Siebert Values**

In [None]:
# Save all the calculated Sentiment Values
# Writes the Sentence and Paragraph DataFrames to two CSV files in the current directory,
# named after the model, author, title and a UTC timestamp (minute resolution).
# NOTE(review): sa_model must already be set when this cell runs; the only assignments
# visible in this part of the notebook are in later cells - confirm the run order

author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# Save Sentence DataFrame
sats_filename_str = f'sentiment_sents_{sa_model}_{author_str}_{title_str}_{datetime_now}.csv'
print(f'Saving Sentences to: {sats_filename_str}')
corpus_sents_df.to_csv(sats_filename_str, index=False)

# Save Paragraph DataFrame (sats_filename_str is reused, so afterwards it holds the Paragraph filename)
sats_filename_str = f'sentiment_parags_{sa_model}_{author_str}_{title_str}_{datetime_now}.csv'
print(f'Saving Paragraphs to: {sats_filename_str}')
corpus_parags_df.to_csv(sats_filename_str, index=False)


# **Transformer Sentiment Analysis Models**

### Select Interactive Model and Epochs

1) Models
* bert-base-uncased
* xlnet-base-cased
* 
* BertForSentimentClassification
* AlbertForSentimentClassification
* DistilBertForSentimentClassification

2) Hyperparameters (Finetuning)
* Epochs (1,2,3,4,6,8,10,15,20)
* Batch

3) Datasets
* SST

In [None]:
# !pip install sentencepiece

In [None]:
# import sentencepiece

In [None]:
corpus_sents_df.head(2)

## **(2) DistilBERT Default Huggingface Sentiment Analysis**

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification
import torch

In [None]:
# https://github.com/brianadit24/SentimentAnalysiswithBERT_HF/blob/main/Sentiment_Analysis_with_BERT.ipynb


from transformers import pipeline

senti_pipeline = pipeline("sentiment-analysis")



In [None]:


senti_pipeline("I am extremely happy to share this video with all of you")

### **Sentiment Analysis**

In [None]:
# Setup for the default Huggingface Sentiment Analysis pipeline
# (no model is named in the pipeline() call below, so the library's default
#  checkpoint is used, not siebert/sentiment-roberta-large-english)

sa_model = 'distillbertsst'

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# Sentiment analysis pipeline: built once and kept.
# The extra, discarded pipeline('sentiment-analysis') call that preceded this was removed.
hf_sadef_clf = pipeline('sentiment-analysis')
hf_sadef_clf('Such a nice weather outside !')

In [None]:
hf_sadef_clf('Fuck you asshole!')

In [None]:
# Prepare Text from DataFrame

sents_pred_df = corpus_sents_df.copy()
sents_pred_texts = sents_pred_df['sent_raw'].astype('str').tolist() # Want to catch NaN, .dropna().astype('str').tolist()

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a batch of tokenized texts.

    `tokenized_texts` is a mapping whose values are per-example sequences and
    which contains an "input_ids" entry; item `idx` is a dict holding the
    idx-th element of every entry.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples is the number of "input_ids" rows.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        example = {}
        for field, rows in self.tokenized_texts.items():
            example[field] = rows[idx]
        return example
        
# Tokenize texts and create prediction data set
#
# The tokenizer has to come from the same checkpoint as the classifier that will
# score the token ids: a vocabulary from another checkpoint (e.g. bert-base-cased)
# yields ids the classifier was never trained on. The checkpoint below is the
# DistilBERT SST-2 model behind the default 'sentiment-analysis' pipeline.
# Loading model and trainer here also makes this section runnable on a fresh kernel.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

# truncation/padding give every sentence the same length
sents_tokenized_texts = tokenizer(sents_pred_texts,truncation=True,padding=True)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# Run predictions

sents_predictions = trainer.predict(sents_pred_dataset)
# sents_predictions = trainer.predict(sents_tokenized_texts)

In [None]:
# polprob2sentiment(temp_sentiment_df(temp_sentiment_df.iloc[0]['label'], temp_sentiment_df.iloc[0]['score']))

In [None]:
temp_sentiment_df.head(2)

In [None]:
# Turn each predicted label + score into a signed sentiment value.
# A row is negative when its label is '1 star' (star-rating models) or 'NEGATIVE'
# (binary SST-style models); every other label is treated as positive. Checking
# only '1 star' would silently mark every NEGATIVE prediction as positive.
labels_clean = temp_sentiment_df['label'].astype(str).str.strip()
is_negative = (labels_clean == '1 star') | (labels_clean.str.upper() == 'NEGATIVE')

temp_sentiment_df['polarity'] = is_negative.map({True: 'NEGATIVE', False: 'POSITIVE'})
temp_sentiment_df['polarity_sign'] = is_negative.map({True: -1.0, False: +1.0})
temp_sentiment_df['distillbertsst'] = temp_sentiment_df['score']*temp_sentiment_df['polarity_sign']
temp_sentiment_df.head()

In [None]:
corpus_sents_df['distillbertsst'] = temp_sentiment_df['distillbertsst']
corpus_sents_df.head()

In [None]:
# Transform predictions to labels

sa_model = 'distillbertsst'

# Raw logits, one row per sentence
sents_logits = sents_predictions.predictions
sents_preds = sents_logits.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)

# Softmax probability of the winning class. Subtracting the row maximum before
# np.exp leaves the result unchanged but keeps large logits from overflowing.
sents_exp = np.exp(sents_logits - sents_logits.max(-1, keepdims=True))
sents_scores = (sents_exp/sents_exp.sum(-1, keepdims=True)).max(1)

# Create DataFrame with texts, predictions, labels, and scores

temp_sentiment_df = pd.DataFrame(list(zip(sents_pred_texts,sents_preds,sents_labels,sents_scores)), columns=['text','pred','label','score'])
# temp_sentiment_df.head()

# Convert label (Neg/Pos) and score (Prob) to a +/-Sentiment Float Value
# The frame built just above only has text/pred/label/score, so the polarity is
# read from 'label'. Values are assigned by position (.to_numpy()) so that a
# non-default index on corpus_sents_df cannot misalign or blank out rows.

corpus_sents_df[sa_model] = temp_sentiment_df.apply(lambda x: polprob2sentiment(x.label,x.score), axis=1).to_numpy()
corpus_sents_df.head()

In [None]:
polprob2sentiment('{o',0.889)

In [None]:
# Verify the head and tail are complete and correct

corpus_parags_df.iloc[:3]
corpus_parags_df.iloc[-3:]

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame

# Column key of the model whose sentence scores are rolled up
sa_model = 'distillbertsst'
# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# pd.Series(...) carries a 0..n-1 index, so this assignment is aligned on index labels.
# NOTE(review): assumes corpus_parags_df has a default RangeIndex and that the helper
#   returns one value per paragraph row - confirm.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
# Show both ends to check that the new column is filled from first to last row
corpus_parags_df.head(2)
corpus_parags_df.tail(2)

### **Histogram Plots**

In [None]:
# Debug
PLOT_OUTPUT = 'Major'

In [None]:
sa_model = 'distillbertsst'

In [None]:
# Create histogram of Sentence Polarities

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_sents_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Sentence Sentiment Values (Model: {sa_model})');

if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # f-string so the model key is substituted into the name; without the prefix
  # every model would write to the literal file 'hist_sents_{sa_model}.png'.
  plot_filename = f'hist_sents_{sa_model}.png'
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

In [None]:
# Create histogram of Paragraph Polarities

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_parags_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Paragraph Sentiment Values (Model: {sa_model})');

if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # f-string so the model key is substituted into the name; without the prefix
  # every model would write to the literal file 'hist_parags_{sa_model}.png'.
  plot_filename = f'hist_parags_{sa_model}.png'
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

### **Mean SMA Plots**

In [None]:
# SMA % Sentiment of Sentence Sentiments

# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  get_smas(corpus_sents_df, sa_model, text_unit='sentence', win_ls=[5,10,20])

In [None]:
# SMA % Sentiment of Paragraph Sentiments

# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  get_smas(corpus_parags_df, sa_model, text_unit='paragraph', win_ls=[5,10,20])

In [None]:
# Compare Sentence and Paragraph
# Two stacked line plots: sentence-level SMA on top, paragraph-level SMA below.
# NOTE(review): the y column is hard-coded to 'roberta_lg15_mean_roll100' instead of
#   being derived from sa_model (set to 'distillbertsst' in this section), so this
#   plots another model's series or raises if that column is absent. Confirm the
#   column name get_smas() writes and build it from sa_model.

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y='roberta_lg15_mean_roll100', data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (10% SMA)')
  sns.lineplot(x=corpus_parags_df.index, y='roberta_lg15_mean_roll100', data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (10% SMA)')
  fig.show()

In [None]:
# corpus_sents_df.drop(columns=['y_scaled'], axis=1, inplace=True)

### **Stanford ASAP Plot**


**Stanford ASAP: Automatic Smoothing for Attention Prioritization in Time Series**

* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP.ipynb (Python)
* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP-simple.js
* http://futuredata.stanford.edu/asap/ 
* https://www.datadoghq.com/blog/auto-smoother-asap/

**Save the following plots to gDrive files?**

In [None]:
Save_to_File = False #@param {type:"boolean"}


In [None]:
# Sentence ASAP smoothing + plot for the model selected by sa_model

# raw_data = load_csv('Taxi.csv')
# Sentence-level sentiment series as a plain list
raw_data = list(corpus_sents_df[sa_model])
# Let ASAP pick the window and slide sizes for this series
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# do_plot=True is passed unconditionally, i.e. this call does not consult PLOT_OUTPUT
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, do_plot=True, save2file=Save_to_File);

# Placeholder: the branch below does nothing yet (figure saving is not implemented here)
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

In [None]:
# Paragraph ASAP smoothing + plot for the model selected by sa_model

# raw_data = load_csv('Taxi.csv')
# Paragraph-level sentiment series as a plain list
raw_data = list(corpus_parags_df[sa_model])
# Let ASAP pick the window and slide sizes for this series
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Unlike the sentence cell, do_plot is not passed here, so plot_asap's own default applies
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, save2file=Save_to_File);

# Placeholder: the branch below does nothing yet (figure saving is not implemented here)
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

### **LOWESS Plots**

**Sentence Sentiment LOWESS Plots**

In [None]:
# Debug

PLOT_OUTPUT='Major'

In [None]:
# Sentence Sentiment LOWESS: one smoothed column per frac value, then their row-wise mean

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./4, 1./6, 1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot only at the 'All' and 'Major' output levels
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The frac is encoded in the column name as a rounded percentage
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_sents_df[new_lowess_col] = plot_lowess(corpus_sents_df, [sa_model], do_plot=aplot, afrac=afrac)

# Average the per-frac curves into a single summary column
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_sents_df[new_lowess_mean_col] = corpus_sents_df[cols_lowess].mean(axis=1)

corpus_sents_df.head(2)

**Paragraph Sentiment LOWESS Plots**

In [None]:
# Paragraph Sentiment LOWESS: one smoothed column per frac value, then their row-wise mean

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot only at the 'All' and 'Major' output levels
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The frac is encoded in the column name as a rounded percentage
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_parags_df[new_lowess_col] = plot_lowess(corpus_parags_df, [sa_model], do_plot=aplot, afrac=afrac)

# Average the per-frac curves into a single summary column
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_parags_df[new_lowess_mean_col] = corpus_parags_df[cols_lowess].mean(axis=1)

corpus_parags_df.head(2)

**Compare Sentence and Paragraph Mean LOWESS**

In [None]:
# Compare Sentence and Paragraph LOWESS means

# Calculate the Sentence and Paragraph LOWESS means and plot

# Get the calculated LOWESS columns in a list ('_mean_' aggregates are excluded).
# Both DataFrames collect '<model>_frac<NN>_lowess' columns for every model that
# has been run, so the matches are narrowed to the current model's prefix.
# Otherwise '<sa_model>_mean_lowess' would average other models' curves in as well.
model_prefix = f'{sa_model}_'
cols_sents_lowess = [acol for acol in get_cols_regex(corpus_sents_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False) if str(acol).startswith(model_prefix)]
cols_parags_lowess = matching_cols = [acol for acol in get_cols_regex(corpus_parags_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False) if str(acol).startswith(model_prefix)]

# Compute the mean for all the LOWESS columns of this model
col_mean_lowess = f'{sa_model}_mean_lowess'
corpus_sents_df[col_mean_lowess] = corpus_sents_df[cols_sents_lowess].mean(axis=1)
corpus_parags_df[col_mean_lowess] = corpus_parags_df[cols_parags_lowess].mean(axis=1)

# Plot the Sentence and Paragraph LOWESS means
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y=col_mean_lowess, data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (8-12 LOWESS)')
  sns.lineplot(x=corpus_parags_df.index, y=col_mean_lowess, data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (8-12 LOWESS)')
  fig.show()

### **Save Newly Computed Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes both DataFrames (all columns accumulated so far) to CSV in the current working directory.

# File-name parts: author/title with all whitespace removed and lower-cased,
# plus a UTC timestamp at minute resolution.
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# The name does not include sa_model: two saves within the same minute share one file name.
sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with LOWESS in file: {sats_filename_str}')

# sats_filename_str is reused for the paragraph file
sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with LOWESS in file: {sats_filename_str}')


## **RoBERTa Large English Tuned on 15 SA Datasets**
* **siebert/sentiment-roberta-large-english**

This model is a fine-tuned checkpoint of RoBERTa-large (Liu et al. 2019). It enables reliable binary sentiment analysis for various types of English-language text. For each instance, it predicts either positive (1) or negative (0) sentiment. The model was fine-tuned and evaluated on 15 data sets from diverse text sources to enhance generalization across different types of texts (reviews, tweets, etc.). Consequently, it outperforms models trained on only one type of text (e.g., movie reviews from the popular SST-2 benchmark) when used on new data as shown below.

Jon Chun
20 Jun 2021

Reference:

* https://huggingface.co/siebert/sentiment-roberta-large-english

* https://huggingface.co/siebert/sentiment-roberta-large-english

### **Sentiment Analysis**

In [None]:
# Setup for RoBERTa Large English 15datasets: siebert/sentiment-roberta-large-english

sa_model = 'robertalg15'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Index-addressable view over a batch of tokenized texts.

    `tokenized_texts` is a mapping whose values are per-example sequences and
    which contains an "input_ids" entry; item `idx` is a dict holding the
    idx-th element of every entry.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # The number of examples is the number of "input_ids" rows.
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        example = {}
        for field, rows in self.tokenized_texts.items():
            example[field] = rows[idx]
        return example


# Set model
# sa_model: short key used for this model's DataFrame columns
# model_name: Huggingface hub id of the checkpoint to download
sa_model = 'robertalg15'
model_name = "siebert/sentiment-roberta-large-english"

# Load tokenizer and model, create trainer
# Tokenizer and classifier come from the same checkpoint, so the token ids match the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# No TrainingArguments are passed, so the Trainer runs with the library defaults.
# NOTE(review): presumably only needed for trainer.predict(...) - confirm.
trainer = Trainer(model=model);

In [None]:
# Prepare Text from DataFrame

# Work on a copy so this step leaves corpus_sents_df untouched.
sents_pred_df = corpus_sents_df.copy()
# astype('str') keeps one list entry per row: a missing value becomes the literal
# string 'nan' (and will be scored like any other sentence) rather than being dropped.
sents_pred_texts = sents_pred_df['sent_raw'].astype('str').tolist() # Want to catch NaN, .dropna().astype('str').tolist()

# Tokenize texts and create prediction data set
# Uses whichever `tokenizer` is currently loaded; it must belong to the model that will score these ids.

sents_tokenized_texts = tokenizer(sents_pred_texts,truncation=True,padding=True)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# temp_sentiment_df.head(3)

In [None]:
corpus_sents_df.columns

In [None]:
# temp_sentiment_df.head(2)

In [None]:
# temp_ser = temp_sentiment_df.apply(lambda x: polprob2sentiment(str(x.label), float(x.score)))

In [None]:
# Run predictions
# Score the freshly tokenized sentences with the currently loaded model. Without this
# call the lines below would silently reuse whatever `sents_predictions` was left in
# the kernel by a previous model run.

sents_predictions = trainer.predict(sents_pred_dataset)

# Transform predictions to labels

# Raw logits, one row per sentence
sents_logits = sents_predictions.predictions
sents_preds = sents_logits.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)

# Softmax probability of the winning class. Subtracting the row maximum before
# np.exp leaves the result unchanged but keeps large logits from overflowing.
sents_exp = np.exp(sents_logits - sents_logits.max(-1, keepdims=True))
sents_scores = (sents_exp/sents_exp.sum(-1, keepdims=True)).max(1)

# Create DataFrame with texts, predictions, labels, and scores

temp_sentiment_df = pd.DataFrame(list(zip(sents_pred_texts,sents_preds,sents_labels,sents_scores)), columns=['text','pred','label','score'])
# temp_sentiment_df.head()

# Convert label (Neg/Pos) and score (Prob) to a +/-Sentiment Float Value
# Values are assigned by position (.to_numpy()) so that a non-default index on
# corpus_sents_df cannot misalign or blank out rows.

corpus_sents_df[sa_model] = temp_sentiment_df.apply(lambda x: polprob2sentiment(x.label,x.score), axis=1).to_numpy()
corpus_sents_df.head()

In [None]:
# Verify the head and tail are complete and correct

corpus_parags_df.iloc[:3]
corpus_parags_df.iloc[-3:]

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame
# Uses the sa_model key set earlier in this section.

# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# pd.Series(...) carries a 0..n-1 index, so this assignment is aligned on index labels.
# NOTE(review): assumes corpus_parags_df has a default RangeIndex and that the helper
#   returns one value per paragraph row - confirm.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
# Show both ends to check that the new column is filled from first to last row
corpus_parags_df.head(2)
corpus_parags_df.tail(2)

### **Histogram Plots**

In [None]:
# Debug
PLOT_OUTPUT = 'Major'

In [None]:
# Create histogram of Sentence Polarities

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_sents_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Sentence Sentiment Values (Model: {sa_model})');

if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # f-string so the model key is substituted into the name; without the prefix
  # every model would write to the literal file 'hist_sents_{sa_model}.png'.
  plot_filename = f'hist_sents_{sa_model}.png'
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

In [None]:
# Create histogram of Paragraph Polarities

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_parags_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Paragraph Sentiment Values (Model: {sa_model})');

if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # f-string so the model key is substituted into the name; without the prefix
  # every model would write to the literal file 'hist_parags_{sa_model}.png'.
  plot_filename = f'hist_parags_{sa_model}.png'
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

### **Mean SMA Plots**

In [None]:
# SMA % Sentiment of Sentence Sentiments

# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  get_smas(corpus_sents_df, sa_model, text_unit='sentence', win_ls=[5,10,20])

In [None]:
# SMA % Sentiment of Paragraph Sentiments

# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  get_smas(corpus_parags_df, sa_model, text_unit='paragraph', win_ls=[5,10,20])

In [None]:
# Compare Sentence and Paragraph
# Two stacked line plots: sentence-level SMA on top, paragraph-level SMA below.
# NOTE(review): the y column is hard-coded to 'roberta_lg15_mean_roll100' while this
#   section's sa_model is 'robertalg15' (no underscore), so the name may not match
#   what get_smas() wrote. Confirm the column name and build it from sa_model.

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y='roberta_lg15_mean_roll100', data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (10% SMA)')
  sns.lineplot(x=corpus_parags_df.index, y='roberta_lg15_mean_roll100', data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (10% SMA)')
  fig.show()

In [None]:
# corpus_sents_df.drop(columns=['y_scaled'], axis=1, inplace=True)

### **Stanford ASAP Plot**


**Stanford ASAP: Automatic Smoothing for Attention Prioritization in Time Series**

* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP.ipynb (Python)
* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP-simple.js
* http://futuredata.stanford.edu/asap/ 
* https://www.datadoghq.com/blog/auto-smoother-asap/

**Save the following plots to gDrive files?**

In [None]:
Save_to_File = False #@param {type:"boolean"}


In [None]:
# Sentence ASAP smoothing + plot for the model selected by sa_model

# raw_data = load_csv('Taxi.csv')
# Sentence-level sentiment series as a plain list
raw_data = list(corpus_sents_df[sa_model])
# Let ASAP pick the window and slide sizes for this series
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# do_plot=True is passed unconditionally, i.e. this call does not consult PLOT_OUTPUT
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, do_plot=True, save2file=Save_to_File);

# Placeholder: the branch below does nothing yet (figure saving is not implemented here)
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

In [None]:
# Paragraph ASAP smoothing + plot for the model selected by sa_model

# raw_data = load_csv('Taxi.csv')
# Paragraph-level sentiment series as a plain list
raw_data = list(corpus_parags_df[sa_model])
# Let ASAP pick the window and slide sizes for this series
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Unlike the sentence cell, do_plot is not passed here, so plot_asap's own default applies
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, save2file=Save_to_File);

# Placeholder: the branch below does nothing yet (figure saving is not implemented here)
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

### **LOWESS Plots**

**Sentence Sentiment LOWESS Plots**

In [None]:
# Debug

PLOT_OUTPUT='Major'

In [None]:
# Sentence Sentiment LOWESS: one smoothed column per frac value, then their row-wise mean

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./4, 1./6, 1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot only at the 'All' and 'Major' output levels
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The frac is encoded in the column name as a rounded percentage
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_sents_df[new_lowess_col] = plot_lowess(corpus_sents_df, [sa_model], do_plot=aplot, afrac=afrac)

# Average the per-frac curves into a single summary column
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_sents_df[new_lowess_mean_col] = corpus_sents_df[cols_lowess].mean(axis=1)

corpus_sents_df.head(2)

In [None]:
corpus_parags_df.columns

**Paragraph Sentiment LOWESS Plots**

In [None]:
# Paragraph Sentiment LOWESS: one smoothed column per frac value, then their row-wise mean

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot only at the 'All' and 'Major' output levels
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The frac is encoded in the column name as a rounded percentage
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_parags_df[new_lowess_col] = plot_lowess(corpus_parags_df, [sa_model], do_plot=aplot, afrac=afrac)

# Average the per-frac curves into a single summary column
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_parags_df[new_lowess_mean_col] = corpus_parags_df[cols_lowess].mean(axis=1)

corpus_parags_df.head(2)

**Compare Sentence and Paragraph Mean LOWESS**

In [None]:
# Compare Sentence and Paragraph LOWESS means

# Calculate the Sentence and Paragraph LOWESS means and plot

# Get the calculated LOWESS columns in a list ('_mean_' aggregates are excluded).
# Both DataFrames collect '<model>_frac<NN>_lowess' columns for every model that
# has been run, so the matches are narrowed to the current model's prefix.
# Otherwise '<sa_model>_mean_lowess' would average other models' curves in as well.
model_prefix = f'{sa_model}_'
cols_sents_lowess = [acol for acol in get_cols_regex(corpus_sents_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False) if str(acol).startswith(model_prefix)]
cols_parags_lowess = matching_cols = [acol for acol in get_cols_regex(corpus_parags_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False) if str(acol).startswith(model_prefix)]

# Compute the mean for all the LOWESS columns of this model
col_mean_lowess = f'{sa_model}_mean_lowess'
corpus_sents_df[col_mean_lowess] = corpus_sents_df[cols_sents_lowess].mean(axis=1)
corpus_parags_df[col_mean_lowess] = corpus_parags_df[cols_parags_lowess].mean(axis=1)

# Plot the Sentence and Paragraph LOWESS means
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y=col_mean_lowess, data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (8-12 LOWESS)')
  sns.lineplot(x=corpus_parags_df.index, y=col_mean_lowess, data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (8-12 LOWESS)')
  fig.show()

### **Save Newly Computed Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes both DataFrames (all columns accumulated so far) to CSV in the current working directory.

# File-name parts: author/title with all whitespace removed and lower-cased,
# plus a UTC timestamp at minute resolution.
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# The name does not include sa_model: two saves within the same minute share one file name.
sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with LOWESS in file: {sats_filename_str}')

# sats_filename_str is reused for the paragraph file
sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with LOWESS in file: {sats_filename_str}')


## **(4) BERT NLP Town**

### **Sentiment Analysis**

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

In [None]:
import torch

In [None]:
!pip install sentencepiece

In [None]:
import sentencepiece

In [None]:
# Instantiate model

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

#### **Sentences**

In [None]:
tokens = tokenizer.encode("I hated mas Yoza, he absolutely the worst mentor", return_tensors='pt')


In [None]:
# Predict Tokens
tokens = tokenizer.encode("It wasn't the worst i've seen, in fact, it was the opposite", return_tensors='pt')
# tokens[0]
# tokenizer.decode(tokens[0])
result = model(tokens)
result

In [None]:
predict_sentiment = int(torch.argmax(result.logits))+1
predict_sentiment

In [None]:
def nlptown_sentiment_score(text):
  '''
  Given a text string (sentence or paragraph)
  Return a floating point sentiment value

  The integer part is the index of the highest-scoring class in the model
  output (argmax over the logits, counted from 0). The fractional part is
  that class's share of the summed absolute logits - a rough confidence
  heuristic, not a softmax probability.

  Uses the notebook-level `tokenizer`, `model` and `torch`.
  Text longer than 512 tokens is scored on its first 512 tokens.
  '''

  # Truncate at the encoder limit so an over-long text is scored on its leading
  # tokens instead of raising inside the model.
  tokens = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=512)

  # Inference only: no gradients are needed, so skip the autograd bookkeeping.
  with torch.no_grad():
    result = model(tokens)

  # Logits of the single input row as plain floats
  logits_ls = result.logits[0].tolist()
  logits_abs_sum = sum(map(abs, logits_ls))

  prob_int = int(torch.argmax(result.logits))
  prob_frac = abs(logits_ls[prob_int]/logits_abs_sum)

  return prob_int + prob_frac # int(torch.argmax(result.logits))+1

In [None]:
nlptown_sentiment_score('i love the smell of beautiful flowers, the make me happy')

In [None]:
%time

# NOTE: 10m Long-running process

# Calculate Sentence Sentiment Scores using the NLPTown BERT fine-grained, fine-tuned, multi-lingual model

# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

# This a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews in 
#    six languages: English, Dutch, German, French, Spanish and Italian. 
#    It predicts the sentiment of the review as a number of stars (between 1 and 5).

corpus_sents_df['nlptown'] = corpus_sents_df['sent_raw'].astype('str').apply(lambda x: nlptown_sentiment_score(x))

In [None]:
# Verify

corpus_sents_df.iloc[:3]

#### **Paragraphs**

In [None]:
def trim_maxtokens(astr, token_max):
  '''
  Given an input string of tokens and a maximum token limit
  Return a string at token limit by dropping frequent words w/o sentiment value first, then random

  Tokens are whitespace-separated words. The frequent words are matched
  case-sensitively and are always removed, even when the input already fits.
  If more than token_max words remain, a random subset of token_max words is
  kept in its original order (a limit below zero is treated as zero).
  '''
  import random  # local import: the function does not rely on a notebook-level import

  deadwords = frozenset(['the','of','to','and','a','in','is','it','that','was','for','on','are','with','as','be','at','one','have','this','from','or','had','by'])

  # One pass over the words with O(1) set membership
  astr_ls = [aword for aword in astr.split() if aword not in deadwords]

  if len(astr_ls) > token_max:
    print('too long')
    # Pick the surviving POSITIONS at random, then restore their order. Choosing
    # positions (not values) means a repeated word is dropped where it was drawn,
    # rather than always at its first occurrence, and avoids repeated list scans.
    keep_count = max(token_max, 0)
    keep_idx_ls = sorted(random.sample(range(len(astr_ls)), keep_count))
    astr_ls = [astr_ls[idx] for idx in keep_idx_ls]

  astr_condensed = ' '.join(astr_ls)

  return astr_condensed

# Test
# sentences_condensed = trim_maxtokens('Hello big boy! What the heck are you doing here?', 6)
# sentences_condensed

In [None]:
# Calculate Paragraph Sentiment Scores using the NLPTown BERT fine-grained, fine-tuned, multi-lingual model

# NOTE: Long-running

# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

# This a bert-base-multilingual-uncased model finetuned for sentiment analysis on product reviews in 
#    six languages: English, Dutch, German, French, Spanish and Italian. 
#    It predicts the sentiment of the review as a number of stars (between 1 and 5).

# Each paragraph is first shortened by trim_maxtokens and then scored.
# The 510 limit is a count of whitespace-separated words, not of tokenizer tokens.
# NOTE(review): a 510-word paragraph can still produce more word-piece tokens than
#   the model accepts - confirm the limit is low enough or truncate in the tokenizer.
corpus_parags_df['nlptown'] = corpus_parags_df['parag_raw'].astype('str').apply(lambda x: nlptown_sentiment_score(trim_maxtokens(x,510)))

In [None]:
# Verify

corpus_parags_df[:2]

# **Save Newest Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes both DataFrames to CSV files in the PARENT of the current working directory ('../').

# File-name parts: author/title with all whitespace removed and lower-cased.
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
# Computed but not used in the active file names below (only in the commented-out variants)
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

# The active name has no timestamp, so each run overwrites the previous file for this author/title.
# sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
sats_filename_str = f'../sum_sentiments_sents_transformers_{author_str}_{title_str}.csv' # _{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with NLPTown added to file: {sats_filename_str}')

# sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
sats_filename_str = f'../sum_sentiments_parags_transformers_{author_str}_{title_str}.csv' # _{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with NLPTown added to file: {sats_filename_str}')


In [None]:
corpus_sents_df.head(2)

In [None]:
!pwd

In [None]:
# Setup for RoBERTa Large English 15datasets: siebert/sentiment-roberta-large-english

sa_model = 'robertalg15'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapping a tokenizer's batched output.

    `tokenized_texts` is a mapping of field name -> per-example sequence and must
    contain an "input_ids" entry, which defines the dataset length.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # Number of examples == number of input_ids rows
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field to form one example
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example


# Set model
# sa_model is the short key later used as the DataFrame column name for this model's
# scores (it repeats the assignment at the top of this cell); model_name is the
# Hugging Face Hub checkpoint id.
sa_model = 'robertalg15'
model_name = "siebert/sentiment-roberta-large-english"

# Load tokenizer and model, create trainer
# The Trainer is built from the model alone; in the cells visible here it is only
# used for trainer.predict(). The trailing ';' suppresses the cell's output.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model);

In [None]:
# Build the list of sentence strings to score

# Work on a copy of the sentence frame; every 'sent_raw' value is cast to str
sents_pred_df = corpus_sents_df.copy()
sents_pred_texts = (
    sents_pred_df['sent_raw']
    .astype('str')
    .tolist()
)  # Want to catch NaN, .dropna().astype('str').tolist()

# Tokenize the whole batch (truncated and padded) and wrap it for trainer.predict()
sents_tokenized_texts = tokenizer(
    sents_pred_texts,
    truncation=True,
    padding=True,
)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# Run predictions

sents_predictions = trainer.predict(sents_pred_dataset)

In [None]:
temp_sentiment_df.head(3)

In [None]:
corpus_sents_df.columns

In [None]:
temp_sentiment_df.head(2)

In [None]:
# Convert each (label, score) row of temp_sentiment_df into a signed sentiment value.
# axis=1 is required: with the default axis=0, DataFrame.apply hands whole columns to
# the lambda, so x.label / x.score raise AttributeError.
# NOTE: temp_sentiment_df is created in the "Transform predictions to labels" cell
# further down, so that cell must have been run first.
temp_ser = temp_sentiment_df.apply(lambda x: polprob2sentiment(str(x.label), float(x.score)), axis=1)

In [None]:
# Turn raw model outputs into class ids, label strings and winning-class probabilities

sents_preds = sents_predictions.predictions.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)
# Softmax over the last axis, then keep the largest probability per row
sents_scores = (
    np.exp(sents_predictions[0])
    / np.exp(sents_predictions[0]).sum(-1, keepdims=True)
).max(1)

# One row per sentence: text, predicted class id, label and score
temp_sentiment_df = pd.DataFrame(
    list(zip(sents_pred_texts, sents_preds, sents_labels, sents_scores)),
    columns=['text', 'pred', 'label', 'score'],
)
# temp_sentiment_df.head()

# Fold label (Neg/Pos) and score (Prob) into a single +/- sentiment float per sentence
corpus_sents_df[sa_model] = temp_sentiment_df.apply(
    lambda row: polprob2sentiment(row.label, row.score),
    axis=1,
)
corpus_sents_df.head()

In [None]:
# Verify the head and tail are complete and correct

corpus_parags_df.iloc[:3]
corpus_parags_df.iloc[-3:]

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame
# The helper's result is wrapped in a pd.Series and stored under the sa_model column.

# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# pd.Series(...) gets a default 0..n-1 index and column assignment aligns on index labels.
# NOTE(review): assumes corpus_parags_df also has a 0..n-1 index and one value per paragraph — confirm.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
corpus_parags_df.head(2)
corpus_parags_df.tail(2)

### **Histogram Plots**

In [None]:
# Debug
PLOT_OUTPUT = 'Major'

In [None]:
# Create histogram of Sentence Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_sents_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Sentence Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_sents_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

In [None]:
# Create histogram of Paragraph Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_parags_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Paragraph Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_parags_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

### **Mean SMA Plots**

In [None]:
# Simple moving averages (SMA) of the sentence-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_sents_df, sa_model, text_unit='sentence', win_ls=[5, 10, 20])

In [None]:
# Simple moving averages (SMA) of the paragraph-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_parags_df, sa_model, text_unit='paragraph', win_ls=[5, 10, 20])

In [None]:
# Compare Sentence and Paragraph
# Two stacked line plots of a rolling-mean column: sentences on top, paragraphs below.
# NOTE(review): the y column is hard-coded as 'roberta_lg15_mean_roll100' while sa_model is
# 'robertalg15' (no underscore). Presumably the name should follow sa_model — verify which
# column get_smas actually creates, otherwise the plot fails on a missing column.

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y='roberta_lg15_mean_roll100', data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (10% SMA)')
  sns.lineplot(x=corpus_parags_df.index, y='roberta_lg15_mean_roll100', data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (10% SMA)')
  fig.show()

In [None]:
# corpus_sents_df.drop(columns=['y_scaled'], axis=1, inplace=True)

### **Stanford ASAP Plot**


**Stanford ASAP: Automatic Smoothing for Attention Prioritization in Time Series**

* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP.ipynb (Python)
* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP-simple.js
* http://futuredata.stanford.edu/asap/ 
* https://www.datadoghq.com/blog/auto-smoother-asap/

**Save the following plots to gDrive files?**

In [None]:
Save_to_File = False #@param {type:"boolean"}


In [None]:
# Sentence SMA/ASAP Plots with roberta_lg15
# Feeds the sentence sentiment column to smooth_ASAP to obtain a window and slide size,
# then plots the smoothed series. raw_data, window_size, slide_size, asap_x and asap_y
# stay in the kernel and are read by the report cell that follows.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_sents_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Plotting is forced on here (do_plot=True); saving follows the Save_to_File form toggle
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, do_plot=True, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

In [None]:
# Paragraph SMA/ASAP Plots with roberta_lg15
# Same as the sentence ASAP cell but on the paragraph sentiment column. It overwrites
# raw_data, window_size, slide_size, asap_x and asap_y with the paragraph-level values.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_parags_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Unlike the sentence cell, do_plot is not passed here, so plot_asap's default applies
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

### **LOWESS Plots**

**Sentence Sentiment LOWESS Plots**

In [None]:
# Debug

PLOT_OUTPUT='Major'

In [None]:
# LOWESS-smooth the sentence sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./4, 1./6, 1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_sents_df[new_lowess_col] = plot_lowess(
      corpus_sents_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_sents_df[new_lowess_mean_col] = corpus_sents_df[cols_lowess].mean(axis='columns')

corpus_sents_df.head(2)

In [None]:
corpus_parags_df.columns

**Paragraph Sentiment LOWESS Plots**

In [None]:
# LOWESS-smooth the paragraph sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_parags_df[new_lowess_col] = plot_lowess(
      corpus_parags_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_parags_df[new_lowess_mean_col] = corpus_parags_df[cols_lowess].mean(axis='columns')

corpus_parags_df.head(2)

**Compare Sentence and Paragraph Mean LOWESS**

In [None]:
# Compare Sentence and Paragraph LOWESS means

# Calculate the Sentence and Paragraph LOWESS means and plot
# This recomputes (and overwrites) the '<sa_model>_mean_lowess' column that the two
# per-unit LOWESS cells already wrote.

# Get all the calculated LOWESS columns in a list 
# matching_cols is assigned as a side effect and ends up holding the paragraph list.
# NOTE(review): the pattern '_lowess' is not restricted to sa_model; if the frames hold
# LOWESS columns from other models they would presumably be averaged in too — verify
# against get_cols_regex.
cols_sents_lowess = matching_cols = get_cols_regex(corpus_sents_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)
cols_parags_lowess = matching_cols = get_cols_regex(corpus_parags_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)

# Compute the mean for all the LOWESS columns
col_mean_lowess = f'{sa_model}_mean_lowess'
corpus_sents_df[col_mean_lowess] = corpus_sents_df[cols_sents_lowess].mean(axis=1)
corpus_parags_df[col_mean_lowess] = corpus_parags_df[cols_parags_lowess].mean(axis=1)

# Plot the Sentence and Paragraph LOWESS means
# NOTE(review): the titles say "8-12 LOWESS" but the frac lists used in the LOWESS cells
# span 0.05-0.25 — confirm the intended label.
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y=col_mean_lowess, data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (8-12 LOWESS)')
  sns.lineplot(x=corpus_parags_df.index, y=col_mean_lowess, data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (8-12 LOWESS)')
  fig.show()

### **Save Newly Computed Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes the sentence and paragraph DataFrames (now including the LOWESS columns) to
# timestamped CSV files in the current working directory.

# Strip all whitespace from the corpus author/title constants and lower-case them
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp with minute resolution, embedded in both file names
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with LOWESS in file: {sats_filename_str}')

# sats_filename_str is reused: after this cell it holds the paragraph file name
sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with LOWESS in file: {sats_filename_str}')


## **ALBERT Sentiment Analysis**

### **Sentiment Analysis**

In [None]:
# Setup for RoBERTa Large English 15datasets: siebert/sentiment-roberta-large-english

sa_model = 'robertalg15'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapping a tokenizer's batched output.

    `tokenized_texts` is a mapping of field name -> per-example sequence and must
    contain an "input_ids" entry, which defines the dataset length.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # Number of examples == number of input_ids rows
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field to form one example
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example


# Set model
# sa_model is the short key later used as the DataFrame column name for this model's
# scores; model_name is the Hugging Face Hub checkpoint id.
# NOTE(review): this cell sits under the "ALBERT Sentiment Analysis" heading but loads the
# RoBERTa checkpoint again under the same 'robertalg15' key, so the cells below would
# overwrite the RoBERTa columns rather than add ALBERT results — confirm intent.
sa_model = 'robertalg15'
model_name = "siebert/sentiment-roberta-large-english"

# Load tokenizer and model, create trainer
# The Trainer is built from the model alone; in the cells visible here it is only
# used for trainer.predict(). The trailing ';' suppresses the cell's output.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model);

In [None]:
# Build the list of sentence strings to score

# Work on a copy of the sentence frame; every 'sent_raw' value is cast to str
sents_pred_df = corpus_sents_df.copy()
sents_pred_texts = (
    sents_pred_df['sent_raw']
    .astype('str')
    .tolist()
)  # Want to catch NaN, .dropna().astype('str').tolist()

# Tokenize the whole batch (truncated and padded) and wrap it for trainer.predict()
sents_tokenized_texts = tokenizer(
    sents_pred_texts,
    truncation=True,
    padding=True,
)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# Run predictions

sents_predictions = trainer.predict(sents_pred_dataset)

In [None]:
temp_sentiment_df.head(3)

In [None]:
corpus_sents_df.columns

In [None]:
temp_sentiment_df.head(2)

In [None]:
# Convert each (label, score) row of temp_sentiment_df into a signed sentiment value.
# axis=1 is required: with the default axis=0, DataFrame.apply hands whole columns to
# the lambda, so x.label / x.score raise AttributeError.
# NOTE: temp_sentiment_df is created in the "Transform predictions to labels" cell
# further down, so that cell must have been run first.
temp_ser = temp_sentiment_df.apply(lambda x: polprob2sentiment(str(x.label), float(x.score)), axis=1)

In [None]:
# Turn raw model outputs into class ids, label strings and winning-class probabilities

sents_preds = sents_predictions.predictions.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)
# Softmax over the last axis, then keep the largest probability per row
sents_scores = (
    np.exp(sents_predictions[0])
    / np.exp(sents_predictions[0]).sum(-1, keepdims=True)
).max(1)

# One row per sentence: text, predicted class id, label and score
temp_sentiment_df = pd.DataFrame(
    list(zip(sents_pred_texts, sents_preds, sents_labels, sents_scores)),
    columns=['text', 'pred', 'label', 'score'],
)
# temp_sentiment_df.head()

# Fold label (Neg/Pos) and score (Prob) into a single +/- sentiment float per sentence
corpus_sents_df[sa_model] = temp_sentiment_df.apply(
    lambda row: polprob2sentiment(row.label, row.score),
    axis=1,
)
corpus_sents_df.head()

In [None]:
# Verify the head and tail are complete and correct

corpus_parags_df.iloc[:3]
corpus_parags_df.iloc[-3:]

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame
# The helper's result is wrapped in a pd.Series and stored under the sa_model column.

# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# pd.Series(...) gets a default 0..n-1 index and column assignment aligns on index labels.
# NOTE(review): assumes corpus_parags_df also has a 0..n-1 index and one value per paragraph — confirm.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
corpus_parags_df.head(2)
corpus_parags_df.tail(2)

### **Histogram Plots**

In [None]:
# Debug
PLOT_OUTPUT = 'Major'

In [None]:
# Create histogram of Sentence Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_sents_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Sentence Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_sents_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

In [None]:
# Create histogram of Paragraph Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_parags_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Paragraph Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_parags_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

### **Mean SMA Plots**

In [None]:
# Simple moving averages (SMA) of the sentence-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_sents_df, sa_model, text_unit='sentence', win_ls=[5, 10, 20])

In [None]:
# Simple moving averages (SMA) of the paragraph-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_parags_df, sa_model, text_unit='paragraph', win_ls=[5, 10, 20])

In [None]:
# Compare Sentence and Paragraph
# Two stacked line plots of a rolling-mean column: sentences on top, paragraphs below.
# NOTE(review): the y column is hard-coded as 'roberta_lg15_mean_roll100' while sa_model is
# 'robertalg15' (no underscore). Presumably the name should follow sa_model — verify which
# column get_smas actually creates, otherwise the plot fails on a missing column.

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y='roberta_lg15_mean_roll100', data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (10% SMA)')
  sns.lineplot(x=corpus_parags_df.index, y='roberta_lg15_mean_roll100', data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (10% SMA)')
  fig.show()

In [None]:
# corpus_sents_df.drop(columns=['y_scaled'], axis=1, inplace=True)

### **Stanford ASAP Plot**


**Stanford ASAP: Automatic Smoothing for Attention Prioritization in Time Series**

* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP.ipynb (Python)
* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP-simple.js
* http://futuredata.stanford.edu/asap/ 
* https://www.datadoghq.com/blog/auto-smoother-asap/

**Save the following plots to gDrive files?**

In [None]:
Save_to_File = False #@param {type:"boolean"}


In [None]:
# Sentence SMA/ASAP Plots with roberta_lg15
# Feeds the sentence sentiment column to smooth_ASAP to obtain a window and slide size,
# then plots the smoothed series. raw_data, window_size, slide_size, asap_x and asap_y
# stay in the kernel and are read by the report cell that follows.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_sents_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Plotting is forced on here (do_plot=True); saving follows the Save_to_File form toggle
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, do_plot=True, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

In [None]:
# Paragraph SMA/ASAP Plots with roberta_lg15
# Same as the sentence ASAP cell but on the paragraph sentiment column. It overwrites
# raw_data, window_size, slide_size, asap_x and asap_y with the paragraph-level values.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_parags_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Unlike the sentence cell, do_plot is not passed here, so plot_asap's default applies
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

### **LOWESS Plots**

**Sentence Sentiment LOWESS Plots**

In [None]:
# Debug

PLOT_OUTPUT='Major'

In [None]:
# LOWESS-smooth the sentence sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./4, 1./6, 1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_sents_df[new_lowess_col] = plot_lowess(
      corpus_sents_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_sents_df[new_lowess_mean_col] = corpus_sents_df[cols_lowess].mean(axis='columns')

corpus_sents_df.head(2)

In [None]:
corpus_parags_df.columns

**Paragraph Sentiment LOWESS Plots**

In [None]:
# LOWESS-smooth the paragraph sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_parags_df[new_lowess_col] = plot_lowess(
      corpus_parags_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_parags_df[new_lowess_mean_col] = corpus_parags_df[cols_lowess].mean(axis='columns')

corpus_parags_df.head(2)

**Compare Sentence and Paragraph Mean LOWESS**

In [None]:
# Compare Sentence and Paragraph LOWESS means

# Calculate the Sentence and Paragraph LOWESS means and plot
# This recomputes (and overwrites) the '<sa_model>_mean_lowess' column that the two
# per-unit LOWESS cells already wrote.

# Get all the calculated LOWESS columns in a list 
# matching_cols is assigned as a side effect and ends up holding the paragraph list.
# NOTE(review): the pattern '_lowess' is not restricted to sa_model; if the frames hold
# LOWESS columns from other models they would presumably be averaged in too — verify
# against get_cols_regex.
cols_sents_lowess = matching_cols = get_cols_regex(corpus_sents_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)
cols_parags_lowess = matching_cols = get_cols_regex(corpus_parags_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)

# Compute the mean for all the LOWESS columns
col_mean_lowess = f'{sa_model}_mean_lowess'
corpus_sents_df[col_mean_lowess] = corpus_sents_df[cols_sents_lowess].mean(axis=1)
corpus_parags_df[col_mean_lowess] = corpus_parags_df[cols_parags_lowess].mean(axis=1)

# Plot the Sentence and Paragraph LOWESS means
# NOTE(review): the titles say "8-12 LOWESS" but the frac lists used in the LOWESS cells
# span 0.05-0.25 — confirm the intended label.
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y=col_mean_lowess, data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (8-12 LOWESS)')
  sns.lineplot(x=corpus_parags_df.index, y=col_mean_lowess, data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (8-12 LOWESS)')
  fig.show()

### **Save Newly Computed Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes the sentence and paragraph DataFrames (now including the LOWESS columns) to
# timestamped CSV files in the current working directory.

# Strip all whitespace from the corpus author/title constants and lower-case them
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp with minute resolution, embedded in both file names
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with LOWESS in file: {sats_filename_str}')

# sats_filename_str is reused: after this cell it holds the paragraph file name
sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with LOWESS in file: {sats_filename_str}')


## **XLNet Sentiment Analysis**

### **Sentiment Analysis**

In [None]:
# Setup for RoBERTa Large English 15datasets: siebert/sentiment-roberta-large-english

sa_model = 'robertalg15'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapping a tokenizer's batched output.

    `tokenized_texts` is a mapping of field name -> per-example sequence and must
    contain an "input_ids" entry, which defines the dataset length.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # Number of examples == number of input_ids rows
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every field to form one example
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example


# Set model
# sa_model is the short key later used as the DataFrame column name for this model's
# scores; model_name is the Hugging Face Hub checkpoint id.
# NOTE(review): this cell sits under the "XLNet Sentiment Analysis" heading but loads the
# RoBERTa checkpoint again under the same 'robertalg15' key, so the cells below would
# overwrite the RoBERTa columns rather than add XLNet results — confirm intent.
sa_model = 'robertalg15'
model_name = "siebert/sentiment-roberta-large-english"

# Load tokenizer and model, create trainer
# The Trainer is built from the model alone; in the cells visible here it is only
# used for trainer.predict(). The trailing ';' suppresses the cell's output.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model);

In [None]:
# Build the list of sentence strings to score

# Work on a copy of the sentence frame; every 'sent_raw' value is cast to str
sents_pred_df = corpus_sents_df.copy()
sents_pred_texts = (
    sents_pred_df['sent_raw']
    .astype('str')
    .tolist()
)  # Want to catch NaN, .dropna().astype('str').tolist()

# Tokenize the whole batch (truncated and padded) and wrap it for trainer.predict()
sents_tokenized_texts = tokenizer(
    sents_pred_texts,
    truncation=True,
    padding=True,
)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# Run predictions

sents_predictions = trainer.predict(sents_pred_dataset)

In [None]:
temp_sentiment_df.head(3)

In [None]:
corpus_sents_df.columns

In [None]:
temp_sentiment_df.head(2)

In [None]:
# Convert each (label, score) row of temp_sentiment_df into a signed sentiment value.
# axis=1 is required: with the default axis=0, DataFrame.apply hands whole columns to
# the lambda, so x.label / x.score raise AttributeError.
# NOTE: temp_sentiment_df is created in the "Transform predictions to labels" cell
# further down, so that cell must have been run first.
temp_ser = temp_sentiment_df.apply(lambda x: polprob2sentiment(str(x.label), float(x.score)), axis=1)

In [None]:
# Turn raw model outputs into class ids, label strings and winning-class probabilities

sents_preds = sents_predictions.predictions.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)
# Softmax over the last axis, then keep the largest probability per row
sents_scores = (
    np.exp(sents_predictions[0])
    / np.exp(sents_predictions[0]).sum(-1, keepdims=True)
).max(1)

# One row per sentence: text, predicted class id, label and score
temp_sentiment_df = pd.DataFrame(
    list(zip(sents_pred_texts, sents_preds, sents_labels, sents_scores)),
    columns=['text', 'pred', 'label', 'score'],
)
# temp_sentiment_df.head()

# Fold label (Neg/Pos) and score (Prob) into a single +/- sentiment float per sentence
corpus_sents_df[sa_model] = temp_sentiment_df.apply(
    lambda row: polprob2sentiment(row.label, row.score),
    axis=1,
)
corpus_sents_df.head()

In [None]:
# Verify the head and tail are complete and correct

corpus_parags_df.iloc[:3]
corpus_parags_df.iloc[-3:]

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame
# The helper's result is wrapped in a pd.Series and stored under the sa_model column.

# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# pd.Series(...) gets a default 0..n-1 index and column assignment aligns on index labels.
# NOTE(review): assumes corpus_parags_df also has a 0..n-1 index and one value per paragraph — confirm.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
corpus_parags_df.head(2)
corpus_parags_df.tail(2)

### **Histogram Plots**

In [None]:
# Debug
PLOT_OUTPUT = 'Major'

In [None]:
# Create histogram of Sentence Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_sents_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Sentence Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_sents_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

In [None]:
# Create histogram of Paragraph Polarities

# Draw the histogram whenever plotting is enabled ('All' or 'Major')
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  sns.histplot(data=corpus_parags_df[sa_model], kde=False).set_title(f'{CORPUS_FULL} \n Histogram of Paragraph Sentiment Values (Model: {sa_model})');

# Only the 'All' setting also writes the figure to disk
if (PLOT_OUTPUT == 'All'):
  # Save graph to file.
  # Must be an f-string so the model key is interpolated into the file name
  # (a plain string would leave the literal text '{sa_model}' in it).
  plot_filename = f'hist_parags_{sa_model}.png'
  # gen_pathfiletime is defined elsewhere; presumably it builds the full output path — verify
  plotpathfilename_str = gen_pathfiletime(plot_filename)
  plt.savefig(plotpathfilename_str, format='png', dpi=300)
  print(f'Plot saved: {plot_filename}');

### **Mean SMA Plots**

In [None]:
# Simple moving averages (SMA) of the sentence-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_sents_df, sa_model, text_unit='sentence', win_ls=[5, 10, 20])

In [None]:
# Simple moving averages (SMA) of the paragraph-level sentiment series

# Signature noted by the author:
# def get_smas(model_name, ts_df, win_ls=[5,10], do_plot=True, save2file=False):
# NOTE(review): the call below passes the DataFrame first, the model key second, and a
# text_unit keyword that the noted signature lacks — confirm against the current get_smas.

if PLOT_OUTPUT in ('All', 'Major'):
  get_smas(corpus_parags_df, sa_model, text_unit='paragraph', win_ls=[5, 10, 20])

In [None]:
# Compare Sentence and Paragraph
# Two stacked line plots of a rolling-mean column: sentences on top, paragraphs below.
# NOTE(review): the y column is hard-coded as 'roberta_lg15_mean_roll100' while sa_model is
# 'robertalg15' (no underscore). Presumably the name should follow sa_model — verify which
# column get_smas actually creates, otherwise the plot fails on a missing column.

if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y='roberta_lg15_mean_roll100', data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (10% SMA)')
  sns.lineplot(x=corpus_parags_df.index, y='roberta_lg15_mean_roll100', data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (10% SMA)')
  fig.show()

In [None]:
# corpus_sents_df.drop(columns=['y_scaled'], axis=1, inplace=True)

### **Stanford ASAP Plot**


**Stanford ASAP: Automatic Smoothing for Attention Prioritization in Time Series**

* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP.ipynb (Python)
* https://github.com/stanford-futuredata/ASAP/blob/master/ASAP-simple.js
* http://futuredata.stanford.edu/asap/ 
* https://www.datadoghq.com/blog/auto-smoother-asap/

**Save the following plots to gDrive files?**

In [None]:
Save_to_File = False #@param {type:"boolean"}


In [None]:
# Sentence SMA/ASAP Plots with roberta_lg15
# Feeds the sentence sentiment column to smooth_ASAP to obtain a window and slide size,
# then plots the smoothed series. raw_data, window_size, slide_size, asap_x and asap_y
# stay in the kernel and are read by the report cell that follows.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_sents_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Plotting is forced on here (do_plot=True); saving follows the Save_to_File form toggle
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, do_plot=True, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

In [None]:
# Paragraph SMA/ASAP Plots with roberta_lg15
# Same as the sentence ASAP cell but on the paragraph sentiment column. It overwrites
# raw_data, window_size, slide_size, asap_x and asap_y with the paragraph-level values.

# raw_data = load_csv('Taxi.csv')
raw_data = list(corpus_parags_df[sa_model])
window_size, slide_size = smooth_ASAP(raw_data, 5, resolution=1000)      # 20210621 Fixed JChun
# window_size, slide_size = smooth_simple(raw_data, resolution=1000)      # 20210621 Fixed JChun
print(f'Window Size: {window_size} and Slide_Size: {slide_size}')
# Unlike the sentence cell, do_plot is not passed here, so plot_asap's default applies
asap_x, asap_y = plot_asap(sa_model,raw_data, window_size, slide_size, save2file=Save_to_File);

# Placeholder: nothing is saved by this branch yet
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  pass
  # Save figure

In [None]:
# Check the automatically computed ASAP values for test SA TS

print(f'Based upon the hyperparamter resolution = 1000:')
print(f'  ASAP [window_size]: {window_size} [slide_size]: {slide_size}')
print(f'--------------------')
print(f'Original Length = {len(raw_data)} vs ASAP Length: {len(asap_x)}')

### **LOWESS Plots**

**Sentence Sentiment LOWESS Plots**

In [None]:
# Debug

PLOT_OUTPUT='Major'

In [None]:
# LOWESS-smooth the sentence sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./4, 1./6, 1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_sents_df[new_lowess_col] = plot_lowess(
      corpus_sents_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_sents_df[new_lowess_mean_col] = corpus_sents_df[cols_lowess].mean(axis='columns')

corpus_sents_df.head(2)

In [None]:
corpus_parags_df.columns

**Paragraph Sentiment LOWESS Plots**

In [None]:
# LOWESS-smooth the paragraph sentiment series at several frac values, then average the fits

lowess_frac_ls = [0.25, 0.2, 0.15, 0.1, 0.075, 0.05] # [1./8, 1./10, 1./12, 1./14, 1./16]
cols_lowess = []

# Plot each fit only when plotting is enabled
aplot = PLOT_OUTPUT in ('All', 'Major')

for afrac in lowess_frac_ls:
  print(f'Calculating LOWESS for frac = {afrac}...')
  # The column name carries frac as a rounded percentage, e.g. 0.25 -> 'frac25'
  afrac_per_str = str(round(100*afrac))
  new_lowess_col = f'{sa_model}_frac{afrac_per_str}_lowess'
  cols_lowess.append(new_lowess_col)
  corpus_parags_df[new_lowess_col] = plot_lowess(
      corpus_parags_df, [sa_model], do_plot=aplot, afrac=afrac
  )

# Row-wise mean over the LOWESS columns created above
new_lowess_mean_col = f'{sa_model}_mean_lowess'
print(f'new_lowess_mean_cols: {new_lowess_mean_col}')
print(f'cols_lowess: {cols_lowess}')
corpus_parags_df[new_lowess_mean_col] = corpus_parags_df[cols_lowess].mean(axis='columns')

corpus_parags_df.head(2)

**Compare Sentence and Paragraph Mean LOWESS**

In [None]:
# Compare Sentence and Paragraph LOWESS means

# Calculate the Sentence and Paragraph LOWESS means and plot
# This recomputes (and overwrites) the '<sa_model>_mean_lowess' column that the two
# per-unit LOWESS cells already wrote.

# Get all the calculated LOWESS columns in a list 
# matching_cols is assigned as a side effect and ends up holding the paragraph list.
# NOTE(review): the pattern '_lowess' is not restricted to sa_model; if the frames hold
# LOWESS columns from other models they would presumably be averaged in too — verify
# against get_cols_regex.
cols_sents_lowess = matching_cols = get_cols_regex(corpus_sents_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)
cols_parags_lowess = matching_cols = get_cols_regex(corpus_parags_df, find_regex = '_lowess', ignore_regex = '_mean_', strict_match=False)

# Compute the mean for all the LOWESS columns
col_mean_lowess = f'{sa_model}_mean_lowess'
corpus_sents_df[col_mean_lowess] = corpus_sents_df[cols_sents_lowess].mean(axis=1)
corpus_parags_df[col_mean_lowess] = corpus_parags_df[cols_parags_lowess].mean(axis=1)

# Plot the Sentence and Paragraph LOWESS means
# NOTE(review): the titles say "8-12 LOWESS" but the frac lists used in the LOWESS cells
# span 0.05-0.25 — confirm the intended label.
if (PLOT_OUTPUT == 'All') | (PLOT_OUTPUT == 'Major'):
  fig, axs = plt.subplots(nrows=2)
  sns.lineplot(x=corpus_sents_df.index, y=col_mean_lowess, data=corpus_sents_df, ax=axs[0]).set_title(f'{CORPUS_FULL}\n Sentence Sentiment (8-12 LOWESS)')
  sns.lineplot(x=corpus_parags_df.index, y=col_mean_lowess, data=corpus_parags_df, ax=axs[1]).set_title(f'{CORPUS_FULL}\n Paragraph Sentiment (8-12 LOWESS)')
  fig.show()

### **Save Newly Computed Sentiment Time Series**

In [None]:
# Save all the calculated Sentiment Values
# Writes the sentence and paragraph DataFrames (now including the LOWESS columns) to
# timestamped CSV files in the current working directory.

# Strip all whitespace from the corpus author/title constants and lower-case them
author_str = ''.join(CORPUS_AUTHOR.split()).lower()
title_str = ''.join(CORPUS_TITLE.split()).lower()
# UTC timestamp with minute resolution, embedded in both file names
datetime_now = datetime.utcnow().strftime("%Y%m%d_%H%M")

sats_filename_str = f'sentiment_sents_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_sents_df.to_csv(sats_filename_str, index=False)
print(f'Saved Sentence Sentiments with LOWESS in file: {sats_filename_str}')

# sats_filename_str is reused: after this cell it holds the paragraph file name
sats_filename_str = f'sentiment_parags_lowess_{author_str}_{title_str}_{datetime_now}.csv'
corpus_parags_df.to_csv(sats_filename_str, index=False)
print(f'Saved Paragraph Sentiments with LOWESS in file: {sats_filename_str}')


## **T5 Sentiment Analysis**

* https://huggingface.co/mrm8488/t5-base-finetuned-imdb-sentiment

### **Sentiment Analysis**

In [None]:
# Load the T5 sentiment checkpoint.
# AutoModelWithLMHead is deprecated in transformers; T5 is an encoder-decoder model,
# so AutoModelForSeq2SeqLM is the supported auto class for this checkpoint.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-imdb-sentiment")

def get_sentiment(text):
  """Generate a sentiment label for `text` with the module-level `tokenizer` and `model`.

  Args:
    text: String to classify. The '</s>' marker is appended before encoding.

  Returns:
    A list of decoded strings, one per sequence returned by `model.generate`.
    Callers index into the list (e.g. `get_sentiment(...)[0]`).
  """
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt')

  # Generation is capped with max_length=2
  # (presumably a start token plus one label token -- confirm against the model card)
  output = model.generate(input_ids=input_ids,
               max_length=2)

  # The unused `label = dec[0]` local was dropped; the full list is what gets returned
  return [tokenizer.decode(ids) for ids in output]

# get_sentiment returns a list with one decoded string per generated sequence;
# a single input text yields a single entry, so read index 0 (index 1 raised IndexError).
return_str = get_sentiment("I like a lot that film")
print(return_str[0])

In [None]:
# Re-run the demo sentence and display the container type of the result
# (get_sentiment returns its list of decoded strings rather than a bare label).
return_str = get_sentiment("I like a lot that film")
type(return_str)

In [None]:
# Setup for RoBERTa Large English 15datasets: siebert/sentiment-roberta-large-english

# Short key for this model: later cells use it as the column name that receives
# this model's sentiment values in corpus_sents_df / corpus_parags_df.
sa_model = 'robertalg15'

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal index-able dataset over a tokenizer's batched output.

    `tokenized_texts` is a mapping from field name to a per-example sequence.
    It must contain an "input_ids" entry, which determines the dataset length.
    """

    def __init__(self, tokenized_texts):
        # Keep a reference to the mapping; nothing is copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One example per row of "input_ids".
        input_id_rows = self.tokenized_texts["input_ids"]
        return len(input_id_rows)

    def __getitem__(self, idx):
        # Assemble one example by taking element `idx` from every field.
        example = {}
        for field_name, field_values in self.tokenized_texts.items():
            example[field_name] = field_values[idx]
        return example


# Set model
# sa_model: short key for this model (NOTE(review): repeats the assignment at the top of this cell)
sa_model = 'robertalg15'
# model_name: checkpoint identifier passed to both from_pretrained() calls below
model_name = "siebert/sentiment-roberta-large-english"

# Load tokenizer and model, create trainer
# NOTE(review): this rebinds the module-level `tokenizer` and `model` that get_sentiment()
# (T5 cell above) reads, so that function no longer sees the T5 objects once this cell has run.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The Trainer is built from the model alone; below it is used for trainer.predict(...)
trainer = Trainer(model=model);

In [None]:
# Prepare Text from DataFrame

# Take the raw sentences from a copy of the sentence DataFrame.
sents_pred_df = corpus_sents_df.copy()

# Cast every entry to str before building the list.
# TODO (original author's note): want to catch NaN, e.g. .dropna().astype('str').tolist()
sents_pred_texts = sents_pred_df['sent_raw'].astype(str).tolist()

# Tokenize the whole list (truncated and padded) and wrap the result so it can be
# handed to trainer.predict(...)
sents_tokenized_texts = tokenizer(
    sents_pred_texts,
    truncation=True,
    padding=True,
)
sents_pred_dataset = SimpleDataset(sents_tokenized_texts)

In [None]:
# Run predictions
#
# Score the prepared sentence dataset with the Trainer; the result's `.predictions`
# is read in a later cell when labels and scores are derived.

sents_predictions = trainer.predict(sents_pred_dataset)

In [None]:
# Peek at the first rows of the label/score DataFrame.
# NOTE(review): within this section temp_sentiment_df is only (re)built in a later cell
# (from sents_predictions), so here it presumably still holds values from an earlier
# part of the notebook -- confirm the intended cell order.
temp_sentiment_df.head(3)

In [None]:
# List the sentence DataFrame's columns (inspection only; the column for this
# model is added in a later cell).
corpus_sents_df.columns

In [None]:
# Inspection only: first two rows of the label/score DataFrame.
# NOTE(review): temp_sentiment_df is rebuilt for this model only in a later cell, so this
# presumably shows values left over from earlier in the notebook -- confirm.
temp_sentiment_df.head(2)

In [None]:
# Row-wise conversion of (label, score) into a signed sentiment value.
# axis=1 hands each row to the lambda so x.label / x.score resolve to that row's cells;
# with the default axis=0 the lambda receives whole columns and the attribute lookups fail.
temp_ser = temp_sentiment_df.apply(lambda x: polprob2sentiment(str(x.label), float(x.score)), axis=1)

In [None]:
# Transform predictions to labels

# Per-class outputs of the Trainer run (the same array the first tuple field, [0], refers to)
sents_logits = sents_predictions.predictions
sents_preds = sents_logits.argmax(-1)
sents_labels = pd.Series(sents_preds).map(model.config.id2label)

# Softmax over the class axis; the winning class probability becomes the score.
# Subtracting each row's maximum before exp() leaves the softmax mathematically unchanged
# but keeps exp() from overflowing on large values, and exp() is evaluated only once.
sents_exp = np.exp(sents_logits - sents_logits.max(-1, keepdims=True))
sents_scores = (sents_exp / sents_exp.sum(-1, keepdims=True)).max(1)

# Create DataFrame with texts, predictions, labels, and scores

temp_sentiment_df = pd.DataFrame(list(zip(sents_pred_texts,sents_preds,sents_labels,sents_scores)), columns=['text','pred','label','score'])

# Convert label (Neg/Pos) and score (Prob) to a +/-Sentiment Float Value

corpus_sents_df[sa_model] = temp_sentiment_df.apply(lambda x: polprob2sentiment(x.label,x.score), axis=1)
corpus_sents_df.head()

In [None]:
# Spot-check both ends of the paragraph DataFrame: the first three and the last three rows

corpus_parags_df.head(3)
corpus_parags_df.tail(3)

In [None]:
# Aggregate Sentence Sentiments to populate Paragraph Sentiment DataFrame

# Signature reminder for the helper called below:
# def sentiment_sents2parags(ts_df, model_name='roberta_lg15'):
parags_sentiment_ls = sentiment_sents2parags(corpus_sents_df, sa_model)
# NOTE(review): assigning a fresh pd.Series aligns on index labels, not on position.
# Presumably corpus_parags_df has a default 0..n-1 index and the helper returns one value
# per paragraph in row order -- confirm, otherwise rows are mismatched or filled with NaN.
corpus_parags_df[sa_model] = pd.Series(parags_sentiment_ls)
# Display the first and last two rows to check the new column
corpus_parags_df.head(2)
corpus_parags_df.tail(2)