# **SentimentArcs (Installation): Run Once to Install**

```
Jon Chun
12 Jun 2021: Started
04 Mar 2022: Last Update
```

* This is the first Notebook you should execute to install the SentimentArcs framework on your Google GDrive.

* This notebook should only be run once to copy the Github repo and create certain files. After the initial install, there is no need to run this again unless you want to completely delete and reinstall SentimentArcs from your GDrive.

# **[STEP 1] Configuration and Setup**



## [INPUT] Connect Google gDrive to this Jupyter Notebook

In [2]:
# [INPUT REQUIRED]: Authorize access to Google gDrive

# Connect this Notebook to your permanent Google Drive
#   so all generated output is saved to permanent storage there

try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  # The google.colab package could not be imported, so treat this as a
  #   non-Colab runtime (only the import failure is caught, not every error)
  IN_COLAB = False

if IN_COLAB:
  print("Attempting to attach your Google gDrive to this Colab Jupyter Notebook")
  drive.mount('/gdrive')
else:
  # Nothing was mounted in this branch, so do not claim the drive is attached
  print("Not running in Google Colab: skipping Google gDrive mount")

Attempting to attach your Google gDrive to this Colab Jupyter Notebook
Mounted at /gdrive


In [3]:
# [CUSTOMIZE]: Change the text after the Unix '%cd ' command below (change directory)
#              to math the full path to your gDrive subdirectory which should be the 
#              root directory cloned from the SentimentArcs github repo.

# NOTE: Make sure this subdirectory already exists and there are 
#       no typos, spaces or illegals characters (e.g. periods) in the full path after %cd

# NOTE: In Python all strings must begin with an upper or lowercase letter, and only
#         letter, number and underscores ('_') characters should appear afterwards.
#         Make sure your full path after %cd obeys this constraint or errors may appear.



# Step #1: Get full path to SentimentArcs subdir on gDrive
# =======
#@markdown **Accept default path on gDrive or Enter new one:**

Path_to_SentimentArcs = "/gdrive/MyDrive/cdh/sentiment_arcs/" #@param ["/gdrive/MyDrive/sentiment_arcs/"] {allow-input: true}

#@markdown (e.g. /gdrive/MyDrive/research/sentiment_arcs/)



# Step #2: Move to Parent directory of Sentiment_Arcs
# =======
parentdir_sentiment_arcs = '/'.join(Path_to_SentimentArcs.split('/')[:-2])
print(f'subdir_parent: {parentdir_sentiment_arcs}')
%cd $parentdir_sentiment_arcs


# Step #3: If project sentiment_arcs subdir does not exist, 
#          clone it from github
# =======
import os

if ~os.path.isdir('sentiment_arcs'):
  # TODO: When public, uncomment to switch to real code
  # !git clone https://github.com/jon-chun/sentiment_arcs.git

  # Test on open access github repo
  !git clone https://github.com/jon-chun/nabokov_palefire.git


# Step #4: Change into sentiment_arcs subdir
# =======
# %cd ./sentiment_arcs
# Test on open acess github repo
%cd ./nabokov_palefire

# Step #5: Confirm contents of sentiment_arcs subdir
# =======
!pwd
!ls

# TODO: Correct when switched to live
%cd $Path_to_SentimentArcs
!pwd

subdir_parent: /gdrive/MyDrive/cdh
/gdrive/MyDrive/cdh
fatal: destination path 'nabokov_palefire' already exists and is not an empty directory.
/gdrive/MyDrive/cdh/nabokov_palefire
/gdrive/MyDrive/cdh/nabokov_palefire
Foreword_Text.txt  palefire_clean_parts  Poem.txt  README.md
/gdrive/MyDrive/cdh/sentiment_arcs
/gdrive/MyDrive/cdh/sentiment_arcs


## Configure Jupyter Notebook

In [4]:
# Configure Jupyter

import logging
import warnings

from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Image, display
from ipywidgets import interactive, widgets

# Silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Display the value of every top-level expression in a cell, not just the last
InteractiveShell.ast_node_interactivity = "all"

# Timestamped INFO-level logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Clone SentimentArcs from Github

In [None]:
# Move to the PARENT directory of the SentimentArcs root (parentdir_sentiment_arcs
#   is set in the configuration cell above) and print it, so the git clone in
#   the next cell is created there

os.chdir(parentdir_sentiment_arcs)
!pwd

/gdrive/MyDrive/cdh/sentiment_arcs


In [None]:
!git clone https://github.com/jon-chun/sentiment_arcs....

In [None]:
# Verify in SentimentArcs Root Directory

os.chdir('/gdrive/MyDrive/cdh/sentiment_arcs/')
!pwd

# Download Lexicons and Datafiles

## Public Internet Files (wget)

In [None]:
!wget https://drive.google.com/open?id=1wVN-TYx53pbTEnkHzcpMsm9SADvowUx3

## Google GDrive Public Files

In [None]:
!gdown --id 1wVN-TYx53pbTEnkHzcpMsm9SADvowUx3

## Kaggle API Files

In [None]:
!mkdir .kaggle

In [None]:
# Upload kaggle.json account credentials

from google.colab import files

uploaded = files.upload()

# Report the name and size of every file returned by the upload call
for fn, contents in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents)))

In [None]:
!mv kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
kaggle datasets download -d yelp-dataset/yelp-dataset

# Global Variables and Constants

## get_globals.py

In [35]:
%%writefile ./utils/get_globals.py

# Minimum paragraph and sentence lengths for data cleaning:
#   any parag/sent less than these mins will be ignored/blanked
MIN_PARAG_LEN = 10
MIN_SENT_LEN = 3

# Words to add to / delete from the default English stopword list
STOPWORDS_ADD_EN = ['a', 'the', 'an']
STOPWORDS_DEL_EN = ['jimmy', 'dean']

# Main Dictionary holding all Lexicons by Name/Key (starts empty)
lexicons_dt = dict()

# Test WORDS of Sentiment Analysis
test_words_ls = [
    "Love", "Hate",
    "bizarre", "strange",
    "furious", "elated",
    "curious", "beserk",
    "gambaro",
]

# Test SENTENCES of Sentiment Analysis
test_sentences_ls = [
    "I hate bad evil worthless Mondays.",
    "I love Paris in the springtime",
    "It was Wednesday.",
    "You are a disgusting pig - I hate you.",
    "What a delightfully funny and beautiful good man.",
    "That was it",
]


# Abbreviation / Slang
# https://www.kaggle.com/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing/notebook

# Lookup table: slang/abbreviation token -> expanded text.
# NOTE: every key must be lowercase. expand_slang() lowercases each token before
#       the lookup, so a key containing uppercase letters can never match.
SLANG_DT = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",  # was "IG": unreachable, lookups are lowercased
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

Overwriting ./utils/get_globals.py


## Create: get_subdirs.py

In [None]:
%%writefile ./utils/get_subdirs.py

def get_subdirs(Corpus_Genre, Corpus_Type, NotebookModels):
  '''
  Set the global SUBDIR_* path constants and FNAME_SENTIMENT_RAW.

  Corpus_Genre, Corpus_Type: strings interpolated into the subdirectory names
                             and the raw-sentiment filename
  NotebookModels: one of 'syuzhetr2sentimentr', 'lex2ml', 'dnn2transformers'
                  or 'none'; any other value prints an error and returns
                  before any global is assigned

  Prints every resulting value for visual verification. Returns None.
  '''

  global FNAME_SENTIMENT_RAW
  global DIR_ROOT
  global SUBDIR_TEXT_RAW
  global SUBDIR_TEXT_CLEAN
  global SUBDIR_SENTIMENT_RAW
  global SUBDIR_SENTIMENT_CLEAN
  global SUBDIR_PLOTS
  global SUBDIR_DATA
  global SUBDIR_UTILS

  # The three model notebooks share one filename pattern; 'none' has a placeholder
  if NotebookModels == 'none':
    FNAME_SENTIMENT_RAW = '[NONE]'
  elif NotebookModels in ('syuzhetr2sentimentr', 'lex2ml', 'dnn2transformers'):
    FNAME_SENTIMENT_RAW = f'sentiment_raw_{Corpus_Genre}_{Corpus_Type}_{NotebookModels}.json'
  else:
    print(f'ERROR: Illegal value for NotebookModels: {NotebookModels}')
    return

  # Corpus-specific subdirectories
  SUBDIR_TEXT_RAW = f"./text_raw/{Corpus_Genre}_text_{Corpus_Type}_raw/"
  SUBDIR_TEXT_CLEAN = f"./text_clean/{Corpus_Genre}_text_{Corpus_Type}_clean/"
  SUBDIR_SENTIMENT_RAW = f"./sentiment_raw/{Corpus_Genre}_sentiment_{Corpus_Type}_raw/"
  SUBDIR_SENTIMENT_CLEAN = f"./sentiment_clean/{Corpus_Genre}_sentiment_{Corpus_Type}_clean/"

  # Subdirectories shared by every corpus
  SUBDIR_PLOTS = "./plots/"
  SUBDIR_DATA = "./data/"
  SUBDIR_UTILS = "./utils/"

  # Verify Directory Structure: one print() per entry, in this order
  report_ls = [
    'Verify the Directory Structure:\n',
    '-------------------------------\n',
    f'          [Corpus Genre]: {Corpus_Genre}\n',
    f'           [Corpus Type]: {Corpus_Type}\n\n',
    f'   [FNAME_SENTIMENT_RAW]: {FNAME_SENTIMENT_RAW}\n\n',
    f'       [SUBDIR_TEXT_RAW]: {SUBDIR_TEXT_RAW}\n',
    f'     [SUBDIR_TEXT_CLEAN]: {SUBDIR_TEXT_CLEAN}\n',
    f'  [SUBDIR_SENTIMENT_RAW]: {SUBDIR_SENTIMENT_RAW}\n',
    f'[SUBDIR_SENTIMENT_CLEAN]: {SUBDIR_SENTIMENT_CLEAN}\n',
    f'          [SUBDIR_PLOTS]: {SUBDIR_PLOTS}\n',
    f'           [SUBDIR_DATA]: {SUBDIR_DATA}\n',
    f'          [SUBDIR_UTILS]: {SUBDIR_UTILS}\n',
  ]
  for aline in report_ls:
    print(aline)

  return

Writing ./utils/get_subdirs.py


## Create: read_yaml.py

In [None]:
%%writefile ./utils/read_yaml.py

def read_yaml(Corpus_Genre, Corpus_Type):
  '''
  Load the SentimentArcs YAML config files into two globals.

  Corpus_Genre: 'novels', 'finance' or 'social_media'; any other value prints
                an error and leaves corpus_titles_dt untouched
  Corpus_Type: 'new' selects the *_new_info.yaml file; every other value
               selects the *_ref_info.yaml file

  Side effects:
    models_titles_dt <- parsed ./config/models_ref_info.yaml (always read first)
    corpus_titles_dt <- parsed ./config/<genre>_<new|ref>_info.yaml
  A yaml.YAMLError is printed and the corresponding global is left unchanged.
  Returns None.

  NOTE: relies on a module-level `yaml` name supplied by the caller's namespace.
  '''

  global models_titles_dt
  global corpus_titles_dt

  def _load_yaml(yaml_path):
    # Parse a single YAML file; parse errors propagate to the caller
    with open(yaml_path, "r") as stream:
      return yaml.safe_load(stream)

  # Models in the SentimentArcs Ensemble
  try:
    models_titles_dt = _load_yaml("./config/models_ref_info.yaml")
  except yaml.YAMLError as exc:
    print(exc)

  # Corpus_Genre value -> filename prefix used under ./config/
  genre_prefix_dt = {'novels': 'novels',
                     'finance': 'finance',
                     'social_media': 'social'}

  if Corpus_Genre not in genre_prefix_dt:
    # Report the offending genre (the message used to print Corpus_Type here)
    print(f"ERROR: Illegal Corpus_Genre: {Corpus_Genre}\n")
    return

  # Anything other than 'new' falls back to the reference corpus
  corpus_kind = 'new' if Corpus_Type == 'new' else 'ref'
  corpus_yaml = f"./config/{genre_prefix_dt[Corpus_Genre]}_{corpus_kind}_info.yaml"

  try:
    corpus_titles_dt = _load_yaml(corpus_yaml)
  except yaml.YAMLError as exc:
    print(exc)

  return

## Map Col/Model Names

In [None]:
# Mapping to standardize col/model names: (original name, standardized name)
cols_map_dt = dict([
    ('syuzhet', 'syuzhetr'),
    ('huliu', 'bing_sentimentr'),
    ('sentiword', 'sentiword_sentimentr'),
    ('senticnet', 'senticnet_sentimentr'),
    ('lmcd', 'lmcd_sentimentr'),
    ('jockers', 'jockers_sentimentr'),
    ('jockers_rinker', 'jockersrinker_sentimentr'),
])

# Standardized names listed as missing
cols_missing_ls = ['nrc_sentimentr']

## Configure Matplotlib and Seaborn

In [None]:
# Show the current working directory: the %%writefile cells below use paths
#   relative to it (./utils/...)
!pwd

/content


In [None]:
%%writefile ./utils/config_matplotlib.py

def config_matplotlib():
  '''
  Set configuration params for Matplotlib

  Installs a combined color + linestyle property cycle, sets font/figure sizes
  in plt.rcParams and applies the seaborn 'whitegrid' style sheet.
  Prints the new figure size. Returns None.

  NOTE: relies on a module-level `plt` (matplotlib.pyplot) supplied by the
        caller's namespace.
  '''

  global plt

  # Ten colors paired one-to-one with ten linestyles
  colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
  linestyles = ['-', '--', ':', '-.','-', '--', ':', '-.','-', '--']

  # Adding two cyclers advances color and linestyle together
  cycle = plt.cycler("color", colors) + plt.cycler("linestyle", linestyles)

  print('\n\n')

  # Update matplotlib configuration
  plt.rcParams.update({'axes.prop_cycle': cycle})

  plt.rcParams["axes.titlesize"] = 16
  plt.rcParams["legend.fontsize"] = 10
  plt.rcParams["xtick.labelsize"] = 12
  plt.rcParams["ytick.labelsize"] = 12
  plt.rcParams["axes.labelsize"] = 12
  plt.rcParams["figure.titlesize"] = 32

  # Set matplotlib plot figure.figsize (once; it used to be assigned twice)
  new_plt_size = (20, 10)
  plt.rcParams["figure.figsize"] = new_plt_size
  print(" New figure size: ",new_plt_size)

  # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
  #   later releases dropped the old names: prefer the new name when this
  #   installation offers it, else fall back to the legacy one
  style_name = 'seaborn-whitegrid'
  if 'seaborn-v0_8-whitegrid' in plt.style.available:
    style_name = 'seaborn-v0_8-whitegrid'
  plt.style.use(style_name)

  return

In [None]:
%%writefile ./utils/config_seaborn.py

def config_seaborn():
  '''
  Set configuration params for Seaborn

  Applies, in order: theme 'paper', context 'paper', style 'white' and
  palette 'tab10'. Returns None.

  NOTE: relies on a module-level `sns` (seaborn) supplied by the caller's
        namespace.
  '''

  global sns

  # Query the current axes style (the result is not used)
  sns.axes_style()
  print('\n\n')

  # (setter name, value) pairs, applied in order
  #   context options: paper, notebook, talk, poster
  #   style options:   darkgrid, whitegrid, dark, white, and ticks
  #   tab10:           High-Contrast Palette, Vision Impaired Friendly
  settings_ls = (
    ('set_theme', 'paper'),
    ('set_context', 'paper'),
    ('set_style', 'white'),
    ('set_palette', 'tab10'),
  )
  for setter_name, setting in settings_ls:
    getattr(sns, setter_name)(setting)

  return

Overwriting ./utils/config_seaborn.py


## Create: get_model_families.py

In [None]:
%%writefile ./utils/get_model_families.py

def get_model_famalies(models_titles_dt):
  '''
  Given a Dict of Model Titles
  Return a Dict of lists (one list per model family, populated with the
  corresponding models)

  models_titles_dt: dict whose values are indexable, with the model title at
                    position [0] and the family tag at position [1]
                    ('lexicon', 'heuristic', 'tradml', 'dnn' or 'transformer')

  Returns: dict with keys 'lexicon', 'heuristic', 'ml', 'dnn', 'transformer'
           (the 'tradml' tag is stored under the key 'ml'), each holding the
           list of matching titles in models_titles_dt order.
  Prints a per-family listing followed by the family and model totals.
  '''

  # (key in the returned dict, tag stored in models_titles_dt, printed label)
  families_ls = [
    ('lexicon', 'lexicon', 'Lexicon'),
    ('heuristic', 'heuristic', 'Heuristic'),
    ('ml', 'tradml', 'Traditional ML'),
    ('dnn', 'dnn', 'DNN'),
    ('transformer', 'transformer', 'Transformer'),
  ]

  models_ensemble_dt = {}
  for family_key, family_tag, family_label in families_ls:
    family_models_ls = [x[0] for x in models_titles_dt.values() if x[1] == family_tag]
    print(f'\nThere are {len(family_models_ls)} {family_label} Models')
    for i, amodel in enumerate(family_models_ls):
      print(f'  {family_label} Model #{i}: {amodel}')
    models_ensemble_dt[family_key] = family_models_ls

  # Summary: the keys are FAMILIES, so report them as such ...
  print(f'\nThere are {len(models_ensemble_dt)} Model Families:')
  for i, afamily in enumerate(models_ensemble_dt):
    print(f'  Family #{i:>2}: {afamily} ({len(models_ensemble_dt[afamily])} models)')

  # ... and count the models themselves for the total (this line used to
  #   print the number of families as the number of models)
  models_total = sum(len(x) for x in models_ensemble_dt.values())
  print(f'\nThere are {models_total} Total Models (+1 for Ensemble Mean)')

  return models_ensemble_dt


# Correctly spelled alias; the original (misspelled) name is kept for existing callers
get_model_families = get_model_famalies


Writing ./utils/get_model_families.py


FileNotFoundError: ignored

# Utility Functions

## Files

### file_utils.py

In [36]:
%%writefile ./utils/file_utils.py

def get_fullpath(text_title_str, ftype='data_clean', fig_no='', first_note = '',last_note='', plot_ext='png', no_date=False):
  '''
  Given a required file_type(ftype:['data_clean','data_raw','plot','crux_text']) and
    optional fig_no: figure number inserted after the 'plot_' prefix (ftype='plot' only)
            first_note: str inserted after Title and before (optional) SMA/Standardization info;
                        it is normalized first (whitespace and periods -> '_', lowercased)
            last_note: str inserted after (optional) SMA/Standardization info and before (optional) timedate stamp
            plot_ext: change default *.png extension of plot file
            no_date: don't add trailing datetime stamp to filename
  Generate and return a fullpath (/subdir/filename.ext) to save file to

  Reads these globals, depending on ftype:
    'data_clean': SUBDIR_SENTIMENT_CLEAN, Model_Standardization_Method, Window_Percent
    'data_raw'  : SUBDIR_SENTIMENT_RAW
    'plot'      : SUBDIR_SENTIMENT_PLOTS
    'crux_text' : SUBDIR_SENTIMENT_CRUXES
  and `datetime` (the class) unless no_date is True.
  NOTE(review): get_subdirs() in this notebook sets SUBDIR_PLOTS, not
    SUBDIR_SENTIMENT_PLOTS / SUBDIR_SENTIMENT_CRUXES - confirm where those are defined.

  For an illegal ftype an error is printed and the string 'ERROR: ftype:[<ftype>]'
  is returned.
  '''

  # Get current datetime stamp as a string
  if no_date:
    date_dt = ''
  else:
    date_dt = f'_{datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")}'

  # Clean optional file notation if passed in, then append it to the title
  if first_note:
    fnote_str = first_note.replace(' ', '_')
    fnote_str = '_'.join(fnote_str.split())
    fnote_str = '_'.join(fnote_str.split('.'))
    fnote_str = '_'.join(fnote_str.split('__'))
    fnote_str = fnote_str.lower()
    # Use the CLEANED note (the raw first_note used to be appended here,
    #   which left the cleaning above without any effect)
    text_title_str = f'{text_title_str}_{fnote_str}'

  # Optional trailing note, shared by every file type
  last_str = f'_{last_note}' if last_note else ''

  # Option (a): Cleaned Model Data (Smoothed then Standardized)
  if ftype == 'data_clean':
    fname_str = f'{SUBDIR_SENTIMENT_CLEAN}sa_clean_{text_title_str}_{Model_Standardization_Method.lower()}_sma{Window_Percent}'
    fext = 'csv'

  # Option (b): Raw Model Data
  elif ftype == 'data_raw':
    fname_str = f'{SUBDIR_SENTIMENT_RAW}sa_raw_{text_title_str}'
    fext = 'csv'

  # Option (c): Plot Figure
  elif ftype == 'plot':
    fprefix = f'plot_{fig_no}_' if fig_no else 'plot_'
    fname_str = f'{SUBDIR_SENTIMENT_PLOTS}{fprefix}{text_title_str}'
    fext = plot_ext

  # Option (d): Crux Text
  elif ftype == 'crux_text':
    fname_str = f'{SUBDIR_SENTIMENT_CRUXES}crux_{text_title_str}'
    fext = 'txt'

  else:
    print(f'ERROR: In get_fullpath() with illegal arg ftype:[{ftype}]')
    return f'ERROR: ftype:[{ftype}]'

  return f'{fname_str}{last_str}{date_dt}.{fext}'

# --------------------------------------------------
def textfile2df(fullpath_str):
  '''
  Read the *.txt file at fullpath_str, split its content into Sentences with
  text_str2sents() and return a DataFrame with a single 'text_raw' column
  holding one Sentence per row.
  '''

  with open(fullpath_str,'r') as text_fp:
    raw_text_str = text_fp.read()

  # One row per Sentence returned by the tokenizer
  return pd.DataFrame({'text_raw': pd.Series(text_str2sents(raw_text_str))})

# --------------------------------------------------
# NOTE: SentimentArcs Main datastructure is a Dictionary(Corpus) of DataFrames(Documents: rows=sentences, cols=sentiment, 1 col per model in ensemble)
#       This complex data structure has 2 special I/O utility functions to read/write to permanent disk storage as *.json files

# Utility functions to read/write nested Dictionary (key=novel) of DataFrames (Cols = Model Sentiment Series) 

def write_dict_dfs(adict, out_file='sentiments.json', out_dir=None):
  '''
  Given a Dictionary of DataFrames and optional output filename and output directory
  Write as nested json file

  adict: dict mapping a key to a DataFrame; each DataFrame is stored as a list
         of row records
  out_file: filename appended to out_dir
  out_dir: directory prefix, expected to end with a path separator because it
           is concatenated directly with out_file. Defaults to the CURRENT
           value of the global SUBDIR_SENTIMENT_RAW, resolved at call time.

  Prints the destination path. Returns None.
  '''

  # Resolve the default here rather than in the signature: a signature default
  #   is evaluated once at definition time, which fails if SUBDIR_SENTIMENT_RAW
  #   is not defined yet and goes stale if it is changed afterwards
  if out_dir is None:
    out_dir = SUBDIR_SENTIMENT_RAW

  # convert dataframes into lists of row dictionaries
  data_dict = {
      key: adf.to_dict(orient='records')
      for key, adf in adict.items()
  }

  # write to disk
  out_fullpath = f'{out_dir}{out_file}'
  print(f'Saving file to: {out_fullpath}')
  with open(out_fullpath, 'w') as fp:
    json.dump(
      data_dict,
      fp,
      indent=4,
      sort_keys=True
    )

  return

def read_dict_dfs(in_file='sentiments.json', in_dir=None):
  '''
  Given an optional input filename and input directory
  Read nested json file into Dictionary of DataFrames

  in_file: filename appended to in_dir
  in_dir: directory prefix, expected to end with a path separator because it
          is concatenated directly with in_file. Defaults to the CURRENT value
          of the global SUBDIR_SENTIMENT_RAW, resolved at call time.

  Returns: dict mapping each top-level json key to a DataFrame built from the
           value stored under that key.
  '''

  # Resolve the default here rather than in the signature: a signature default
  #   is evaluated once at definition time, which fails if SUBDIR_SENTIMENT_RAW
  #   is not defined yet and goes stale if it is changed afterwards
  if in_dir is None:
    in_dir = SUBDIR_SENTIMENT_RAW

  # read from disk
  in_fullpath = f'{in_dir}{in_file}'
  with open(in_fullpath, 'r') as fp:
    data_dict = json.load(fp)

  # convert each stored value back into a dataframe
  all_dt = {
      key: pd.DataFrame(records)
      for key, records in data_dict.items()
  }

  return all_dt

# --------------------------------------------------


# --------------------------------------------------


# --------------------------------------------------

Overwriting ./utils/file_utils.py


In [None]:
%%writefile ./utils/file_utils.py

def textfile2df(fullpath_str):
  '''
  Given a full path to a *.txt file
  Return a DataFrame with one Sentence per row
  '''

  textfile_df = pd.DataFrame()

  with open(fullpath_str,'r') as fp:
    content_str = fp.read() # .replace('\n',' ')

  sents_ls = text_str2sents(content_str)

  textfile_df['text_raw'] = pd.Series(sents_ls)

  return textfile_df

## Text Cleaning

### text_cleaners.py

In [None]:
%%writefile ./utils/text_cleaners.py

def text2lemmas(comment, lowercase, remove_stopwords):
    '''
    Return the lemmas of `comment` joined by single spaces.

    comment: text string passed to the global `nlp` pipeline
    lowercase: if truthy, lowercase the text before it is parsed
    remove_stopwords: if truthy, drop lemmas found in the global stopwords_ls

    Lemmas that are empty after stripping whitespace are always dropped.
    '''
    doc = nlp(comment.lower() if lowercase else comment)

    kept_lemmas_ls = []
    for token in doc:
        lemma = token.lemma_.strip()
        if not lemma:
            continue
        if remove_stopwords and lemma in stopwords_ls:
            continue
        kept_lemmas_ls.append(lemma)

    return " ".join(kept_lemmas_ls)

# --------------------------------------------------
def text_str2sents(text_str, pysbd_only=False):
  '''
  Split a long text string (e.g. a novel) into a list of Sentences.

  Steps:
    (a) split into Paragraphs on 2+ newlines, strip them, remove characters
        outside string.printable and drop Paragraphs shorter than MIN_PARAG_LEN
    (b) split every Paragraph into Sentences with the SpaCy+PySBD pipeline,
        keeping Sentences longer than MIN_SENT_LEN
    (c) unless pysbd_only == True, re-split every Sentence with sent_tokenize
        and apply the same length filter again

  Progress messages are printed along the way. Returns the list of Sentences.
  '''

  from pysbd.utils import PySBDFactory
  nlp = spacy.blank('en')
  nlp.add_pipe(PySBDFactory(nlp))

  print(f'BEFORE stripping out headings len: {len(text_str)}')

  # (a) Paragraphs: strip, remove non-printing characters, drop the short ones
  nonprinting_pattern = f'[^{re.escape(string.printable)}]'
  parags_ls = []
  for raw_parag in re.split(r'[\n]{2,}', text_str):
    aparag = re.sub(nonprinting_pattern, '', raw_parag.strip())
    if len(aparag.strip()) >= MIN_PARAG_LEN:
      parags_ls.append(aparag)

  print(f'   Parag count before processing sents: {len(parags_ls)}')

  # (b) FIRST PASS at Sentence Tokenization with PySBD
  sents_ls = []
  for parag_no, aparag in enumerate(parags_ls):
    # Join the lines of the Paragraph before tokenizing it
    doc = nlp(re.sub('[\n]{1,}', ' ', aparag))
    found_sents_ls = list(doc.sents)
    print(f'pysbd found {len(found_sents_ls)} Sentences in Paragraph #{parag_no}')

    # Strip whitespace, then filter out empty/short Sentences
    kept_sents_ls = [s for s in (str(x).strip() for x in found_sents_ls)
                     if len(s.strip()) > MIN_SENT_LEN]
    print(f'      {len(kept_sents_ls)} Sentences remain after cleaning')

    sents_ls.extend(kept_sents_ls)

  # (c) (OPTIONAL) SECOND PASS at Sentence Tokenization with NLTK
  if not (pysbd_only == True):
    first_pass_ls = deepcopy(sents_ls)
    sents_ls = []
    for asent in first_pass_ls:
      print(f'Processing asent: {asent}')
      for sub_sent in sent_tokenize(asent):
        sub_sent = str(sub_sent).strip()
        if len(sub_sent.strip()) > MIN_SENT_LEN:
          sents_ls.append(sub_sent)

  print(f'About to return sents_ls with len = {len(sents_ls)}')

  return sents_ls

# --------------------------------------------------
def textfile2df(fullpath_str):
  '''
  Load a *.txt file and return its Sentences as a DataFrame.

  The file content is tokenized by text_str2sents(); the result is a DataFrame
  with one column, 'text_raw', containing one Sentence per row.
  '''

  with open(fullpath_str,'r') as text_fp:
    file_text_str = text_fp.read()

  sents_ser = pd.Series(text_str2sents(file_text_str))

  return pd.DataFrame({'text_raw': sents_ser})

# --------------------------------------------------
def emojis2text(atext):
  '''
  Replace every key of the global UNICODE_EMOJI found in atext with its text
  description (commas removed, whitespace collapsed), then turn underscores
  into spaces and delete colons across the whole string.
  '''
  for emoji_char in UNICODE_EMOJI:
    emoji_desc = UNICODE_EMOJI[emoji_char].replace(",", "")
    atext = atext.replace(emoji_char, ' '.join(emoji_desc.split()))

  return atext.replace('_', ' ').replace(':','')

# --------------------------------------------------
def all_emos2text(atext):
  '''
  Given a text string with embedded emojis and/or emoticons
  Return an expanded text string in which every emoticon (EMOTICONS_EMO) and
  every emoji (UNICODE_EMOJI) is replaced by its text description, ':name:'
  wrappers are removed, underscores become spaces and whitespace is collapsed
  '''

  # Emoticons are expanded first, then emojis. Each match is replaced by a
  # leading space plus its description with commas turned into spaces.
  for symbol_dt in (EMOTICONS_EMO, UNICODE_EMOJI):
    for symbol, description in symbol_dt.items():
      description_words = description.replace(",", " ").split()
      atext = atext.replace(symbol, ' ' + ' '.join(description_words))

  # Drop the colons that wrap ':short_code:' style names
  atext = re.sub(r':([A-Za-z_]*):',r'\1',atext)

  # Underscores to spaces
  atext = re.sub(r'_', ' ', atext)

  # Collapse every run of whitespace into a single space and trim both ends
  return ' '.join(atext.split())

# --------------------------------------------------
def expand_slang(astring):
  '''
  Given a text string
  Return the string with every whitespace-delimited word lowercased and, where
  the lowercased word is a key of SLANG_DT, replaced by its SLANG_DT expansion
  (words are re-joined with single spaces)
  '''

  # Resolve the lookup table up front, before any word is processed
  slang_lookup = SLANG_DT
  slang_lookup.keys()

  lowered_words = (word.lower() for word in astring.split())

  # Unknown words fall through unchanged (but lowercased)
  return ' '.join(slang_lookup.get(word, word) for word in lowered_words)

# --------------------------------------------------
def clean_text(text_df, text_col, text_type='formal'):
  '''
  Given a DataFrame with a Text Column of raw text of type (formal, informal, tweet)
  Return a Series of clean texts

  text_df:   DataFrame holding the raw text (it is not modified)
  text_col:  name of the column in text_df that holds the raw text
  text_type: 'informal' and 'tweet' get three extra steps (URL removal,
             emoticon/emoji expansion, slang expansion); any other value is
             processed like 'formal'

  Every text type then has contractions expanded and is run through
  hero.clean() with the module-level def_pipeline (defined outside this
  function).
  '''

  text_clean_ser = text_df[text_col]

  # Extra processing steps for 'informal' and 'tweet' types of text
  if text_type in ['informal', 'tweet']:

    # Remove URLs
    text_clean_ser = hero.remove_urls(text_clean_ser)

    # Emoticons and then Emojis to Text
    text_clean_ser = text_clean_ser.apply(all_emos2text)

    # Expand Slang/Abbr
    text_clean_ser = text_clean_ser.apply(expand_slang)

  # Expand Contractions
  text_clean_ser = text_clean_ser.apply(contractions.fix)

  # Clean text with the texthero steps listed in def_pipeline
  text_clean_ser = hero.clean(text_clean_ser, pipeline = def_pipeline)

  return text_clean_ser

# --------------------------------------------------
def lemma_pipe(texts):
  '''
  Given an iterable of text strings
  Return a list with one string per input text, in which every token is replaced
  by its lowercased lemma and the tokens are joined with single spaces
  Uses the SpaCy nlp.pipe() batch interface for speed
  '''
  # https://prrao87.github.io/blog/spacy/nlp/performance/2020/05/02/spacy-multiprocess.html

  def _token2text(tok):
    # '-PRON-' is a placeholder lemma; keep the token's original text instead
    surface = tok.orth_ if tok.lemma_ == '-PRON-' else tok.lemma_
    return str(surface).lower()

  docs = nlp.pipe(texts, batch_size=200, n_process=3, disable=["tagger", "parser", "ner"])

  return [' '.join([_token2text(tok) for tok in doc]) for doc in docs]

# --------------------------------------------------

# --------------------------------------------------

Overwriting ./utils/text_cleaners.py


In [None]:
  def text_str2sents(text_str, pysbd_only=False):
    '''
    Given a long text string (e.g. a novel) and pysbd_only flag
    Return a list of Sentence strings produced by
      (a) splitting the text into Paragraphs on runs of 2+ newlines,
      (b) splitting each Paragraph with a blank SpaCy pipeline + PySBD, and
      (c) unless pysbd_only is True, re-splitting each Sentence with NLTK sent_tokenize

    Paragraphs shorter than MIN_PARAG_LEN and Sentences whose length is not
    greater than MIN_SENT_LEN are dropped (both constants are defined outside
    this function). Progress is reported with print().
    '''

    from pysbd.utils import PySBDFactory

    # Blank English pipeline whose only added component is the PySBD splitter
    nlp = spacy.blank('en')
    nlp.add_pipe(PySBDFactory(nlp))

    print(f'BEFORE stripping out headings len: {len(text_str)}')

    # Paragraphs: split on 2+ newlines, trim, strip out non-printing
    # characters, then drop Paragraphs that are too short
    nonprinting_pat = f'[^{re.escape(string.printable)}]'
    parags_ls = []
    for raw_parag in re.split(r'[\n]{2,}', text_str):
      parag = re.sub(nonprinting_pat, '', raw_parag.strip())
      if len(parag.strip()) >= MIN_PARAG_LEN:
        parags_ls.append(parag)

    print(f'   Parag count before processing sents: {len(parags_ls)}')

    # FIRST PASS at Sentence Tokenization with PySBD
    sents_ls = []
    for parag_no, parag in enumerate(parags_ls):

      # Newlines inside a Paragraph become spaces before segmentation
      doc = nlp(re.sub('[\n]{1,}', ' ', parag))
      found_ls = list(doc.sents)
      print(f'pysbd found {len(found_ls)} Sentences in Paragraph #{parag_no}')

      # Convert to stripped strings and keep only Sentences that are long enough
      stripped_gen = (str(x).strip() for x in found_ls)
      kept_ls = [x for x in stripped_gen if len(x.strip()) > MIN_SENT_LEN]

      print(f'      {len(kept_ls)} Sentences remain after cleaning')

      sents_ls += kept_ls

    # (OPTIONAL) SECOND PASS at Sentence Tokenization with NLTK,
    # skipped only when pysbd_only == True
    if not (pysbd_only == True):
      first_pass_ls = deepcopy(sents_ls)
      sents_ls = []
      for asent in first_pass_ls:
        print(f'Processing asent: {asent}')

        # Re-split, strip and length-filter exactly as in the first pass
        resplit_gen = (str(x).strip() for x in sent_tokenize(asent))
        sents_ls += [x for x in resplit_gen if len(x.strip()) > MIN_SENT_LEN]

    print(f'About to return sents_ls with len = {len(sents_ls)}')

    return sents_ls

## Sentiment Analysis

### get_sentiments.py

In [8]:
%%writefile ./utils/get_sentiments.py


def get_lexsent_sentiment(asent_str, lexicon_dt):
  '''
  Given a Sentence (converted with str()) and a Lexicon Dictionary word -> sentiment
  Return the Sentiment of the Sentence = Sum(Sentiment(all words found in the Lexicon))
  Words are the whitespace-delimited tokens of the Sentence, looked up as-is;
  the result is the integer 0 when no word is found in the Lexicon
  '''

  total_sentiment = 0

  for token in str(asent_str).split():
    token_score = lexicon_dt.get(token)
    if token_score is None:
      # Word is not in the Lexicon: contributes nothing
      continue
    total_sentiment += float(token_score)

  return total_sentiment

# --------------------------------------------------
# Disabled code: everything between the triple double-quotes below is a bare
# string-literal expression, so lexicon_sentiment() is never defined and the
# test underneath it is never run when this module is loaded.
"""
def lexicon_sentiment(lexicon_dt, text_str):
  '''
  Given a lexicon dict[word]=sentiment and a string
  Return a sentiment ('pos'|'neg') and a polarity (-1.0 to 1.0)
  '''

  word_ls = text_str.split()
  text_polarity = 0

  for aword in word_ls:
    word_sentiment = lexicon_dt.get(aword)
    if word_sentiment != None: #lexicon_dt.get(aword) != None:
      # print(f'Word: {aword} Polarity: {word_sentiment}')
      text_polarity += word_sentiment # lexicon_dt[aword]

  if text_polarity > 0.0:
    text_sentiment = 'pos'
  else:
    text_sentiment = 'neg'
  
  # Return tuple of polarity ('positive'|'negative') and sentiment float value (-1.0 to 1.0)
  return text_sentiment, round(text_polarity, 4)

# Test
test_str = "I love enjoying the great outdoors!"
test_tp = lexicon_sentiment(lexicon_jockersrinker_dt, test_str)
print(f'The Sentence: {test_str}\n\n  Sentiment: {test_tp[0]}\n\n  Polarity:  {test_tp[1]}')

""";

# --------------------------------------------------
# Disabled code: the definition below sits inside a bare string literal, so
# pattern_discrete2continous_sentiment() is never defined when this module is
# loaded. It references pattern_sa, which is not defined in this part of the file.
"""
def pattern_discrete2continous_sentiment(text):
  '''
  Given a plain text string, give it to
    Stanford Stanza (OpenNLP) to calculate sentiment for each word on a 3 point scale 0-2
  Return a sentiment value for the entire sentence (sum of word sentiments/log(len of sentence)) 
    that approximates a normal distribution for all values
    In order to get more fine grained measure of overall Sentence sentiment
    Sentiment values will be Normalized/Standardized so absolute precision is not required
  '''
  text_sentiment_total = 0.
  text_ls = text.split()
  text_len = len(text_ls)
  for aword in text_ls:
    text_sentiment_total += pattern_sa(str(aword))[0]
  text_sentiment_norm = text_sentiment_total/(np.log(text_len)+0.01)

  return text_sentiment_norm
""";

# --------------------------------------------------
def sent2vader_comp(asent_str):
  '''
  Given a Sentence as a text string
  Return a Sentiment = sum of the VADER 'compound' score of each
  whitespace-delimited word, scored one lowercased word at a time
  '''

  total_fl = 0.0

  for word in asent_str.split():
    word_scores_dt = vader_analyzer.polarity_scores(word.lower())
    total_fl += word_scores_dt['compound']

  return total_fl

# --------------------------------------------------
def sent2textblob(asent_str):
  '''
  Given a Sentence as a text string
  Return a Sentiment = sum of the TextBlob polarity of each
  whitespace-delimited word, scored one lowercased word at a time
  '''

  total_fl = 0.0

  for word in asent_str.split():
    word_blob = TextBlob(word.lower())
    total_fl += word_blob.sentiment.polarity

  return total_fl

# --------------------------------------------------
def flair_sentiment(asent_str):
  '''
  Given a text string, get sentiment str using Flair (e.g. 'NEGATIVE (0.9243)')
  Return a floating point -1.0 to 1.0: the value in parentheses, made negative
  for a NEGATIVE label and left positive for a POSITIVE label

  Raises ValueError if the label string cannot be parsed or if the polarity is
  neither POSITIVE nor NEGATIVE
  '''
  sentence = Sentence(asent_str)
  classifier.predict(sentence)

  sentiment_str = str(sentence.labels[0])

  # Only the last two whitespace-separated tokens ('POLARITY', '(score)') are
  # used, so any text that precedes them in the label string is ignored
  label_tokens = sentiment_str.split()
  if len(label_tokens) < 2:
    raise ValueError(f'ERROR: Cannot parse Flair label: {sentiment_str}')
  pol_str, pol_val_str = label_tokens[-2], label_tokens[-1]

  sign_dt = {"POSITIVE": 1.0, "NEGATIVE": -1.0}
  if pol_str not in sign_dt:
    # Fail with a clear error instead of reaching the multiplication below
    # with no sign defined
    raise ValueError(f'ERROR: Illegal value for polarity_str: {pol_str}')

  # Drop the surrounding parentheses from '(0.9243)'
  pol_fl = sign_dt[pol_str] * float(pol_val_str[1:-1])

  return pol_fl

# --------------------------------------------------
def stanza_discrete2continous_sentiment(text):
  '''
  Given a plain text string, give each whitespace-delimited word to the
    module-level nlp pipeline (Stanford Stanza) and add up the sentiment
    reported for every sentence it returns (a 3 point scale 0-2 per word)
  Return a sentiment value for the entire sentence
    = sum of word sentiments / (log(number of words) + 0.1)
    In order to get more fine grained measure of overall Sentence sentiment
    Sentiment values will be Normalized/Standardized so absolute precision is not required
  Return 0.0 for text that contains no words (empty or whitespace only)
  '''
  text_ls = text.split()
  text_len = len(text_ls)

  # No words: there is nothing to score and log(0) is undefined
  if text_len == 0:
    return 0.0

  text_sentiment_tot = 0.
  for aword in text_ls:
    adoc = nlp(aword)
    for sentence in adoc.sentences:
      text_sentiment_tot += float(sentence.sentiment)

  # The 0.1 offset keeps the divisor non-zero for one-word text (log(1) == 0)
  text_sentiment_norm = text_sentiment_tot/(np.log(text_len)+0.1)

  return text_sentiment_norm

# --------------------------------------------------
def ml_metrics(model,x,y):
  '''
  Given a fitted model, features x and true labels y
  Predict labels with model.predict(x) and report them against y with
  lexicon_metrics() (confusion-matrix heatmap plus printed accuracy, F1 score,
  confusion matrix and classification report)
  Return None
  '''
  # https://www.kaggle.com/aditya6040/7-models-on-imdb-dataset-best-score-88-2/notebook

  # The reporting is identical to lexicon_metrics(), so reuse it rather than
  # keeping a second copy of the same plotting/printing code
  lexicon_metrics(y, model.predict(x))

# --------------------------------------------------
def lexicon_metrics(y, y_pred):
  '''
  Given true labels y and predicted labels y_pred
  Plot the confusion matrix as an annotated heatmap, then print the accuracy,
  F1 score, confusion matrix and classification report
  Return None
  '''
  accuracy_val = accuracy_score(y, y_pred)
  f1_val = f1_score(y, y_pred)
  conf_mat = confusion_matrix(y, y_pred)
  class_report = classification_report(y, y_pred)

  # Confusion-matrix heatmap (x tick labels are fixed to 0 and 1)
  plt.figure(figsize=(4,4))
  sns.heatmap(conf_mat,annot=True,cmap='Blues',xticklabels=[0,1],fmt='d',annot_kws={"fontsize":19})
  plt.xlabel("Predicted",fontsize=16)
  plt.ylabel("Actual",fontsize=16)
  plt.show()

  # Text summary, printed after the figure
  summary_rows = (
    ("\nAccuracy: ", round(accuracy_val,2)),
    ("\nF1 Score: ", round(f1_val,2)),
    ("\nConfusion Matrix: \n", conf_mat),
    ("\nReport:", class_report),
  )
  for heading, value in summary_rows:
    print(heading, value)

# --------------------------------------------------
def labelscore2fl(labelscore_sentiment_ls, sa_model):
  '''
  Given a list of dicts with 'label' and 'score' keys (e.g. as returned by
  RoBERTa15lg); only the first dict is used and sa_model is not used
  Return a floating point value for sentiment:
    positive/pos  ->  score
    negative/neg  -> -score
    neutral/neu   ->  0
    'n star(s)'   ->  (n - 1) + score   for n = 1..5
    anything else -> -99.99 after printing an ERROR message
  '''
  first_dt = labelscore_sentiment_ls[0]
  label_str = first_dt['label'].strip().lower()
  score_fl = float(first_dt['score'])

  # Labels whose sentiment is the score itself
  if label_str in ('positive', 'pos', '1 star'):
    return score_fl

  if label_str in ('negative', 'neg'):
    return -1.0 * (score_fl)

  if label_str in ('neutral', 'neu'):
    return 0

  # Remaining 'n stars' labels: offset the score by (n - 1)
  star_offset_dt = {'2 stars': 1.0, '3 stars': 2.0, '4 stars': 3.0, '5 stars': 4.0}
  if label_str in star_offset_dt:
    return star_offset_dt[label_str] + score_fl

  # ERROR on illegal Label value: report it and return the sentinel
  print(f'ERROR: Illegal value for RoBERTa Label: {label_str}')
  return -99.99

# --------------------------------------------------
def logitstensor2sentiment(hugseqclass_output):
  '''
  Given a Huggingface SequenceClassifierOutput (its .logits tensor is used)
  Return a tuple (max_indx, val_scale, max_val) for the first row of logits:
    max_indx  = position of the largest softmax probability (counting from 0)
    val_scale = number of classes (length of the row)
    max_val   = that largest softmax probability
  '''

  # Softmax over the last dimension, converted to nested Python lists
  all_rows_probs = hugseqclass_output.logits.softmax(dim=-1).tolist()
  first_row_probs = all_rows_probs[0]

  # Earliest position wins when several classes share the largest probability
  best_indx, best_prob = max(enumerate(first_row_probs), key=lambda pair: pair[1])

  return best_indx, len(first_row_probs), best_prob



# --------------------------------------------------

Overwriting ./utils/get_sentiments.py


# R Code

In [37]:
%%writefile ./utils/get_sentimentr.R

library(sentimentr)
library(lexicon)

# Score text with sentimentr::sentiment() once per polarity lexicon.
#
# Args:
#   s_v: the text to score; passed unchanged to sentiment() and copied into
#        the 'text_clean' column of the result.
#
# Returns:
#   A data.frame with column 'text_clean' followed by one 'sentimentr_<lexicon>'
#   column per lexicon (jockersrinker, jockers, huliu, nrc, senticnet,
#   sentiword, loughran_mcdonald, socal_google), each holding the $sentiment
#   values returned for that lexicon.
#
# NOTE(review): data.frame() needs every column to have as many rows as s_v;
# this presumably relies on each element of s_v being a single sentence -
# confirm against the caller.
get_sentimentr_values <- function(s_v) {

  # Print progress for one output column, score s_v with the given polarity
  # table and return the sentiment values. The settings are the same for
  # every lexicon.
  score_with <- function(col_name, polarity_dt) {
    print(paste('Processing', col_name))
    scored <- sentiment(s_v, polarity_dt=polarity_dt,
                        hyphen="", amplifier.weight=0.8, n.before=5, n.after=2,
                        adversative.weight=0.25, neutral.nonverb.like = FALSE, missing_value = 0)
    scored$sentiment
  }

  # Columns are evaluated in the order listed, so the progress messages appear
  # in the same order as the columns.
  anovel_sentimentr_df <- data.frame('text_clean' = s_v,
                                'sentimentr_jockersrinker' = score_with('sentimentr_jockersrinker', lexicon::hash_sentiment_jockers_rinker),
                                'sentimentr_jockers' = score_with('sentimentr_jockers', lexicon::hash_sentiment_jockers),
                                'sentimentr_huliu' = score_with('sentimentr_huliu', lexicon::hash_sentiment_huliu),
                                'sentimentr_nrc' = score_with('sentimentr_nrc', lexicon::hash_sentiment_nrc),
                                'sentimentr_senticnet' = score_with('sentimentr_senticnet', lexicon::hash_sentiment_senticnet),
                                'sentimentr_sentiword' = score_with('sentimentr_sentiword', lexicon::hash_sentiment_sentiword),
                                'sentimentr_loughran_mcdonald' = score_with('sentimentr_loughran_mcdonald', lexicon::hash_sentiment_loughran_mcdonald),
                                'sentimentr_socal_google' = score_with('sentimentr_socal_google', lexicon::hash_sentiment_socal_google)
                                )
  return(anovel_sentimentr_df)

}

Overwriting ./utils/get_sentimentr.R
