# **SentimentArcs (Part 6): Analysis**

By: Jon Chun
* Original: 12 Jun 2021
* Last Update: 14 Apr 2022


# **[STEP 0] Install Libraries**

In [None]:
# If you see [Interactive namespace is empty] in response to the [%whos] command below
#   you're working with a fresh Linux Virtual Machine,
#   any previous work is lost,
#   and you need to SEQUENTIALLY execute EVERY cell in this Notebook from the beginning

%whos

In [None]:
# Takes far too long for inference, 
#   currently not used

# !pip install moepy

In [None]:
!pip install dtaidistance

In [None]:
!pip install sktime

In [None]:
# [RESTART RUNTIME] May be Required (only needed for Plotly)

# Designed Security Hole in older version of PyYAML, must upgrade to use plotly

# !pip install pyyaml==5.4.1

In [None]:
# To Reduce Time Series Dimensionality

!pip install lttb

In [None]:
!pip install tslearn

# [STEP 1] Manual Configuration

## (Popups) Connect Google gDrive

In [None]:
# [INPUT REQUIRED]: Authorize access to Google gDrive

# Connect this Notebook to your permanent Google Drive
#   so all generated output is saved to permanent storage there

# Detect whether this Notebook is running inside Google Colab:
#   the google.colab package can only be imported there
try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  # Only a failed import means "not in Colab"; any other error should surface
  IN_COLAB = False

if IN_COLAB:
  print("Attempting to attach your Google gDrive to this Colab Jupyter Notebook")
  drive.mount('/gdrive')
else:
  # Nothing is mounted outside Colab, so do not report the drive as attached
  print("Not running in Google Colab: Google gDrive was NOT attached to this Jupyter Notebook")

## (3 Inputs) Define Directory Tree

In [None]:
# [CUSTOMIZE]: Change the text after the Unix '%cd ' command below (change directory)
#              to match the full path to your gDrive subdirectory which should be the
#              root directory cloned from the SentimentArcs github repo.

# NOTE: Make sure this subdirectory already exists and there are
#       no typos, spaces or illegal characters (e.g. periods) in the full path after %cd

# NOTE: Each directory name in the path should begin with an upper or lowercase letter, and only
#         letters, numbers and underscore ('_') characters should appear afterwards.
#         Make sure your full path after %cd obeys this constraint or errors may appear.

# #@markdown **Instructions**

# #@markdown Set Directory and Corpus names:
# #@markdown <li> Set <b>Path_to_SentimentArcs</b> to the project root in your **GDrive folder**
# #@markdown <li> Set <b>Corpus_Genre</b> = [novels, finance, social_media]
# #@markdown <li> <b>Corpus_Type</b> = [reference_corpus, new_corpus]
# #@markdown <li> <b>Corpus_Number</b> = [1-10] (id number if a new_corpus)

#@markdown <hr>

# Step #1: Get full path to SentimentArcs subdir on gDrive
# =======
#@markdown **Accept default path on gDrive or Enter new one:**

Path_to_SentimentArcs = "/gdrive/MyDrive/sentimentarcs_notebooks/" #@param ["/gdrive/MyDrive/sentiment_arcs/"] {allow-input: true}


#@markdown Set this to the project root in your <b>GDrive folder</b>
#@markdown <br> (e.g. /<wbr><b>gdrive/MyDrive/research/sentiment_arcs/</b>)

#@markdown <hr>

#@markdown **Which type of texts are you cleaning?** \

Corpus_Genre = "novels" #@param ["novels", "social_media", "finance"]

# Corpus_Type = "reference" #@param ["new", "reference"]
Corpus_Type = "new" #@param ["new", "reference"]


Corpus_Number = 2 #@param {type:"slider", min:1, max:10, step:1}


#@markdown Put in the corresponding Subdirectory under **./text_raw**:
#@markdown <li> All Texts as clean <b>plaintext *.txt</b> files 
#@markdown <li> A <b>YAML Configuration File</b> describing each Text

#@markdown Please verify the required textfiles and YAML file exist in the correct subdirectories before continuing.

print('Current Working Directory:')
%cd $Path_to_SentimentArcs

print('\n')

# Build the corpus-specific subdirectory names for raw sentiments and cleaned texts
#   from the Corpus_Genre / Corpus_Type / Corpus_Number selected in the form above
# NOTE(review): the 'reference' names have no trailing '/' while the other branch
#   adds one; any code that appends a filename directly to these values only works
#   with the trailing '/' -- confirm the 'reference' case is handled as intended
if Corpus_Type == 'reference':
  SUBDIR_SENTIMENT_RAW = f'sentiment_raw_{Corpus_Genre}_reference'
  SUBDIR_TEXT_CLEAN = f'text_clean_{Corpus_Genre}_reference'
else:
  SUBDIR_SENTIMENT_RAW = f'sentiment_raw_{Corpus_Genre}_{Corpus_Type}_corpus{Corpus_Number}/'
  SUBDIR_TEXT_CLEAN = f'text_clean_{Corpus_Genre}_{Corpus_Type}_corpus{Corpus_Number}/'

# Relative paths (below the current working directory) to the two subdirectories
# PATH_SENTIMENT_RAW = f'./sentiment_raw/{SUBDIR_TEXT_RAW}'
# PATH_TEXT_CLEAN = f'./text_clean/{SUBDIR_TEXT_CLEAN}'
PATH_SENTIMENT_RAW = f'./sentiment_raw/{SUBDIR_SENTIMENT_RAW}'
PATH_TEXT_CLEAN = f'./text_clean/{SUBDIR_TEXT_CLEAN}'

# TODO: Clean up
# SUBDIR_TEXT_CLEAN = PATH_TEXT_CLEAN

print(f'PATH_SENTIMENT_RAW:\n  [{PATH_SENTIMENT_RAW}]')
print(f'SUBDIR_SENTIMENT_RAW:\n  [{SUBDIR_SENTIMENT_RAW}]')

print('\n')

print(f'PATH_TEXT_CLEAN:\n  [{PATH_TEXT_CLEAN}]')
print(f'SUBDIR_TEXT_CLEAN:\n  [{SUBDIR_TEXT_CLEAN}]')

# **[STEP 2] Automatic Configuration/Setup**

## (each time) Custom Libraries & Define Globals

In [None]:
# Add PATH for ./utils subdirectory

import sys
import os

!python --version

print('\n')

PATH_UTILS = f'{Path_to_SentimentArcs}utils'
PATH_UTILS

sys.path.append(PATH_UTILS)

print('Contents of Subdirectory [./sentiment_arcs/utils/]\n')
!ls $PATH_UTILS

# More Specific than PATH for searching libraries
# !echo $PYTHONPATH

In [None]:
# Review Global Variables and set the first few

import global_vars as global_vars

global_vars.SUBDIR_SENTIMENTARCS = Path_to_SentimentArcs
global_vars.Corpus_Genre = Corpus_Genre
global_vars.Corpus_Type = Corpus_Type
global_vars.Corpus_Number = Corpus_Number

global_vars.SUBDIR_SENTIMENT_RAW = SUBDIR_SENTIMENT_RAW
global_vars.PATH_SENTIMENT_RAW = PATH_SENTIMENT_RAW

global_vars.SUBDIR_TEXT_CLEAN = SUBDIR_TEXT_CLEAN
global_vars.PATH_TEXT_CLEAN = PATH_TEXT_CLEAN

from utils import sa_config # (e.g. define TEST_WORDS_LS)

sa_config.set_globals()

global_vars.TEST_WORDS_LS
print('\n')

dir(global_vars)

In [None]:
%whos dict

In [None]:
# Initialize and clean for each iteration of notebook

# dir(global_vars)

global_vars.corpus_texts_dt = {}
global_vars.corpus_titles_dt = {}

In [None]:
# Import SentimentArcs Utilities to define Directory Structure
#   based the Selected Corpus Genre, Type and Number

!pwd 
print('\n')

# from utils import sa_config # .sentiment_arcs_utils
from utils import sa_config

print('Objects in sa_config()')
print(dir(sa_config))
print('\n')

# Directory Structure for the Selected Corpus Genre, Type and Number
sa_config.get_subdirs(Path_to_SentimentArcs, Corpus_Genre, Corpus_Type, Corpus_Number, 'none')


In [None]:
global_vars.SUBDIR_SENTIMENT_CLEAN

In [None]:
# Override the subdirectory that holds the cleaned sentiment data
# NOTE: hard-coded for Corpus_Genre='novels', Corpus_Type='new', Corpus_Number=2;
#   update by hand if a different corpus is selected above
global_vars.SUBDIR_SENTIMENT_CLEAN = './sentiment_clean/sentiment_clean_novels_new_corpus2/'
global_vars.SUBDIR_SENTIMENT_CLEAN

## (each time) Read YAML Configuration for Corpus and Models 

In [None]:
# from utils import sa_config # .sentiment_arcs_utils

import yaml

from utils import read_yaml

print('Objects in read_yaml()')
print(dir(read_yaml))
print('\n')

# Directory Structure for the Selected Corpus Genre, Type and Number
read_yaml.read_corpus_yaml(Corpus_Genre, Corpus_Type, Corpus_Number)

print('SentimentArcs Model Ensemble ------------------------------\n')
model_titles_ls = global_vars.models_titles_dt.keys()
print('\n'.join(model_titles_ls))


print('\n\nCorpus Texts ------------------------------\n')
corpus_titles_ls = list(global_vars.corpus_titles_dt.keys())
print('\n'.join(corpus_titles_ls))


print(f'\n\nThere are {len(model_titles_ls)} Models in the SentimentArcs Ensemble above.\n')
print(f'\nThere are {len(corpus_titles_ls)} Texts in the Corpus above.\n')
print('\n')

global_vars.corpus_titles_dt

In [None]:
global_vars.models_titles_dt.items()

In [None]:
global_vars.corpus_titles_dt

## Configure Jupyter Notebook

In [None]:
# Configure Jupyter

# To reload modules under development

# Option (a)
%load_ext autoreload
%autoreload 2
# Option (b)
# import importlib
# importlib.reload(functions.readfunctions)


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
from IPython.display import Image
from ipywidgets import widgets, interactive

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Intentionally left blank

## Load Libraries

In [None]:
import numpy as np

from tqdm._tqdm_notebook import tqdm_notebook
import pandas as pd
tqdm_notebook.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.set_option('max_colwidth', 100) # -1)

import json
from collections import Counter

# from glob import glob
# import copy


In [None]:
# Scikit Utilities, Metrics, Pipelines and Models

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


## (del?) Define Global Parameters

In [None]:
"""

# Define Globals

# Main data structure: Dictionary (key=text_name) of DataFrames (cols: text_raw, text_clean)
corpus_texts_dt = {}

# Verify in SentimentArcs Root Directory
os.chdir('/gdrive/MyDrive/cdh/sentiment_arcs/')

%run -i './utils/get_globals.py'

SLANG_DT.keys()
""";

## Setup Matplotlib Style

* https://matplotlib.org/stable/tutorials/introductory/customizing.html

In [None]:
# Configure Matplotlib

# View available styles
# plt.style.available

# Verify in SentimentArcs Root Directory
os.chdir(Path_to_SentimentArcs)

%run -i './utils/config_matplotlib.py'

config_matplotlib()

print('Matplotlib Configuration ------------------------------')
print('\n  (Uncomment to view)')
# plt.rcParams.keys()
print('\n  Edit ./utils/config_matplotlib.py to change')

## Setup Seaborn Style

In [None]:
# Configure Seaborn

# Verify in SentimentArcs Root Directory
os.chdir(Path_to_SentimentArcs)

%run -i './utils/config_seaborn.py'

config_seaborn()

print('Seaborn Configuration ------------------------------\n')
# print('\n  Update ./utils/config_seaborn.py to display seaborn settings')


## Python Utility Functions

### (each time) Generate Convenient Data Lists

In [None]:
# Derive List of Texts in Corpus a)keys and b)full author and titles

print('Dictionary: corpus_titles_dt')
global_vars.corpus_titles_dt
print('\n')

corpus_texts_ls = list(global_vars.corpus_titles_dt.keys())
print(f'\nCorpus Texts:')
for akey in corpus_texts_ls:
  print(f'  {akey}')
print('\n')

print(f'\nNatural Corpus Titles:')
corpus_titles_ls = [x[0] for x in list(global_vars.corpus_titles_dt.values())]
for akey in corpus_titles_ls:
  print(f'  {akey}')


In [None]:
global_vars.corpus_titles_dt.keys()

In [None]:
# Get Model Families of Ensemble

from utils.get_model_families import get_ensemble_model_famalies

global_vars.model_ensemble_dt = get_ensemble_model_famalies(global_vars.models_titles_dt)

print('\nTest: Lexicon Family of Models:')
global_vars.model_ensemble_dt['lexicon']

### File Functions

In [None]:
# Verify in SentimentArcs Root Directory
os.chdir(Path_to_SentimentArcs)

%run -i './utils/file_utils.py'
# from utils.file_utils import *

# %run -i './utils/file_utils.py'

# TODO: Not used? Delete?
# get_fullpath(text_title_str, ftype='data_clean', fig_no='', first_note = '',last_note='', plot_ext='png', no_date=False)

# **[STEP 3] Read all Raw Sentiment Data**




## Read Raw Sentiments

In [None]:
# Verify cwd and subdir of Raw Sentiment Data

print('Current Working Directory:')
!pwd

print(f'\nSubdir with all Cleaned Texts of Corpus:\n  {SUBDIR_SENTIMENT_RAW}')

PATH_SENTIMENT_RAW = f'{Path_to_SentimentArcs}sentiment_raw/{SUBDIR_SENTIMENT_RAW}'

print(f'\nPATH_SENTIMENT_RAW: {PATH_SENTIMENT_RAW}\n')

print(f'\n\nFilenames of Cleaned Texts:\n')
!ls -1 $PATH_SENTIMENT_RAW

# glob(f'{PATH_SENTIMENT_RAW}/*')

print('\n')

print(corpus_texts_ls)

In [None]:
# Create a List (sentiment_raw_json_ls) of all preprocessed text files

# Verify in SentimentArcs Root Directory
os.chdir(Path_to_SentimentArcs)

# Imported under a private alias so no name in the shared notebook namespace is shadowed
from glob import glob as _glob_paths

# Keep only the filenames (not the full paths) of the raw sentiment datafiles,
#   sorted so the processing order is the same on every run
sentiment_raw_json_ls = sorted(
    os.path.basename(apath)
    for apath in _glob_paths(os.path.join(PATH_SENTIMENT_RAW, 'sentiment_raw_*.json'))
)

# glob() returns an empty list (it never raises) when nothing matches,
#   so an empty result has to be checked explicitly
if not sentiment_raw_json_ls:
    raise RuntimeError(f'No sentiment_raw_*.json files found in {PATH_SENTIMENT_RAW}')

print('\n'.join(sentiment_raw_json_ls))
print('\n')
print(f'Found {len(sentiment_raw_json_ls)} raw sentiment files in {SUBDIR_SENTIMENT_RAW}')


In [None]:
# Global Dict for Sentiments

# Only used in this Notebook so not defined in shared utils/global_vars
#   like global_vars.corpus_texts_dt = {}

# corpus_sentiment_dt[text] = DataFrame(Raw Sentiments, 1 Column per Model)

corpus_sentiment_dt = {}

In [None]:
%whos list

In [None]:
sentiment_raw_json_ls

In [None]:
PATH_SENTIMENT_RAW

In [None]:
%%time

# NOTE:   2m37s @09:32 on 20220416 Colab Pro CPU (634k, 668k, 909k)
#         2m07s @10:07 on 20220416 Colab Pro CPU (634k, 668k, 909k)
#         2m07s @10:09 on 20220416 Colab Pro CPU (634k, 668k, 909k)

# Read all preprocessed text files into master DataFrame (corpus_dt)

# Reset Dict for Sentiments
#   Only used in this notebook, not shared across notebooks so do not
#   share via utils/global_vars like global_vars.corpus_texts_dt

corpus_sentiment_dt = {}

# Parse every JSON datafile exactly once up front: the parsed content does not
#   depend on the text, so re-reading each file for every text is wasted work.
#   (All parsed files are held in memory until the end of this cell.)
sentiment_json_dt = {}

for j, ajson in enumerate(sentiment_raw_json_ls):
  print(f'  Reading json #{j}: {ajson}')

  # os.path.join works whether or not PATH_SENTIMENT_RAW ends with '/'
  afile_fullpath = os.path.join(PATH_SENTIMENT_RAW, ajson)
  print(f'               at: {afile_fullpath}')

  if 'transformer' in ajson:
    print(f'   One Model Transformer *.json datafile')
  else:
    print(f'   Multi-Model non-Transformer *.json datafile')

  with open(afile_fullpath) as fp:
    sentiment_json_dt[ajson] = json.load(fp)

# Build one DataFrame per text by placing the columns of every datafile side by side
for i, atext in enumerate(corpus_texts_ls):
  print(f'\n\nProcessing text #{i}: {atext}')
  corpus_sentiment_dt[atext] = pd.DataFrame(columns=['text_raw','text_clean'])

  for ajson in sentiment_raw_json_ls:
    # Each datafile is a dict keyed by text name; a missing text raises KeyError
    temp_df = pd.DataFrame.from_dict(sentiment_json_dt[ajson][atext]).reset_index()

    # Transpose so drop_duplicates() removes columns whose VALUES are identical;
    #   columns that share a name but differ in content are kept and are
    #   resolved later in the Notebook
    corpus_sentiment_dt[atext] = pd.concat([corpus_sentiment_dt[atext], temp_df], axis=1).T.drop_duplicates().T

# Release the parsed JSON once every DataFrame has been built
del sentiment_json_dt



In [None]:
corpus_sentiment_dt.keys()

In [None]:
title_indx = 1

corpus_sentiment_dt[corpus_texts_ls[title_indx]].head()
corpus_sentiment_dt[corpus_texts_ls[title_indx]].info()
corpus_sentiment_dt[corpus_texts_ls[title_indx]].shape

print(f'For Text: {corpus_texts_ls[title_indx]}')

## Identify and Drop Duplicate Columns

In [None]:
from collections import Counter

In [None]:
corpus_sentiment_dt[atext]['roberta15lg'].columns

In [None]:
# Drop all but the i-th copy of duplicated column

def keep_nthdup_col(adf, acol, nthcopy):
  '''
  Keep only one copy of a duplicated column name in a DataFrame.

  Args:
    adf: DataFrame in which the column name acol appears more than once
    acol: the duplicated column name
    nthcopy: 0-based position, counted left to right among the copies of acol,
      of the single copy to keep

  Returns:
    A new DataFrame with every copy of acol removed except the nthcopy-th one;
    all other columns and the column order are unchanged.
    If acol is not duplicated in adf, or nthcopy is too large, an ERROR message
    is printed and adf is returned unchanged (so a caller that stores the result
    never overwrites its data with None).
  '''

  # Positional (iloc) index of every copy of the column, computed from the
  #   DataFrame that was passed in rather than from a notebook-level global
  df_col_iloc_ls = [i for i, acol_name in enumerate(adf.columns) if acol_name == acol]

  # First, verify this is a duplicated column
  if len(df_col_iloc_ls) <= 1:
    print(f'ERROR: Column: {acol} is not duplicated in the DataFrame cols: {adf.columns}')
    return adf

  # Second, verify nthcopy points to one of the duplicated columns
  if nthcopy >= len(df_col_iloc_ls):
    print(f'ERROR: passed nthcopy {nthcopy} is bigger than the number of duplicated {acol} column [0 to {len(df_col_iloc_ls)-1}]')
    return adf

  print(f' Duplicated col: {acol} indicies: {df_col_iloc_ls}')
  col_dup_indx = df_col_iloc_ls[nthcopy]
  print(f'     Keep Index: {col_dup_indx}')
  print(f'           Name: {adf.columns[col_dup_indx]}')
  df_col_iloc_ls.remove(col_dup_indx)
  print(f'      Drop Cols: {df_col_iloc_ls}')

  # Drop all unwanted copies in ONE positional selection: removing them one at a
  #   time would shift the positions of the remaining columns after the first
  #   removal and so drop the wrong columns
  drop_iloc_set = set(df_col_iloc_ls)
  keep_iloc_ls = [j for j in range(adf.shape[1]) if j not in drop_iloc_set]

  return adf.iloc[:, keep_iloc_ls]

# Test
# keep_nthdup_col(corpus_sentiment_dt[atext], 'text_raw', 1)

In [None]:
corpus_sentiment_dt[atext].iloc[:, [j for j,c in enumerate(list(corpus_sentiment_dt[atext].columns)) if j not in [13,0]]].info()

In [None]:
# Identify and Drop Duplicate Columns
#   For every duplicated column name keep the copy with the fewest null values

dup_col_keep_dt = {}  # Dict[dup_col] = index (among the copies) of the copy to keep (min nulls)


for i,atext in enumerate(corpus_texts_ls):
  row_ct = corpus_sentiment_dt[atext].shape[0]
  col_before_ct = len(corpus_sentiment_dt[atext].columns)

  print(f'\n\nProcessing Text #{i}: {atext}')

  # Count the frequency of each column name
  col_count_dt = Counter(corpus_sentiment_dt[atext].columns)

  # Create list of duplicate column names in cols_dup_ls
  cols_dup_ls = []
  for key,val in col_count_dt.items():
    if val > 1:
      cols_dup_ls.append(key)
      print(f'  Duplicate col: {key} with count: {val}')

  # For every duplicated Column
  for adup_col in cols_dup_ls:
    # Selecting a duplicated name returns a DataFrame with one column per copy
    dup_copies_df = corpus_sentiment_dt[atext][adup_col]

    # Find the copy with the least number of null values (first one wins on ties)
    col_iloc_min_null = 0  # Index to the copy with min nulls
    col_min_null_ct = row_ct  # Current min count of nulls, init to row count
    for k in range(dup_copies_df.shape[1]):
      adup_col_null_ct = dup_copies_df.iloc[:,k].isna().sum()
      if adup_col_null_ct < col_min_null_ct:
        col_min_null_ct = adup_col_null_ct
        col_iloc_min_null = k

    # Drop all but one copy of the duplicated columns
    #   (report the null count of the copy that is kept, not of the last copy examined)
    print(f'\n      Keep iloc: {col_iloc_min_null} in adup_col: {adup_col} with {col_min_null_ct} nulls out of {row_ct}')
    dup_col_keep_dt[adup_col] = col_iloc_min_null
    print(f'       Calling: keep_nthdup_col(adf, {adup_col}, {col_iloc_min_null})')
    corpus_sentiment_dt[atext] = keep_nthdup_col(corpus_sentiment_dt[atext], adup_col, col_iloc_min_null)

  # Before/after counts are reported for each text, measured on that same text
  col_after_ct = len(corpus_sentiment_dt[atext].columns)
  print(f'\n\nColumn Count:\n  Before: {col_before_ct}\n   After: {col_after_ct}')

In [None]:
corpus_sentiment_dt[atext].info()

## Reorder and Specify dtypes

In [None]:
# Get list of models

models_ls = list(set(corpus_sentiment_dt[corpus_texts_ls[0]].columns) - set(['text_raw','text_clean','index']))
models_ls.sort()

models_ls

print(f'\n\nTotal of {len(models_ls)} Models')

In [None]:
# Put text_raw and text_clean at front

# corpus_sentiment_dt[atext].sort_index(axis=1)
# corpus_sentiment_dt[atext] = corpus_sentiment_dt[atext].insert(0, 'text_raw', corpus_sentiment_dt[atext].pop('text_raw'))
# corpus_sentiment_dt[atext] = corpus_sentiment_dt[atext].insert(1, 'text_clean', corpus_sentiment_dt[atext].pop('text_clean'))

# Columns that must come first, in this order
front_cols_ls = ['sentence_no', 'text_raw', 'text_clean']

for atext in corpus_texts_ls:
  atext_df = corpus_sentiment_dt[atext]

  # The 'index' column created by reset_index() is stored as 'sentence_no'.
  #   Rename it only while it still exists so that re-running this cell
  #   does not fail with a KeyError.
  if 'index' in atext_df.columns:
    atext_df = atext_df.rename(columns={'index': 'sentence_no'})

  # Front columns first, every other column after them in its existing order;
  #   copy() gives an independent DataFrame that later cells can add columns to
  other_cols_ls = [acol for acol in atext_df.columns if acol not in front_cols_ls]
  corpus_sentiment_dt[atext] = atext_df[front_cols_ls + other_cols_ls].copy()

  corpus_sentiment_dt[atext].info()

In [None]:
# Convert objects to more specific dtypes

# Every Model column becomes float, the sentence number becomes int
for text_no, atext in enumerate(corpus_texts_ls):
  print(f'\n\nProcessing Text #{text_no}: {atext}')

  # Same DataFrame object as stored in the dict, so assignments update it in place
  atext_df = corpus_sentiment_dt[atext]

  for model_no, amodel in enumerate(models_ls):
    print(f'Processing Model #{model_no}: {amodel}')
    atext_df[amodel] = atext_df[amodel].astype('float')

  atext_df['sentence_no'] = atext_df['sentence_no'].astype('int')
  atext_df.info()

In [None]:
# Verify sample DataFrame

corpus_sentiment_dt[corpus_texts_ls[0]].head()

## Verify Raw Plots

In [None]:
%whos list

In [None]:
%whos dict

In [None]:
global_vars.corpus_titles_dt.keys()

In [None]:
models_ls

In [None]:
# Plot the first text of the selected corpus; a hard-coded text key only works
#   for the one corpus that contains that text
_ = corpus_sentiment_dt[corpus_texts_ls[0]][models_ls].rolling(300, center=True, min_periods=0).mean().plot()

In [None]:
# Verify Raw Sentiments with one smoothed (SMA) plot of all Models per Text

# SMA window size as a percentage of the number of rows (sentences) in each text
win_per = 10

for i,atext in enumerate(corpus_texts_ls):
  
  # Convert the percentage into a row count for this text
  win_aper = int(win_per/100 * corpus_sentiment_dt[atext].shape[0])
  _ = corpus_sentiment_dt[atext][models_ls].rolling(win_aper, center=True, min_periods=0).mean().plot()
  _= plt.title(f'Sentiment Analysis\n{global_vars.corpus_titles_dt[atext][0]}\nSmoothed SMA ({win_per}%)')
  plt.grid(True)

print(f'Read Raw Sentiments for these texts:\n  {corpus_sentiment_dt.keys()}\n\n')




## Drop or Interpolate NaN/None Values

In [None]:
corpus_sentiment_dt[atext]['roberta15lg']

In [None]:
# Report Columns/Models with %NaN above Threshold
#   (nothing is dropped here: each flagged Model should be verified first)

# Single source of truth for the threshold: flag a Col/Model when more than
#   this fraction of its rows is null
null_threshold = 0.5

for i,atext in enumerate(corpus_texts_ls):
  print(f'\n\nProcessing Text #{i}: {atext}')

  row_ct = corpus_sentiment_dt[atext].shape[0]

  for j, amodel in enumerate(models_ls):

    sum_null = corpus_sentiment_dt[atext][amodel].isnull().sum()

    if sum_null > int(null_threshold * row_ct):
      # Name the Model so the offending column can be identified from the output
      print(f'  Model {amodel}: {sum_null} NaNs out of {row_ct} rows is above Threshold={null_threshold}')
      # TODO: Verify before dropping Col/Model here
      # corpus_sentiment_dt[atext][models_ls].rolling(win_aper, center=True, min_periods=0).mean().plot()



## Clip Outliers and zScore Standardize

In [None]:
import statsmodels.robust.scale as sm_robust

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler

r_scaler = RobustScaler() 
z_scaler = StandardScaler()

In [None]:
models_ls[7]

In [None]:
# Simple IQR

def clip_iqr_outliers(floats_ser, iqr_limit=1.5, lower_q=0.10, upper_q=0.90):
  '''
  Clip (winsorize) the values of a numeric Pandas Series to a quantile range.

  Values below the lower_q quantile are replaced by that quantile and values
  above the upper_q quantile are replaced by that quantile. NaN values compare
  False on both tests and are therefore passed through unchanged.

  Args:
    floats_ser: Pandas Series of numbers
    iqr_limit: not used by the computation; kept so existing calls that pass
      it keep working
    lower_q: lower quantile in [0, 1] used as the floor (default 0.10)
    upper_q: upper quantile in [0, 1] used as the ceiling (default 0.90)

  Returns:
    numpy array with the same length as floats_ser holding the clipped values
  '''

  quantile_low = floats_ser.quantile(lower_q)
  quantile_high = floats_ser.quantile(upper_q)
  print(f'{lower_q:.0%} Quantile: {quantile_low}')
  print(f'{upper_q:.0%} Quantile: {quantile_high}')

  # Clip the lower tail first ...
  floats_np = np.where(floats_ser < quantile_low, quantile_low, floats_ser)
  # ... then clip the upper tail of the ALREADY lower-clipped values; starting
  #   again from floats_ser here would throw the lower clipping away
  floats_np = np.where(floats_np > quantile_high, quantile_high, floats_np)
  print(f'        Skew: {pd.Series(floats_np).skew()}')

  return floats_np # .tolist()

# Test

test_np = clip_iqr_outliers(corpus_sentiment_dt[corpus_texts_ls[0]]['roberta15lg'])
len(test_np)

In [None]:
corpus_sentiment_dt[corpus_texts_ls[0]]['roberta15lg'].quantile(0.10)

In [None]:
clip_iqr_outliers(corpus_sentiment_dt[corpus_texts_ls[0]]['roberta15lg'],iqr_limit=1.5) # .values.reshape(-1, 1) )

In [None]:
corpus_sentiment_dt[atext][['sentence_no', 'text_raw', 'text_clean']]

In [None]:
models_ls

In [None]:
[x for x in corpus_sentiment_dt[atext].select_dtypes(include=[np.number]).columns if 'rz' not in x]

In [None]:
# Trim Outliers and zScore Standardize

corpus_sentiment_rz_dt = {}

for text_no, atext in enumerate(corpus_texts_ls):
  print(f"Title #{text_no}: {atext}")

  # Same DataFrame object as stored in the dict, so new columns are added in place
  atext_df = corpus_sentiment_dt[atext]

  for model_no, amodel in enumerate(models_ls):
    print(f'  Model #{model_no}: {amodel}')

    # Step 1: clip the outliers of this Model's raw sentiment values
    clipped_np = clip_iqr_outliers(atext_df[amodel], iqr_limit=1.5)

    # Step 2: zScore standardize; the scaler expects a 2D (n_rows, 1) array
    zscaled_np = z_scaler.fit_transform(clipped_np.reshape(-1,1))

    # Step 3: store the result next to the raw values as '<model>_rz'
    atext_df[f'{amodel}_rz'] = pd.Series(zscaled_np.squeeze(-1,))
  # corpus_sentiment_rz_dt[atext] = atext_rz_df

  # anum_col_rzscore_np = z_scaler.fit_transform(anum_col_robust_np)
  # anum_col_rzscore_str = f'{anum_col_str}_rzscore'
  # df[anum_col_rzscore_str] = pd.Series(anum_col_rzscore_np.squeeze(-1,))
  # col_rzscores_ls.append(anum_col_rzscore_str)

  # print(f'df.columns: {df.columns}')
  # win_10per = int(0.10 * df.shape[0])
  # df[col_rzscores_ls].rolling(win_10per, center=True, min_periods=0).mean() # .plot(title=f"Sentiment Analysis\n{global_vars.corpus_texts_dt[atext][0]}\nProcessing: SMA 10% (+ Robust IQR, zScore Scaling)")

In [None]:
[x for x in corpus_sentiment_dt[atext] if 'rz' in x]

In [None]:
# Reset helper: removes every column whose name contains 'rz' so that the
#   standardization cell can be re-run from scratch.
# Disabled by default: the cells that follow select and plot the 'rz' columns,
#   so dropping them on a normal top-to-bottom run leaves nothing to plot.
RESET_RZ_COLS = False  # set to True and re-run this cell to drop the 'rz' columns

if RESET_RZ_COLS:
  for atext in corpus_texts_ls:
    col_drop_ls = [x for x in corpus_sentiment_dt[atext] if 'rz' in x]
    print(f'Dropping: {len(col_drop_ls)} Columns\n  {col_drop_ls}\n\n')
    corpus_sentiment_dt[atext].drop(columns=col_drop_ls, inplace=True)

In [None]:
models_rz_ls = [x for x in corpus_sentiment_dt[corpus_texts_ls[0]] if 'rz' in x]
models_rz_ls

In [None]:
text_indx = 0
text_str = corpus_texts_ls[text_indx]
title_str = global_vars.corpus_titles_dt[text_str][0]
win_per = 10
win_size = int(win_per/100 * corpus_sentiment_dt[text_str].shape[0])

_ = corpus_sentiment_dt[text_str][models_rz_ls].rolling(win_size, center=True, min_periods=0).mean().plot(alpha=0.3)
_ = corpus_sentiment_dt[text_str][models_rz_ls].mean(axis=1).rolling(win_size, center=True, min_periods=0).mean().plot(label='mean', color='red', linewidth=3, alpha=0.7)
_ = plt.legend(loc='best')
_ = plt.title(f'Sentiment Arc: {title_str}\nSmoothed SMA ({win_per}%)')
plt.grid(True)

### **Save Checkpoint**

In [None]:
# TODO: Norm all paths and subdirs as 'dir/dir/dir/' except for root: '/dir/dir/dir/'

global_vars.SUBDIR_SENTIMENT_CLEAN = 'sentiment_clean/sentiment_clean_novels_new_corpus2/'

print(f'{Path_to_SentimentArcs}{global_vars.SUBDIR_SENTIMENT_CLEAN}')

In [None]:
# Verify in SentimentArcs Root Directory
os.chdir(Path_to_SentimentArcs)

print('Currently in SentimentArcs root directory:')
!pwd

print(f'\nSaving Text_Type: {Corpus_Genre}')
print(f'     Corpus_Type: {Corpus_Type}')

# Verify Subdir to save Cleaned Texts and Texts into..

print(f'\nThese Text Titles:')
list(corpus_sentiment_dt.keys())

print(f'\n\nTo This Subdirectory:\n  {global_vars.SUBDIR_SENTIMENT_CLEAN}')

full_path = f'{Path_to_SentimentArcs}{global_vars.SUBDIR_SENTIMENT_CLEAN}'
print(f'\nFull path to this Subdirectory:\n  {full_path}')

# The name of the checkpoint file depends on the Corpus_Type selected at the top
if Corpus_Type == 'new':
  save_filename = f'sentiment_clean_{Corpus_Genre}_{Corpus_Type}_corpus{Corpus_Number}_all.json'
else:
  # NOTE(review): when Corpus_Type is 'reference' this yields
  #   '..._reference_reference_all.json' (the word appears twice) --
  #   confirm whether the doubled name is intended
  save_filename = f'sentiment_clean_{Corpus_Genre}_{Corpus_Type}_reference_all.json'
print(f'\nUnder this Filename:\n  {save_filename}')

write_dict_dfs(corpus_sentiment_dt, out_file=save_filename, out_dir=f'{global_vars.SUBDIR_SENTIMENT_CLEAN}')

In [None]:
# Verify json file created

!ls -altr $global_vars.SUBDIR_SENTIMENT_CLEAN

# **[STEP 4] Smoothing EDA**

In [None]:
from ipywidgets import interact, Dropdown, Select

In [None]:
# Dropdown to pick one Text of the Corpus (defaults to the first Text)
selected_text = widgets.Dropdown(
    options=corpus_texts_ls,
    value=corpus_texts_ls[0],
    description='Text:',
    disabled=False,
)
selected_text

# Dropdown to pick one Model: prefer 'roberta15lg' as the default but fall back
#   to the first available Model, because a default value that is not one of
#   the options is rejected by the widget
default_model_str = 'roberta15lg' if 'roberta15lg' in models_ls else models_ls[0]

selected_model = widgets.Dropdown(
    options=models_ls,
    value=default_model_str,
    description='Model:',
    disabled=False,
)
selected_model

## EDA: Multiple SMA Window Sizes

In [None]:
# Number of rows (sentences) in 1% of the selected Text
win_1per = int(1/100 * corpus_sentiment_dt[selected_text.value].shape[0])
# SMA window sizes to compare, each as a percentage of the Text length
win_range_ls = [5,10,15,20]

title_str = global_vars.corpus_titles_dt[selected_text.value][0]

for i, awin_size in enumerate(win_range_ls):
  # Use a window of at least 1 row: for a very short Text win_1per is 0
  win_size = max(1, awin_size * win_1per)

  # Label each curve with its own window size so the legend can tell them apart
  _ = corpus_sentiment_dt[selected_text.value][selected_model.value].rolling(win_size, center=True, min_periods=0).mean().plot(label=f'SMA {awin_size}%')

# Title, legend and grid only need to be set once, after all curves are drawn
_ = plt.title(f'Sentiment Arc: {title_str}\nModel: {selected_model.value}\nSmoothing: SMA ({win_range_ls}%)')
_ = plt.legend(loc='best')
plt.grid(True)


## EDA: One SMA Window Size

In [None]:
# Dropdown to pick one Text of the Corpus (defaults to the first Text)
selected_text = widgets.Dropdown(
    options=corpus_texts_ls,
    value=corpus_texts_ls[0],
    description='Text:',
    disabled=False,
)
selected_text

# Dropdown to pick one Model: prefer 'roberta15lg' as the default but fall back
#   to the first available Model, because a default value that is not one of
#   the options is rejected by the widget
default_model_str = 'roberta15lg' if 'roberta15lg' in models_ls else models_ls[0]

selected_model = widgets.Dropdown(
    options=models_ls,
    value=default_model_str,
    description='Model:',
    disabled=False,
)
selected_model

# Slider to pick the SMA window size as a percentage of the Text length
selected_sma_window = widgets.IntSlider(
    value=10,
    min=2,   # smallest selectable window (%)
    max=20,  # largest selectable window (%)
    step=1,
    description='SMA Win%'
)
selected_sma_window

In [None]:
win_size = int(selected_sma_window.value/100 * corpus_sentiment_dt[selected_text.value].shape[0])

title_str = global_vars.corpus_titles_dt[selected_text.value][0]

_ = corpus_sentiment_dt[selected_text.value][selected_model.value].rolling(win_size, center=True, min_periods=0).mean().plot()
_ = plt.title(f'Sentiment Arc: {title_str}\nModel: {selected_model.value}\nSmoothing: SMA ({selected_sma_window.value}%)')
plt.grid(True)


## LOWESS Smoothing

In [None]:
# Create DataFrame based on the SentimentArc (Text and Model) selected above

# Sentiment series of the selected Text and Model
# NOTE(review): this name was not assigned anywhere earlier in the Notebook; the
#   raw (unsmoothed) series is assumed here -- confirm that a smoothed series
#   was not intended
current_sentiment_arc_ser = corpus_sentiment_dt[selected_text.value][selected_model.value]

# Take the sentence number and text columns from the SAME Text as the sentiment
#   series (not from whichever Text an earlier loop happened to finish on)
df = pd.DataFrame(current_sentiment_arc_ser)
df.insert(0, 'sentence_no', corpus_sentiment_dt[selected_text.value]['sentence_no'])
df.insert(1, 'text_raw', corpus_sentiment_dt[selected_text.value]['text_raw'])
df.insert(2, 'text_clean', corpus_sentiment_dt[selected_text.value]['text_clean'])
df.head()

In [None]:
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization 
import statsmodels.api as sm # to build a LOWESS model
from scipy.interpolate import interp1d # for interpolation of new data points

In [None]:
# Create a scatter plot
fig = px.scatter(df, x=df['sentence_no'], y=df['roberta15lg'], opacity=0.8, color_discrete_sequence=['black'])

# Change chart background color
_ = fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
_ = fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

_ = fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
title_str = global_vars.corpus_titles_dt[selected_text.value][0]
title_all_str = f'{title_str} ({selected_model.value} SMA {selected_sma_window.value}%)'
_ = fig.update_layout(title=dict(text=title_all_str, font=dict(color='black')))

# Update marker size
_ = fig.update_traces(marker=dict(size=3))

fig.show()

In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess

In [None]:
# Number of rows (sentences) available for LOWESS smoothing; the x array is
#   only created in the next cell, so inspect the DataFrame it is built from
df.shape

In [None]:
# ------- Select variables -------
# y values: the sentiment scores to smooth
# NOTE(review): the column name is hard-coded; presumably it should follow the
#   Model selected in the widget above -- confirm
y=df['roberta15lg'].values

# x values for LOWESS: 0-based row position of each sentence
# x=df['sentence_no'].values 
x=np.arange(df.shape[0])


# ------- LOWESS -------
# Generate y_hat values using lowess, try a couple values for hyperparameters
#   (frac is the fraction of the data used to estimate each y value)
# The plotting cell reads column 0 of each result as x and column 1 as smoothed y
y_hat1 = lowess(y, x, frac=1/20) # note, default frac=2/3
y_hat2 = lowess(y, x, frac=1/30)

In [None]:
# Create a scatter plot
_ = fig = px.scatter(df, x='sentence_no', y='roberta15lg', custom_data=['text_raw'], opacity=0.3) # , color_discrete_sequence=['black'], size=1)

# Add the prediction line
# fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Linear Regression', line=dict(color='limegreen')))
_ = fig.add_traces(go.Scatter(x=y_hat1[:,0], y=y_hat1[:,1], name='LOWESS, frac=1/20', line=dict(color='red')))
_ = fig.add_traces(go.Scatter(x=y_hat2[:,0], y=y_hat2[:,1], name='LOWESS, frac=1/30', line=dict(color='orange')))

# Change chart background color
_ = fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
_ = fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

_ = fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
title_str = global_vars.corpus_titles_dt[selected_text.value][0]
title_all_str = f'{title_str} ({selected_model.value} SMA {selected_sma_window.value}%)'
_ = fig.update_layout(title=dict(text=title_all_str, font=dict(color='black')))

# Update marker size
_ = fig.update_traces(marker=dict(size=1))

# _ = fig.update_traces(mode="markers+lines", hovertemplate=None)
# _ = fig.update_layout(hovermode="x unified")

_ = fig.update_traces(
    hovertemplate="<br>".join([
        "Sentence No: %{x}",
        "Norm Sentiment: %{y}",
        "Text: %{customdata[0]}",
    ])
)

fig.show()

**[NOTE] In Graph Above carefully roll over only the red & orange lines to view corresponding Text (not blue)**

# **[STEP 5] Peak Detection & Crux Extraction**

## Peak Detection

## Crux Extraction

# **END OF NOTEBOOK**