# **Compute Sentiment Using 4 SyuzhetR and 7 SentimentR Models**

* https://www.youtube.com/watch?v=U3ByGh8RmSc

* https://github.com/ttimbers/intro-to-reticulate

[Use R on Google Colab!](https://colab.research.google.com/notebook#create=true&language=r)

# **[STEP 1] Configuration and Setup**

## Configure Jupyter Notebook

In [None]:
# Ignore warnings

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Configure Jupyter

# Enable multiple outputs from one code cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display
from IPython.display import Image
from ipywidgets import widgets, interactive

## [INPUT] Connect Google gDrive to this Jupyter Notebook

In [None]:
# [INPUT REQUIRED]: Authorize access to Google gDrive

# Connect this Notebook to your permanent Google Drive
#   so all generated output is saved to permanent storage there

# google.colab is only importable inside a Colab runtime, so a failed import
#   is used to detect that the notebook is running somewhere else.
# Catch ImportError only: a bare `except:` would also hide unrelated failures.
try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  print("Attempting to attach your Google gDrive to this Colab Jupyter Notebook")
  drive.mount('/gdrive')
else:
  # Nothing is mounted on this branch, so do not claim that the drive is attached
  print("Not running in Google Colab: skipping the Google gDrive mount")

In [None]:
!ls

In [None]:
# [CUSTOMIZE]: Change the text after the Unix '%cd ' command below (change directory)
#              to match the full path to your gDrive subdirectory which should be the
#              root directory cloned from the SentimentArcs github repo.

# NOTE: Make sure this subdirectory already exists and there are
#       no typos, spaces or illegal characters (e.g. periods) in the full path after %cd

# NOTE: Keep every directory name in the path simple: start with an upper or lowercase
#         letter and use only letters, numbers and underscores ('_') afterwards.
#         Make sure your full path after %cd obeys this constraint or errors may appear.



# Step #1: Get full path to SentimentArcs subdir on gDrive
# =======
#@markdown **Accept default path on gDrive or Enter new one:**

Path_to_SentimentArcs = "/gdrive/MyDrive/cdh/sentiment_arcs/" #@param ["/gdrive/MyDrive/sentiment_arcs/"] {allow-input: true}

#@markdown (e.g. /gdrive/MyDrive/research/sentiment_arcs/)



# Step #2: Move to Parent directory of Sentiment_Arcs
# =======
# Drops the last two '/'-separated pieces of the path (the project directory name and
#   the empty piece after the trailing '/'), so Path_to_SentimentArcs must end with '/'.
parentdir_sentiment_arcs = '/'.join(Path_to_SentimentArcs.split('/')[:-2])
print(f'subdir_parent: {parentdir_sentiment_arcs}')
%cd $parentdir_sentiment_arcs


# Step #3: If project sentiment_arcs subdir does not exist, 
#          clone it from github
# =======
# NOTE(review): the check below looks for a directory literally named 'sentiment_arcs',
#   whatever the last directory in Path_to_SentimentArcs is called - confirm they match.
import os

if not os.path.isdir('sentiment_arcs'):
  # NOTE: This will not work until SentimentArcs becomes an open sourced PUBLIC repo
  # !git clone https://github.com/jon-chun/sentiment_arcs.git

  # Test on open access github repo
  # NOTE(review): this clones the nabokov_palefire repo, presumably into ./nabokov_palefire,
  #   while Step #4 changes into ./sentiment_arcs - confirm which directory is intended.
  !git clone https://github.com/jon-chun/nabokov_palefire.git


# Step #4: Change into sentiment_arcs subdir
# =======
%cd ./sentiment_arcs
# Test on open access github repo
# %cd ./nabokov_palefire

# Step #5: Confirm contents of sentiment_arcs subdir
# =======
!ls


In [None]:
# [VERIFY]: Ensure that all the manually preprocessed novels are in plain text
#   files and that the file names are formatted correctly

# %cd ../sentiment_arcs
!pwd
!ls ./text_raw

## Define Directory Tree Structure

In [None]:
#@markdown **Sentiment Arcs Directory Structure** \
#@markdown \
#@markdown **1. Input Directories:** \
#@markdown (a) Raw textfiles in subdir: ./text_raw/(text_type)/  \
#@markdown (b) Cleaned textfiles in subdir: ./text_clean/(text_type)/ \
#@markdown \
#@markdown **2. Output Directories** \
#@markdown (1) Raw Sentiment time series datafiles and plots in subdir: ./sentiment_raw/(text_type) \
#@markdown (2) Cleaned Sentiment time series datafiles and plots in subdir: ./sentiment_clean/(text_type) \
#@markdown \
#@markdown **Which type of texts are you analyzing?** \

Text_Type = "novels" #@param ["novels", "social_media", "finance"]

#@markdown Please check that the required textfiles and datafiles exist in the correct subdirectories before continuing.




In [None]:
# Directory CONSTANTS: every subdirectory name is derived from the selected Text_Type

SUBDIR_TEXT_RAW = f"./text_raw/{Text_Type}_raw/"
SUBDIR_TEXT_CLEAN = f"./text_clean/{Text_Type}_clean/"
SUBDIR_SENTIMENT_RAW = f"./sentiment_raw/{Text_Type}_raw/"
SUBDIR_SENTIMENT_CLEAN = f"./sentiment_clean/{Text_Type}_clean/"
SUBDIR_PLOTS = f"./plots/{Text_Type}_plots/"

# Report the resulting directory structure so it can be checked by eye.
# Each entry already ends in '\n', so every printed line is followed by a blank line.
directory_report_ls = [
    'Verify the Directory Structure:\n',
    '-------------------------------\n',
    f'           [Corpus Type]: {Text_Type}\n',
    f'       [SUBDIR_TEXT_RAW]: {SUBDIR_TEXT_RAW}\n',
    f'     [SUBDIR_TEXT_CLEAN]: {SUBDIR_TEXT_CLEAN}\n',
    f'  [SUBDIR_SENTIMENT_RAW]: {SUBDIR_SENTIMENT_RAW}\n',
    f'[SUBDIR_SENTIMENT_CLEAN]: {SUBDIR_SENTIMENT_CLEAN}\n',
    f'          [SUBDIR_PLOTS]: {SUBDIR_PLOTS}\n',
]

for report_line in directory_report_ls:
  print(report_line)

## Read YAML Configuration File

In [None]:
!pip install pyyaml

In [None]:
import yaml

### Define Texts to Analyze

In [None]:
# Read SentimentArcs YAML Config Files for Different Corpora Types(3) and Text Files Details

def load_yaml_config(config_path):
  '''
  Read one SentimentArcs YAML config file and return its parsed contents.

  config_path: path to the *.yaml file, relative to the current working directory

  Returns whatever yaml.safe_load() produces for the file.
  Raises yaml.YAMLError if the file cannot be parsed. The error is reported and
  re-raised rather than swallowed, so the caller never continues with an
  undefined (or stale, from an earlier run) config variable.
  '''
  with open(config_path, "r", encoding="utf-8") as stream:
    try:
      return yaml.safe_load(stream)
    except yaml.YAMLError as exc:
      print(f'ERROR: could not parse {config_path}')
      print(exc)
      raise

# Novel Text Files
novels_dt = load_yaml_config("./config/novels_info.yaml")

# Finance Text Files
finance_dt = load_yaml_config("./config/finance_info.yaml")

# Social Media Text Files
social_dt = load_yaml_config("./config/social_info.yaml")

In [None]:
import json

In [None]:
# Verify the Corpora: Novel Textfiles in novels_dt

print (json.dumps(novels_dt, indent=2))

In [None]:
# Verify the Corpora: Finance Textfiles in finance_dt

print (json.dumps(finance_dt, indent=2))

In [None]:
# Verify the Corpora: Social Media Textfiles in social_dt

print (json.dumps(social_dt, indent=2))

## Define Globals

In [None]:
# TODO

## Install Libraries: R

In [None]:
# !pip install rpy2

In [None]:
# !pip install -U rpy2

In [None]:
# Load Jupyter rpy2 Extension  
#   enables the %%R magic commands

%load_ext rpy2.ipython

In [None]:
# %reload_ext rpy2.ipython

In [None]:
%%time 
%%capture 
%%R

# Install Syuzhet.R, Sentiment.R and Utility Libraries

# NOTE: 1m12s 
#       1m05s
#       (timings recorded for a full install of all four packages)

# Only install the packages that cannot be loaded yet, so re-running this
#   cell in the same runtime does not reinstall everything.
required_pkgs <- c('syuzhet', 'sentimentr', 'tidyverse', 'lexicon')
is_installed <- vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]

if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(syuzhet)
library(sentimentr)
library(tidyverse)
library(lexicon)

In [None]:
# %reload_ext rpy2.ipython

In [None]:
# Load Python libraries to exchange data with R Program Space and read R Datafiles

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

In [None]:
%%R

# Verify R in Kernel Version

R.version.string

In [None]:
%%R

# Verify R Kernel Session Info

sessionInfo()

In [None]:
%%R

# Verify R Kernel Environment

# Sys.getenv


## Install Libraries: Python

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
from glob import glob
import copy
import json

## Setup Matplotlib Style

* https://matplotlib.org/stable/tutorials/introductory/customizing.html

In [None]:
from cycler import cycler

# Ten hex colours paired one-to-one with ten line styles; the four basic
#   line styles repeat so that both lists have the same length.
colors = [
    '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
    '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
]
linestyles = ['-', '--', ':', '-.'] * 2 + ['-', '--']

# Adding two cyclers of equal length advances them together
color_cycle = plt.cycler("color", colors)
linestyle_cycle = plt.cycler("linestyle", linestyles)
cycle = color_cycle + linestyle_cycle

# Heading for the previous matplotlib configuration (nothing is printed under it)
print('\n Old Matplotlib Configurtion Settings:\n')
# plt.rc.show
print('\n\n')

# Heading for the new matplotlib configuration, then apply the new settings
print('\n New Matplotlib Configurtion Settings:\n')
myparams = {'axes.prop_cycle': cycle}
plt.rcParams.update(myparams)

# Figure size and font sizes, applied in a single update
size_params = {
    "axes.titlesize": 16,
    'figure.figsize': (20, 10),
    "legend.fontsize": 10,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "axes.labelsize": 12,
}
plt.rcParams.update(size_params)



In [None]:
"""
import matplotlib.colors as mcolors

mcolors.TABLEAU_COLORS

all_named_colors = {}
all_named_colors.update(mcolors.TABLEAU_COLORS)

print('\n')
all_named_colors.values()
""";

In [None]:
# Default figure size as a (width, height) tuple, stored in matplotlib's rcParams

new_plt_size = (20, 10)
plt.rcParams["figure.figsize"] = new_plt_size

print(" New figure size: ", new_plt_size)

## Setup Seaborn Style

In [None]:
# View previous seaborn configuration
print('\n Old Seaborn Configurtion Settings:\n')
sns.axes_style()
print('\n\n')

# Update and View new seaborn configuration
print('\n New Seaborn Configurtion Settings:\n')
# sns.set_style('white')
sns.set_context('paper')
sns.set_style('white')
sns.set_palette('tab10')

# Change defaults
# sns.set(style='white', context='talk', palette='tab10')

In [None]:
# Seaborn: Set Theme (Scale of Font)

sns.set_theme('paper')  # paper, notebook, talk, poster


# Seaborn: Set Context
# sns.set_context("notebook")



# Seaborn: Set Style

# sns.set_style('ticks') # darkgrid, whitegrid, dark, white, and ticks

In [None]:
# Seaborn: Default Palette (Pastel?)

sns.color_palette()

In [None]:
# Seaborn: Set to High-Contrast Palette (more Vision Impaired Friendly)

sns.set_palette('tab10')
sns.color_palette()

In [None]:
plt.style.available

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-<style>' and
#   later releases dropped the old names, so use whichever name this version offers.
whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
whitegrid_style = next(
    (style_name for style_name in whitegrid_candidates if style_name in plt.style.available),
    None,
)

if whitegrid_style is None:
  print('WARNING: no seaborn whitegrid style available in this matplotlib version; keeping the current style')
else:
  plt.style.use(whitegrid_style)

## Python Utility Functions

In [None]:
# Utility functions to read/write nested Dictionary (key=novel) of DataFrames (Cols = Model Sentiment Series) 

def write_dict_dfs(adict, out_file='sentiments.json', out_dir=SUBDIR_SENTIMENT_RAW):
  '''
  Write a Dictionary of DataFrames to disk as one nested json file.

  adict:    Dictionary mapping a key (e.g. a novel name) to a pandas DataFrame
  out_file: name of the json file to create
  out_dir:  directory to write into (created if it does not exist yet);
            a trailing '/' is optional

  Each DataFrame is stored as a list of row records (one dict per row).
  Returns None.
  '''
  import os

  # convert dataframes into dictionaries
  data_dict = {
      key: adf.to_dict(orient='records')
      for key, adf in adict.items()
  }

  # make sure the destination exists ('' means the current directory)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  # write to disk
  out_fullpath = os.path.join(out_dir, out_file)
  print(f'Saving file to: {out_fullpath}')
  with open(out_fullpath, 'w') as fp:
    json.dump(
      data_dict, 
      fp, 
      indent=4, 
      sort_keys=True
    )

  return 

def read_dict_dfs(in_file='sentiments.json', in_dir=SUBDIR_SENTIMENT_RAW):
  '''
  Read a nested json file into a Dictionary of DataFrames.

  in_file: name of the json file to read
  in_dir:  directory that holds the file; a trailing '/' is optional

  The file is expected to map each key to a list of row records, as written
  by write_dict_dfs(). Returns a Dictionary with one pandas DataFrame per key.
  '''
  import os

  # read from disk
  in_fullpath = os.path.join(in_dir, in_file)
  with open(in_fullpath, 'r') as fp:
      data_dict = json.load(fp)

  # convert dictionaries into dataframes
  all_dt = {
      key: pd.DataFrame(records)
      for key, records in data_dict.items()
  }

  return all_dt

# **[STEP 2] Read all Preprocessed Novels**

In [None]:
!pwd

In [None]:
SUBDIR_TEXT_CLEAN

In [None]:
!ls $SUBDIR_TEXT_CLEAN

In [None]:
# Create a List (preprocessed_ls) of all preprocessed text files

import os

# Keep only the file name without directory or '.csv' extension.
# splitext() removes just the final extension, so names that contain
#   other periods are no longer cut off at the first '.'.
preprocessed_ls = sorted(
    os.path.splitext(os.path.basename(csv_fullpath))[0]
    for csv_fullpath in glob(f'{SUBDIR_TEXT_CLEAN}*.csv')
)

# glob() returns an empty list (it does not raise) when nothing matches,
#   so the missing-file case has to be checked explicitly
if not preprocessed_ls:
    raise RuntimeError('No csv file found')

print('\n'.join(preprocessed_ls))
print('\n')
print(f'Found {len(preprocessed_ls)} Preprocessed files in {SUBDIR_TEXT_CLEAN}')

In [None]:
# Build the master Dictionary (corpus_dt): one DataFrame per preprocessed
#   text file, keyed by the file name without its '.csv' extension

corpus_dt = {}

for file_no, text_name in enumerate(preprocessed_ls):
  csv_fullpath = f'{SUBDIR_TEXT_CLEAN}{text_name}.csv'
  print(f'Processing #{file_no}: {text_name}...')
  print(f'               {csv_fullpath}')
  corpus_dt[text_name] = pd.read_csv(csv_fullpath)

In [None]:
# Verify the novels read into master Dictionary of DataFrames

corpus_dt.keys()
print('\n')
print(f'There were {len(corpus_dt)} preprocessed novels read into the Dict corpus_dt')

In [None]:
# Report, for every novel, how many text_clean entries are Null (if any)

for novel_no, (novel_name, novel_df) in enumerate(corpus_dt.items()):
  print(f'\nNovel #{novel_no}: {novel_name}')
  null_ct = novel_df['text_clean'].isna().sum()
  if null_ct <= 0:
    # nothing to report for this novel
    continue
  print(f'      {null_ct} Null strings in the text_clean column')

In [None]:
# Fill in all the Null values of text_clean with placeholder 'empty_string'

for anovel in corpus_dt:
  # Fill all text_clean == Null with 'empty_string' so sentimentr::sentiment doesn't break.
  # Only the text_clean column is touched: assigning through a boolean row mask
  #   would overwrite every column of the affected rows with 'empty_string'.
  corpus_dt[anovel]['text_clean'] = corpus_dt[anovel]['text_clean'].fillna('empty_string')

In [None]:
# Verify one DataFrame in the master Dictionary

corpus_dt['dbrown_thedavincicode'].head()

# **[STEP 3] Get Sentiments with SyuzhetR (4 Models)**

## Option (a): Read Previously Computed SyuzhetR Values from Datafiles

In [None]:
# Read in Saved SyuzhetR Datafile from subdir_sentiments/all_4syuzhetr.json

corpus_syuzhetr_dt = read_dict_dfs('all_4syuzhetr.json')
corpus_syuzhetr_dt.keys()

In [None]:
# Verify all the Novels have 4 Syuzhet Model Values

for i, anovel in enumerate(list(corpus_syuzhetr_dt.keys())):
  print(f'Novel #{i}: {anovel}')
  # Drop the leftover 'Unnamed: 0' column if it is present.
  # errors='ignore' keeps this cell safe to re-run and safe for datafiles
  #   that never contained the column.
  corpus_syuzhetr_dt[anovel] = corpus_syuzhetr_dt[anovel].drop(columns=['Unnamed: 0'], errors='ignore')
  print(f'      df.shape: {corpus_syuzhetr_dt[anovel].shape}')

In [None]:
# Verify DataFrame for test novel

novel_str = 'cdickens_achristmascarol'
corpus_syuzhetr_dt[novel_str].head()

## Option (b): Compute New SyuzhetR Values

In [None]:
# Verify text_clean of sample text

text_sample = 'cdickens_achristmascarol'

corpus_dt[text_sample]['text_clean'].to_list()[:10]

In [None]:
%%time

# Apply each of the 4 Syuzhet Models to every Novel in corpus_dt and store
#   the scores as new 'syuzhetr_*' columns of that Novel's DataFrame

# NOTE:  9m45s 23:30 on 20220114 Colab Pro (33 Novels)
#       28:32s 21:06 on 20220226 Colab Pro (33 Novels)

syuzhet = importr('syuzhet')

# (new column name, syuzhet::get_sentiment method) in the order the columns are added
syuzhetr_models_ls = (
    ('syuzhetr_syuzhet', 'syuzhet'),
    ('syuzhetr_bing', 'bing'),
    ('syuzhetr_afinn', 'afinn'),
    ('syuzhetr_nrc', 'nrc'),
)

novels_keys_ls = sorted(corpus_dt.keys())
for novel_no, novel_name in enumerate(novels_keys_ls):
  print(f'Processing Novel #{novel_no}: {novel_name}...')
  novel_df = corpus_dt[novel_name]
  # the text does not change between models, so build the list once per novel
  text_clean_ls = novel_df['text_clean'].to_list()
  for col_name, method_name in syuzhetr_models_ls:
    novel_df[col_name] = syuzhet.get_sentiment(text_clean_ls, method=method_name)

## Checkpoint: Save SyuzhetR Values

In [None]:
# Verify in SentimentArcs Root Directory

!pwd
print('\n')
!ls

In [None]:
# Verify Save Destination Subdir: SUBDIR_SENTIMENT_RAW

SUBDIR_SENTIMENT_RAW
print('\n')
!ls $SUBDIR_SENTIMENT_RAW

In [None]:
corpus_dt.keys()

In [None]:
corpus_dt['cdickens_achristmascarol']

In [None]:
# Save sentiment values to subdir_sentiments

write_dict_dfs(corpus_dt, out_file='all_4syuzhetr.json', out_dir=SUBDIR_SENTIMENT_RAW)

In [None]:
# Verify Dictionary was saved correctly by reading back the *.json datafile

test_dt = read_dict_dfs(in_file='all_4syuzhetr.json', in_dir=SUBDIR_SENTIMENT_RAW)
test_dt.keys()

## Plot SyuzhetR 4 Models

In [None]:
#@markdown Select option to save plots:
Save_Raw_Plots = True #@param {type:"boolean"}

Save_Smooth_Plots = True #@param {type:"boolean"}
Resolution = "300" #@param ["100", "300"]



In [None]:
# Get Col Names for all 4 SyuzhetR Models

cols_all_ls = corpus_dt['cdickens_achristmascarol'].columns
cols_syuzhetr_ls = [x for x in cols_all_ls if 'syuzhetr_' in x]
cols_syuzhetr_ls

In [None]:
novels_dt['cdickens_achristmascarol'][0]

In [None]:
SUBDIR_PLOTS

In [None]:
# Verify 4 SyuzhetR Models with Plots

import os

# The plot files are written into SUBDIR_PLOTS, so make sure it exists
if Save_Raw_Plots or Save_Smooth_Plots:
  os.makedirs(SUBDIR_PLOTS, exist_ok=True)

for i, anovel in enumerate(list(corpus_dt.keys())):

  novel_title = novels_dt[anovel][0]
  print(f'Novel #{i}: {novel_title}')

  # Raw Sentiments 
  ax_raw = corpus_dt[anovel][cols_syuzhetr_ls].plot(title=f'{novel_title}\n SyuzhetR 4 Models: Raw Sentiments', alpha=0.3)
  fig_raw = ax_raw.get_figure()

  # Save BEFORE plt.show(): once the figure has been shown, plt.savefig() no longer
  #   has it as the current figure and writes an empty image instead.
  if Save_Raw_Plots:
    fig_raw.savefig(f'{SUBDIR_PLOTS}plot_syuzhetr_raw_{anovel}_dpi{Resolution}.png', dpi=int(Resolution))

  plt.show();
  plt.close(fig_raw)  # release the figure; this loop creates two figures per novel

  
  # Smoothed Sentiments (SMA 10%)
  # A window of at least 1 keeps the rolling mean defined for texts shorter than 10 rows
  win_10per = max(1, int(corpus_dt[anovel].shape[0] * 0.1))
  ax_smooth = corpus_dt[anovel][cols_syuzhetr_ls].rolling(win_10per, center=True, min_periods=0).mean().plot(title=f'{novel_title}\n SyuzhetR 4 Models: Smoothed Sentiments (SMA 10%)', alpha=0.3)
  fig_smooth = ax_smooth.get_figure()

  if Save_Smooth_Plots:
    fig_smooth.savefig(f'{SUBDIR_PLOTS}plot_syuzhetr_smooth10sma_{anovel}_dpi{Resolution}.png', dpi=int(Resolution))

  plt.show();
  plt.close(fig_smooth)



# **[STEP 4] Get Sentiments with SentimentR (7 Models)**

## Option (a): Read Previously Computed SentimentR Values from Datafile

In [None]:
# Read in Saved SentimentR Datafile from SUBDIR_SENTIMENT_RAW/all_7sentimentr.json

corpus_sentimentr_dt = read_dict_dfs('all_7sentimentr.json')
corpus_sentimentr_dt.keys()

In [None]:
# Verify the SentimentR Model Values read in for all the Novels

for i, anovel in enumerate(list(corpus_sentimentr_dt.keys())):
  print(f'Novel #{i}: {anovel}')
  # Drop the leftover 'Unnamed: 0' column if it is present.
  # errors='ignore' keeps this cell safe to re-run and safe for datafiles
  #   that never contained the column.
  corpus_sentimentr_dt[anovel] = corpus_sentimentr_dt[anovel].drop(columns=['Unnamed: 0'], errors='ignore')
  print(f'      df.shape: {corpus_sentimentr_dt[anovel].shape}')

In [None]:
# Verify DataFrame for test novel

novel_str = 'cdickens_achristmascarol'
corpus_sentimentr_dt[novel_str].head()

## Option (b): Compute New SentimentR Values

Call function in external get_sentimentr.R from within Python Loop

* https://medium.com/analytics-vidhya/calling-r-from-python-magic-of-rpy2-d8cbbf991571

* https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html

In [None]:
%%file get_sentimentr.R

library(sentimentr)
library(lexicon)

# Score a character vector with the sentimentr lexicons listed below.
#
# s_v: character vector of cleaned text, one element per row of the result
#
# Returns a data.frame with the column 'text_clean' (= s_v) followed by one
# 'sentimentr_<lexicon>' column of sentiment values per lexicon.
#
# NOTE(review): sentimentr::sentiment() returns one row per sentence, while
#   data.frame() below needs exactly one value per element of s_v - confirm
#   that every element of s_v is treated as a single sentence.
get_sentimentr_values <- function(s_v) {

  # Run sentiment() on s_v with one polarity lexicon and return its sentiment column.
  # All lexicons share the same settings, so they are written down only once.
  # 'hyphen' is the argument name sentimentr uses; the misspelling 'hypen'
  #   was silently swallowed by '...' and never reached the function.
  score_with <- function(model_name, polarity_dt) {
    print(paste0('Processing ', model_name))
    scored <- sentiment(s_v, polarity_dt=polarity_dt,
                        hyphen="", amplifier.weight=0.8, n.before=5, n.after=2,
                        adversative.weight=0.25, neutral.nonverb.like = FALSE, missing_value = 0)
    return(scored$sentiment)
  }

  sentimentr_jockersrinker <- score_with('sentimentr_jockersrinker', lexicon::hash_sentiment_jockers_rinker)
  sentimentr_jockers <- score_with('sentimentr_jockers', lexicon::hash_sentiment_jockers)
  sentimentr_huliu <- score_with('sentimentr_huliu', lexicon::hash_sentiment_huliu)
  sentimentr_nrc <- score_with('sentimentr_nrc', lexicon::hash_sentiment_nrc)
  sentimentr_senticnet <- score_with('sentimentr_senticnet', lexicon::hash_sentiment_senticnet)
  sentimentr_sentiword <- score_with('sentimentr_sentiword', lexicon::hash_sentiment_sentiword)
  sentimentr_loughran_mcdonald <- score_with('sentimentr_loughran_mcdonald', lexicon::hash_sentiment_loughran_mcdonald)
  sentimentr_socal_google <- score_with('sentimentr_socal_google', lexicon::hash_sentiment_socal_google)

  anovel_sentimentr_df <- data.frame('text_clean' = s_v,
                                'sentimentr_jockersrinker' = sentimentr_jockersrinker,
                                'sentimentr_jockers' = sentimentr_jockers,
                                'sentimentr_huliu' = sentimentr_huliu,
                                'sentimentr_nrc' = sentimentr_nrc,
                                'sentimentr_senticnet' = sentimentr_senticnet,
                                'sentimentr_sentiword' = sentimentr_sentiword,
                                'sentimentr_loughran_mcdonald' = sentimentr_loughran_mcdonald,
                                'sentimentr_socal_google' = sentimentr_socal_google
                                )
  return(anovel_sentimentr_df)

}

In [None]:
# Verify the *.R file above was written correctly

!cat get_sentimentr.R

In [None]:
# Setup python robject with external library::function()
# https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html

# import rpy2.robjects as robjects

# Defining the R script and loading the instance in Python
# from rpy2.robjects import pandas2ri 
r = robjects.r

# Loading the function we have defined in R.
r['source']('get_sentimentr.R')

# Reading and processing data
get_sentimentr_function_r = robjects.globalenv['get_sentimentr_values']

In [None]:
# Test

# Convert Python List of Strings to a R vector of characters
test_ls = corpus_dt['cdickens_achristmascarol']['text_clean'].to_list()
s_v = robjects.StrVector(test_ls)
type(s_v)

get_sentimentr_function_r(s_v)

In [None]:
novels_dt.keys()

In [None]:
text_clean_ct = corpus_dt['dbrown_thedavincicode'].text_clean.isna().sum()
text_clean_ct
# len(text_clean_ls.isnull())

**[RE-EXECUTE] May have to re-execute following code cell several times**

In [None]:
%whos dict

In [None]:
%%time

# NOTE: 8m19s 13 Novels 
#      16m39s 19 Novels
#     -----------------
#      24m58s 32 Novels

# Call external get_sentimentr::get_sentimentr_values with Python loop over all novels

anovel_df = pd.DataFrame()

novels_keys_ls = list(corpus_dt.keys())
novels_keys_ls.sort()
# for i, anovel in enumerate(novels_keys_ls[:19]):
for i, anovel in enumerate(novels_keys_ls):  
  print(f'\nProcessing Novel #{i}: {anovel}')
  print(f'     {corpus_dt[anovel].shape}')
  # Get text_clean as list of strings
  text_clean_ls = corpus_dt[anovel]['text_clean'].to_list()

  # Convert Python List of Strings to a R vector of characters
  # https://rpy2.github.io/doc/v3.0.x/html/generated_rst/pandas.html
  s_v = robjects.StrVector(text_clean_ls)
  anovel_df_r = get_sentimentr_function_r(s_v)

  # Convert rpy2.robjects.vectors.DataFrame to pandas.core.frame.DataFrame
  # https://stackoverflow.com/questions/20630121/pandas-how-to-convert-r-dataframe-back-to-pandas 
  print(f'type(anovel_df_r): {type(anovel_df_r)}')
  anovel_df = pd.DataFrame.from_dict({ key : np.asarray(anovel_df_r.rx2(key)) for key in anovel_df_r.names })
  print(f'type(anovel_df): {type(anovel_df)}')

  # Save Results
  # anovel_df holds the scores of the current novel only and its columns are the
  #   model names, so it is indexed by model name (not by novel name).
  # to_numpy() copies the values by position, independent of either DataFrame's index.
  for amodel in anovel_df.columns:
    if amodel.startswith('sentimentr_'):
      corpus_dt[anovel][amodel] = anovel_df[amodel].to_numpy()

In [None]:
# The computed sentiment columns live in corpus_dt (Dictionary of DataFrames);
#   novels_dt only holds the text details read from the YAML config file
cols_sentimentr_ls = [x for x in corpus_dt['cdickens_greatexpectations'].columns if 'sentimentr_' in x]
cols_sentimentr_ls

In [None]:
# Verify that every SentimentR model column is present for every novel in corpus_dt.
# novels_dt holds the YAML text details (not DataFrames), so there are no sentiment
#   values to copy from it; this cell only checks the columns already in corpus_dt.
for i, anovel in enumerate(novels_keys_ls):
  print(f'Novel #{i}: {anovel}')
  for j, amodel in enumerate(cols_sentimentr_ls):
    print(f'           Model #{j}: {amodel}')
    if amodel not in corpus_dt[anovel].columns:
      raise KeyError(f'{amodel} missing for {anovel}: re-run the SentimentR computation cell')

In [None]:
corpus_dt['cdickens_greatexpectations'].head()

In [None]:
len(corpus_dt)

## Checkpoint: Save SentimentR Values

In [None]:
# Verify in SentimentArcs Root Directory

!pwd
print('\n')
!ls

In [None]:
# Verify Save Destination Subdir: SUBDIR_SENTIMENT_RAW

SUBDIR_SENTIMENT_RAW
print('\n')
!ls $SUBDIR_SENTIMENT_RAW

In [None]:
corpus_dt.keys()

In [None]:
corpus_dt['cdickens_achristmascarol']

In [None]:
# Save sentiment values to subdir_sentiments

write_dict_dfs(corpus_dt, out_file='all_7sentimentr.json', out_dir=SUBDIR_SENTIMENT_RAW)

In [None]:
# Verify Dictionary was saved correctly by reading back the *.json datafile

test_dt = read_dict_dfs(in_file='all_7sentimentr.json', in_dir=SUBDIR_SENTIMENT_RAW)
test_dt.keys()

In [None]:
test_dt['cdickens_greatexpectations'].columns

## Plot SentimentR 7 Models

In [None]:
#@markdown Select option to save plots:
Save_Raw_Plots = True #@param {type:"boolean"}

Save_Smooth_Plots = True #@param {type:"boolean"}
Resolution = "100" #@param ["100", "300"]



In [None]:
# Get Col Names for all SentimentR Models
cols_all_ls = corpus_dt['cdickens_achristmascarol'].columns
cols_sentimentr_ls = [x for x in cols_all_ls if 'sentimentr_' in x]
cols_sentimentr_ls

In [None]:
novels_dt['cdickens_achristmascarol'][0]

In [None]:
SUBDIR_PLOTS

In [None]:
novels_dt['cdickens_greatexpectations']

In [None]:
# Verify 7 SentimentR Models with Plots

import os

# The plot files are written into SUBDIR_PLOTS, so make sure it exists
if Save_Raw_Plots or Save_Smooth_Plots:
  os.makedirs(SUBDIR_PLOTS, exist_ok=True)

for i, anovel in enumerate(list(corpus_dt.keys())):

  novel_title = novels_dt[anovel][0]
  print(f'Novel #{i}: {novel_title}')

  # Raw Sentiments 
  ax_raw = corpus_dt[anovel][cols_sentimentr_ls].plot(title=f'{novel_title}\n SentimentR 7 Models: Raw Sentiments', alpha=0.3)
  fig_raw = ax_raw.get_figure()

  # Save BEFORE plt.show(): once the figure has been shown, plt.savefig() no longer
  #   has it as the current figure and writes an empty image instead.
  if Save_Raw_Plots:
    fig_raw.savefig(f'{SUBDIR_PLOTS}plot_sentimentr_raw_{anovel}_dpi{Resolution}.png', dpi=int(Resolution))

  plt.show();
  plt.close(fig_raw)  # release the figure; this loop creates two figures per novel

  
  # Smoothed Sentiments (SMA 10%)
  # A window of at least 1 keeps the rolling mean defined for texts shorter than 10 rows
  win_10per = max(1, int(corpus_dt[anovel].shape[0] * 0.1))
  ax_smooth = corpus_dt[anovel][cols_sentimentr_ls].rolling(win_10per, center=True, min_periods=0).mean().plot(title=f'{novel_title}\n SentimentR 7 Models: Smoothed Sentiments (SMA 10%)', alpha=0.3)
  fig_smooth = ax_smooth.get_figure()

  if Save_Smooth_Plots:
    fig_smooth.savefig(f'{SUBDIR_PLOTS}plot_sentimentr_smooth10sma_{anovel}_dpi{Resolution}.png', dpi=int(Resolution))

  plt.show();
  plt.close(fig_smooth)



# **END OF NOTEBOOK**

---