In [1]:
!pip install empath
!pip install vaderSentiment

# NLTK data used further down: 'punkt' for word_tokenize, 'stopwords' for the English stop-word list.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import pandas as pd
# from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from empath import Empath
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Mount Google Drive (Colab only) so the '/content/drive/...' paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Collecting empath
[?25l  Downloading https://files.pythonhosted.org/packages/d2/84/a5de61a99252f60d705d7982b3648db517a704c89fa7629d3d3637a6e604/empath-0.89.tar.gz (57kB)
[K     |█████▊                          | 10kB 20.6MB/s eta 0:00:01[K     |███████████▍                    | 20kB 1.7MB/s eta 0:00:01[K     |█████████████████               | 30kB 2.3MB/s eta 0:00:01[K     |██████████████████████▊         | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████████████▍   | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.0MB/s 
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-cp36-none-any.whl size=57824 sha256=57eb527d7b1396fbb1e04b09044fdef648e25ed1263d87bd201de24b10b79958
  Stored in directory: /root/.cache/pip/wheels/84/ea/2f/2bc54d4f9985ce61753ebc5b00cb2df51d855589267c667308
Successfully built empath
Installing collected packages: empath
Su

In [0]:
def add_prefix_to_dict_keys(dict, prefix):
  """Build a new mapping whose keys are ``'<prefix>_<key>'`` for every key in ``dict``.

  Args:
    dict: Mapping to copy; it is read via ``.items()`` and left unmodified.
    prefix: Value placed before each key, separated from it by an underscore.

  Returns:
    A new ``dict`` holding the original values under the prefixed keys.
  """
  prefixed = {}
  for key, value in dict.items():
    prefixed[f'{prefix}_{key}'] = value
  return prefixed

In [0]:
def calc_similarity_between_strings(all_data_df_row):
  """Cosine similarity between a row's prompt and response, over binary token sets.

  Both texts are tokenized with ``word_tokenize``, English stop words are
  removed, and each text is treated as a 0/1 vector over the union of the
  remaining tokens. For such vectors the cosine reduces to
  ``|A & B| / sqrt(|A| * |B|)``, which is what is computed here.

  Args:
    all_data_df_row: Row-like object (e.g. a DataFrame row or dict) exposing
      the text under the keys ``'input'`` and ``'output_list'``.

  Returns:
    A float in ``[0.0, 1.0]``. ``0.0`` is returned when either text has no
    tokens left after stop-word removal (the cosine is undefined there and
    the original formula divided by zero).
  """
  # A set gives O(1) membership tests instead of scanning the stop-word list per token.
  stop_words = set(stopwords.words('english'))

  # NOTE(review): tokens are not lower-cased, so capitalised stop words ("I", "The")
  # survive the filter. Left as-is to keep existing scores unchanged.
  prompt_tokens = {w for w in word_tokenize(all_data_df_row['input']) if w not in stop_words}
  response_tokens = {w for w in word_tokenize(all_data_df_row['output_list']) if w not in stop_words}

  # Guard the degenerate case: an empty vector has zero norm.
  if not prompt_tokens or not response_tokens:
    return 0.0

  # Dot product of two binary vectors is the size of the intersection;
  # each vector's squared norm is the size of its token set.
  shared = len(prompt_tokens & response_tokens)
  return shared / float((len(prompt_tokens) * len(response_tokens)) ** 0.5)

In [0]:
class SeanceScorer():
  """Counts how many tokens of a text fall into each word list of a lexicon directory.

  Three files are read from ``seance_dir``:

  * ``inquirerbasic.txt`` - one category per line, tab-separated: the first
    field is the category name, the remaining fields are its words.
  * ``negative_words.txt`` / ``positive_words.txt`` - one word per line,
    stored under the categories ``'negative_words'`` / ``'positive_words'``.

  Attributes:
    seance_dict: Maps category name -> list of words for that category.
  """

  # Location used when no directory is passed (the notebook's Google Drive layout).
  DEFAULT_SEANCE_DIR = '/content/drive/My Drive/5_datascience/2_context_chatbot/data/inputs/seance'

  def __init__(self, seance_dir=DEFAULT_SEANCE_DIR):
    """Load all word lists from ``seance_dir`` into ``self.seance_dict``.

    Args:
      seance_dir: Directory containing the three lexicon files. Defaults to
        the path the notebook originally hard-coded.

    Raises:
      OSError: If any of the three files cannot be opened.
    """
    import os  # local import: the notebook's import cell does not bring in os

    self.seance_dict = {}

    # add in inquirerbasic.txt
    for line in self._read_lines(os.path.join(seance_dir, 'inquirerbasic.txt')):
      word_list = line.split('\t')
      self.seance_dict[f'{word_list[0]}'] = word_list[1:]

    # add in negative_words.txt
    self.seance_dict['negative_words'] = self._read_lines(os.path.join(seance_dir, 'negative_words.txt'))

    # add in positive_words.txt
    self.seance_dict['positive_words'] = self._read_lines(os.path.join(seance_dir, 'positive_words.txt'))

  @staticmethod
  def _read_lines(path):
    """Return the non-blank lines of the text file at ``path``."""
    with open(path, 'r') as content_file:
      content = content_file.read()
    # Blank lines (e.g. from a trailing newline) are dropped: in the category
    # file they would otherwise create a category with an empty name.
    return [line for line in content.split('\n') if line.strip()]

  def calc_seance_scores(self, text_to_score, type_scored):
    """Count, per category, how many whitespace-separated tokens of the text are in its word list.

    Matching is exact and case-sensitive on the tokens produced by
    ``str.split()``; punctuation attached to a token is not stripped.

    Args:
      text_to_score: The text to score.
      type_scored: Label used as the key prefix (the notebook passes
        ``'prompt'`` or ``'response'``).

    Returns:
      Dict mapping ``'<type_scored>_s_<category>'`` to the token count, with
      one entry per category in ``self.seance_dict``.
    """
    # Token frequencies do not depend on the category, so compute them once.
    counted = Counter(text_to_score.split())

    scores_dict = {}
    for dict_key, cur_list in self.seance_dict.items():
      # set() so a word listed twice in a category is not counted twice.
      scores_dict[f'{type_scored}_s_{dict_key}'] = sum(counted[wrd] for wrd in set(cur_list))
    return scores_dict
      

Example of seance scores.

# Grade all outputs with SEANCE, Empath, VADER, and prompt-response cosine similarity.

In [0]:
# Load the human-graded chatbot outputs and discard the two unnamed columns.
# reset_index(drop=True) renumbers rows 0..n-1 without first materialising the old
# index as a column (the previous reset_index()/drop('index') round trip would have
# removed a genuine data column named 'index' had one existed).
graded_outputs = (
    pd.read_csv('/content/drive/My Drive/5_datascience/2_context_chatbot/data/graded/all_outputs_NCJ_ratings.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 5'])
    .reset_index(drop=True)
)

In [6]:
# Preview the first rows of the loaded ratings table.
graded_outputs.head()

Unnamed: 0,input,output_list,temp_list,NCJ_Rating
0,"I suffer from debilitating social anxiety, whi...",I think that's a very different thought.,0.4,5
1,"I suffer from debilitating social anxiety, whi...","I mean, I think that's kind of the way you're ...",0.4,8
2,"I suffer from debilitating social anxiety, whi...","No, no, no.",0.4,2
3,"I suffer from debilitating social anxiety, whi...",That's not what I mean.,0.4,1
4,"I suffer from debilitating social anxiety, whi...","I'm not saying you will be alone forever, but ...",0.4,0


In [0]:
# Initialize scorers
seance_scorer = SeanceScorer() # from custom made seance scorer
lexicon = Empath() # from empath library
analyzer = SentimentIntensityAnalyzer()


# One single-row DataFrame per graded row, labelled with that row's index so the
# frames can be stacked and then aligned column-wise with graded_outputs.
all_scores_list = []

# Iterate the frame's own index alongside the two text columns instead of using
# chained label lookups (graded_outputs[col][i]), which assume a 0..n-1 index.
for i, prompt, response in zip(graded_outputs.index, graded_outputs['input'], graded_outputs['output_list']):
    # Collect every feature for this row in one dict and build one DataFrame,
    # rather than six one-row frames plus a concat. Insertion order fixes the
    # column order: seance, then empath, then vader; prompt before response.
    row_scores = {}
    row_scores.update(seance_scorer.calc_seance_scores(prompt, type_scored='prompt'))
    row_scores.update(seance_scorer.calc_seance_scores(response, type_scored='response'))
    row_scores.update(add_prefix_to_dict_keys(lexicon.analyze(prompt), prefix='prompt_e'))
    row_scores.update(add_prefix_to_dict_keys(lexicon.analyze(response), prefix='response_e'))
    row_scores.update(add_prefix_to_dict_keys(analyzer.polarity_scores(prompt), prefix='prompt_v'))
    row_scores.update(add_prefix_to_dict_keys(analyzer.polarity_scores(response), prefix='response_v'))
    all_scores_list.append(pd.DataFrame(row_scores, index=[i]))

In [0]:
# Stack the per-row score frames, then attach them column-wise to the graded rows
# (alignment is by index label).
all_scores = pd.concat(all_scores_list)
all_data = pd.concat([graded_outputs, all_scores], axis=1)

# The similarity function only reads 'input' and 'output_list', so apply it over
# those two columns instead of building a Series of every score column per row.
all_data['prompt_response_similarity'] = (
    all_data[['input', 'output_list']].apply(calc_similarity_between_strings, axis=1)
)

In [9]:
# Preview the combined table: graded columns, per-row scores, and the similarity column.
all_data.head(5)

Unnamed: 0,input,output_list,temp_list,NCJ_Rating,prompt_s_Positiv_GI,prompt_s_Negativ_GI,prompt_s_Pstv_GI,prompt_s_Affil_GI,prompt_s_Ngtv_GI,prompt_s_Hostile_GI,prompt_s_Strong_GI,prompt_s_Power_GI,prompt_s_Weak_GI,prompt_s_Submit_GI,prompt_s_Active_GI,prompt_s_Passive_GI,prompt_s_Pleasur_GI,prompt_s_Pain_GI,prompt_s_Feel_GI,prompt_s_Arousal_GI,prompt_s_Emot_GI,prompt_s_Virtue_GI,prompt_s_Vice_GI,prompt_s_Ovrst_GI,prompt_s_Undrst_GI,prompt_s_Academ_GI,prompt_s_Doctrin_GI,prompt_s_Econ_2_GI,prompt_s_Exch_GI,prompt_s_Econ_GI,prompt_s_Exprsv_GI,prompt_s_Legal_GI,prompt_s_Milit_GI,prompt_s_Polit_2_GI,prompt_s_Polit_GI,prompt_s_Relig_GI,prompt_s_Role_GI,prompt_s_Coll_GI,prompt_s_Work_GI,prompt_s_Ritual_GI,...,response_e_terrorism,response_e_smell,response_e_disappointment,response_e_poor,response_e_plant,response_e_pain,response_e_beauty,response_e_timidity,response_e_philosophy,response_e_negotiate,response_e_negative_emotion,response_e_cleaning,response_e_messaging,response_e_competing,response_e_law,response_e_friends,response_e_payment,response_e_achievement,response_e_alcohol,response_e_liquid,response_e_feminine,response_e_weapon,response_e_children,response_e_monster,response_e_ocean,response_e_giving,response_e_contentment,response_e_writing,response_e_rural,response_e_positive_emotion,response_e_musical,prompt_v_neg,prompt_v_neu,prompt_v_pos,prompt_v_compound,response_v_neg,response_v_neu,response_v_pos,response_v_compound,prompt_response_similarity
0,"I suffer from debilitating social anxiety, whi...",I think that's a very different thought.,0.4,5,1,2,1,1,2,0,0,0,2,1,0,1,0,1,0,1,1,1,1,2,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373,0.582,0.045,-0.8625,0.0,1.0,0.0,0.0,0.316228
1,"I suffer from debilitating social anxiety, whi...","I mean, I think that's kind of the way you're ...",0.4,8,1,2,1,1,2,0,0,0,2,1,0,1,0,1,0,1,1,1,1,2,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373,0.582,0.045,-0.8625,0.0,0.83,0.17,0.357,0.298142
2,"I suffer from debilitating social anxiety, whi...","No, no, no.",0.4,2,1,2,1,1,2,0,0,0,2,1,0,1,0,1,0,1,1,1,1,2,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373,0.582,0.045,-0.8625,0.0,1.0,0.0,0.0,0.298142
3,"I suffer from debilitating social anxiety, whi...",That's not what I mean.,0.4,1,1,2,1,1,2,0,0,0,2,1,0,1,0,1,0,1,1,1,1,2,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373,0.582,0.045,-0.8625,0.0,1.0,0.0,0.0,0.23094
4,"I suffer from debilitating social anxiety, whi...","I'm not saying you will be alone forever, but ...",0.4,0,1,2,1,1,2,0,0,0,2,1,0,1,0,1,0,1,1,1,1,2,0,0,1,1,0,1,0,1,0,1,1,0,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373,0.582,0.045,-0.8625,0.25,0.75,0.0,-0.4588,0.48795


In [0]:
# Write the combined ratings + scores table to Drive.
# NOTE(review): per the pandas docs, index_label=False still writes the row index and only
# omits its header field; if the index is not wanted in the file, index=False may have been
# intended - confirm against whatever reads this CSV.
all_data.to_csv('/content/drive/My Drive/5_datascience/2_context_chatbot/data/graded/all_graded_with_scores.csv', index_label=False)