In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import pandas as pd
from tqdm.notebook import tqdm
from collections import defaultdict, Counter
from empath import Empath
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /Users/jasonkim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasonkim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the scored chatbot interactions (prompts, responses and their scores)
# and preview the first rows.
SCORED_CHATS_CSV = '../data/scored_chatbot_interactions/processed/JasonMcFadden_scored_chats_processed.csv'
scored_df = pd.read_csv(SCORED_CHATS_CSV)
scored_df.head()

Unnamed: 0.1,Unnamed: 0,iteration,input,cb_response_1,input_2,cb_response_2,input_3,cb_response_3,cbr1_score,cbrall_score
0,0,0,"I suffer from debilitating social anxiety, whi...",it is possible that you ' ll eventually experi...,"Oh really, should I be concerned?",not at all . there are many different therapie...,"Oh that's great, what kind of therapies are th...",there are many different therapies for depress...,6.5,7.0
1,2,0,I'd like to feel more confident and less anxious,i ' d like to feel more confident and less anx...,Do you have any suggestions?,i ' d want to feel more confident .,Same here. Any tips on how to help?,i would like to help with how to get rid of an...,2.0,2.0
2,3,0,"I'm anxious about my presentation, I think I w...",the problem is that you are not prepared enough .,"I'm well prepared, just anxious.",you are not prepared to be a therapist .,I know. Any tips on how to manage my anxiety?,you need to be aware that there is a time and ...,1.0,2.0
3,4,0,I'm really overloaded with work and have a ton...,there are many different types of psychologist...,Which psychologist can help me?,i don ' t know of a specific type of psycholog...,Do you have any other suggestions?,it ' s not clear what types of therapy are app...,4.0,6.0
4,5,0,I'm feeling really stressed out all the time a...,"a therapist may be helpful , but they don ' t ...",So I should see a psychologist?,i ' d recommend getting a copy of the book you...,Thank you. Any other tips for how to manage my...,the most common way you can manage stress is t...,8.0,8.5


In [3]:
def add_prefix_to_dict_keys(dict, prefix):
    """Return a new mapping whose keys are ``<prefix>_<original key>``.

    Values are carried over unchanged and the input mapping is not modified.
    """
    # NOTE: the parameter name shadows the builtin ``dict`` inside this
    # function, so the result is built with a literal rather than ``dict()``.
    prefixed = {}
    for key, value in dict.items():
        prefixed[f'{prefix}_{key}'] = value
    return prefixed

def calc_similarity_between_strings(all_data_df_row):
    """Cosine similarity between a prompt and the first chatbot response.

    Both strings are tokenized with ``word_tokenize``, English stop words are
    removed, and each string is treated as a binary bag-of-words vector over
    the remaining distinct tokens. For binary vectors the cosine reduces to
    ``|shared tokens| / sqrt(|prompt tokens| * |response tokens|)``.

    Args:
        all_data_df_row: Mapping-like row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) providing the ``'input'`` and
            ``'cb_response_1'`` strings.

    Returns:
        float: The similarity, between 0.0 and 1.0. Returns 0.0 when either
        string has no tokens left after stop-word removal (previously this
        case raised ``ZeroDivisionError``).
    """
    # tokenization first
    prompt_tokens = word_tokenize(all_data_df_row['input'])
    response_tokens = word_tokenize(all_data_df_row['cb_response_1'])

    # A set gives constant-time membership tests; the raw stop-word list
    # would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))

    # Remove stop words. Matching is case-sensitive: tokens are compared
    # exactly as the tokenizer produced them.
    prompt_set = {w for w in prompt_tokens if w not in stop_words}
    response_set = {w for w in response_tokens if w not in stop_words}

    # An empty vector has zero length, so the cosine is undefined; report
    # "no similarity" instead of dividing by zero.
    if not prompt_set or not response_set:
        return 0.0

    # cosine formula for binary vectors: dot product = number of shared
    # tokens, squared norm of each vector = its number of distinct tokens.
    shared = len(prompt_set & response_set)
    return shared / float((len(prompt_set) * len(response_set)) ** 0.5)

In [4]:

class SeanceScorer():
    """Count, per word category, how many tokens of a text belong to it.

    ``seance_dict`` maps a category name to the list of words in that
    category. It is filled from three files inside ``seance_dir``:

    * ``inquirerbasic.txt`` - one category per line, tab separated: the
      category name followed by its words.
    * ``negative_words.txt`` / ``positive_words.txt`` - one word per line,
      stored under the ``'negative_words'`` / ``'positive_words'`` keys.
    """

    def __init__(self, seance_dir='../data/inputs/seance'):
        """Load the category word lists.

        Args:
            seance_dir: Directory containing the three lexicon files. The
                default is the location that was previously hard-coded.
        """
        self.seance_dict = {}

        # add in inquirerbasic.txt ("<category>\t<word>\t<word>..." per line)
        for line in self._read_lines(f'{seance_dir}/inquirerbasic.txt'):
            word_list = line.split('\t')
            self.seance_dict[f'{word_list[0]}'] = word_list[1:]

        # add in negative_words.txt
        self.seance_dict['negative_words'] = self._read_lines(f'{seance_dir}/negative_words.txt')

        # add in positive_words.txt
        self.seance_dict['positive_words'] = self._read_lines(f'{seance_dir}/positive_words.txt')

    @staticmethod
    def _read_lines(path):
        """Return the non-blank lines of the text file at ``path``.

        Blank lines (typically the one produced by a trailing newline) are
        dropped so they cannot become an empty category name or an empty
        word entry. Kept lines are returned unmodified.
        """
        with open(path, 'r') as content_file:
            content = content_file.read()
        return [line for line in content.split('\n') if line.strip()]

    def calc_seance_scores(self, text_to_score, type_scored):
        """Score ``text_to_score`` against every category in ``seance_dict``.

        The text is split on whitespace and tokens are matched exactly
        (case and attached punctuation are significant).

        Args:
            text_to_score: The string to score.
            type_scored: Label placed at the front of every result key,
                e.g. ``'prompt'`` or ``'response'``.

        Returns:
            dict: ``'<type_scored>_s_<category>'`` mapped to the number of
            tokens in the text that appear in that category's word list.
        """
        # Token counts do not depend on the category, so compute them once
        # instead of once per category.
        counted = Counter(text_to_score.split())

        scores_dict = {}
        for dict_key, cur_list in self.seance_dict.items():
            # set() ensures a word listed twice in a category is not
            # double-counted; Counter returns 0 for words absent from the text.
            count_common = sum(counted[wrd] for wrd in set(cur_list))
            scores_dict[f'{type_scored}_s_{dict_key}'] = count_common
        return scores_dict

In [5]:

# Initialize scorers
seance_scorer = SeanceScorer() # from custom made seance scorer
lexicon = Empath() # from empath library
analyzer = SentimentIntensityAnalyzer()


all_scores_list = []

# Score each prompt and its first chatbot response with all three scorers.
# Key prefixes keep the sources apart: '<prompt|response>_s_*' for seance,
# '*_e_*' for empath and '*_v_*' for vader.
for i in tqdm(scored_df.index):
    # Label-based lookup, so the one-row frame below carries the same index
    # label as the source row and lines up in the axis=1 concat afterwards.
    prompt_text = scored_df.at[i, 'input']
    response_text = scored_df.at[i, 'cb_response_1']

    # Merge all scores for this row into one dict (insertion order gives the
    # column order) and build a single one-row frame from it.
    row_scores = {
        **seance_scorer.calc_seance_scores(prompt_text, type_scored='prompt'),
        **seance_scorer.calc_seance_scores(response_text, type_scored='response'),
        **add_prefix_to_dict_keys(lexicon.analyze(prompt_text), prefix='prompt_e'),
        **add_prefix_to_dict_keys(lexicon.analyze(response_text), prefix='response_e'),
        **add_prefix_to_dict_keys(analyzer.polarity_scores(prompt_text), prefix='prompt_v'),
        **add_prefix_to_dict_keys(analyzer.polarity_scores(response_text), prefix='response_v'),
    }
    all_scores_list.append(pd.DataFrame(row_scores, index=[i]))

# Concatenate once, after the loop. Doing this inside the loop re-copied every
# previously scored row on each iteration (quadratic work).
all_scores = pd.concat(all_scores_list)


all_data = pd.concat([scored_df, all_scores], axis = 1)
all_data['prompt_response_similarity'] = all_data.apply(calc_similarity_between_strings, axis =1)

HBox(children=(IntProgress(value=0, max=381), HTML(value='')))




## How to cull columns?

- Look into the variability across columns
    - Require at least 5 percent of a column's values to differ from its modal value; columns below that threshold are nearly constant and can be dropped.
- Do a PCA among the columns that appear to measure the same thing
    - E.g. positive sentiment
- Also use number of unique values
- Drop linearly dependent (collinear) features
    - Drop columns with high correlation, e.g. > .96

In [6]:
# Preview the first rows of the combined frame: the original scored_df columns
# followed by the generated score columns and prompt_response_similarity.
all_data.head()

Unnamed: 0.1,Unnamed: 0,iteration,input,cb_response_1,input_2,cb_response_2,input_3,cb_response_3,cbr1_score,cbrall_score,...,response_e_musical,prompt_v_neg,prompt_v_neu,prompt_v_pos,prompt_v_compound,response_v_neg,response_v_neu,response_v_pos,response_v_compound,prompt_response_similarity
0,0,0,"I suffer from debilitating social anxiety, whi...",it is possible that you ' ll eventually experi...,"Oh really, should I be concerned?",not at all . there are many different therapie...,"Oh that's great, what kind of therapies are th...",there are many different therapies for depress...,6.5,7.0,...,0.0,0.373,0.582,0.045,-0.8625,0.286,0.714,0.0,-0.5267,0.19518
1,2,0,I'd like to feel more confident and less anxious,i ' d like to feel more confident and less anx...,Do you have any suggestions?,i ' d want to feel more confident .,Same here. Any tips on how to help?,i would like to help with how to get rid of an...,2.0,2.0,...,0.0,0.125,0.438,0.437,0.647,0.109,0.51,0.382,0.647,0.771517
2,3,0,"I'm anxious about my presentation, I think I w...",the problem is that you are not prepared enough .,"I'm well prepared, just anxious.",you are not prepared to be a therapist .,I know. Any tips on how to manage my anxiety?,you need to be aware that there is a time and ...,1.0,2.0,...,0.0,0.333,0.667,0.0,-0.5423,0.353,0.647,0.0,-0.5213,0.0
3,4,0,I'm really overloaded with work and have a ton...,there are many different types of psychologist...,Which psychologist can help me?,i don ' t know of a specific type of psycholog...,Do you have any other suggestions?,it ' s not clear what types of therapy are app...,4.0,6.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.903,0.097,0.4019,0.111803
4,5,0,I'm feeling really stressed out all the time a...,"a therapist may be helpful , but they don ' t ...",So I should see a psychologist?,i ' d recommend getting a copy of the book you...,Thank you. Any other tips for how to manage my...,the most common way you can manage stress is t...,8.0,8.5,...,0.0,0.187,0.625,0.188,-0.2484,0.0,0.934,0.066,0.2263,0.1849


In [8]:
# (rows, columns) of the combined frame.
all_data.shape

(381, 795)

In [9]:
# Full list of column names, as input for deciding which features to cull.
all_data.columns.values

array(['Unnamed: 0', 'iteration', 'input', 'cb_response_1', 'input_2',
       'cb_response_2', 'input_3', 'cb_response_3', 'cbr1_score',
       'cbrall_score', 'prompt_s_Positiv_GI', 'prompt_s_Negativ_GI',
       'prompt_s_Pstv_GI', 'prompt_s_Affil_GI', 'prompt_s_Ngtv_GI',
       'prompt_s_Hostile_GI', 'prompt_s_Strong_GI', 'prompt_s_Power_GI',
       'prompt_s_Weak_GI', 'prompt_s_Submit_GI', 'prompt_s_Active_GI',
       'prompt_s_Passive_GI', 'prompt_s_Pleasur_GI', 'prompt_s_Pain_GI',
       'prompt_s_Feel_GI', 'prompt_s_Arousal_GI', 'prompt_s_Emot_GI',
       'prompt_s_Virtue_GI', 'prompt_s_Vice_GI', 'prompt_s_Ovrst_GI',
       'prompt_s_Undrst_GI', 'prompt_s_Academ_GI', 'prompt_s_Doctrin_GI',
       'prompt_s_Econ_2_GI', 'prompt_s_Exch_GI', 'prompt_s_Econ_GI',
       'prompt_s_Exprsv_GI', 'prompt_s_Legal_GI', 'prompt_s_Milit_GI',
       'prompt_s_Polit_2_GI', 'prompt_s_Polit_GI', 'prompt_s_Relig_GI',
       'prompt_s_Role_GI', 'prompt_s_Coll_GI', 'prompt_s_Work_GI',
       'prompt_s