In [74]:
import pandas as pd
# Location of the recall/mood table, relative to the notebook's working directory.
DATA_PATH = '../data/recall_mood_data.csv'

# The first CSV column is used as the row index.
df = pd.read_csv(DATA_PATH, index_col=0)

# Peek at a single row to check the columns that were loaded.
df.head(1)

Unnamed: 0,Date,Experimenter,Subject Initials,SubjectID,ConditionExpt,ConditionSub,Credit,Notes,convos,recalls
0,2018-11-30 00:00:00,Evgeniia,JM,1,Incongruent,Happy,1,,I'm Spencer by the way I'm Jake Nice to meet y...,We can start with living in Nashville I live ...


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask with one fewer trailing dimension than the
            token embeddings; positions with value 0 contribute nothing
            to the average.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the mask total for each entry.
    """
    token_embeddings = model_output[0]

    # Stretch the mask across the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    weighted_sum = (token_embeddings * mask).sum(dim=1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)

    return weighted_sum / mask_total

# One checkpoint identifier shared by the tokenizer and the encoder, so the
# two are always loaded from the same pretrained model.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [6]:
def get_embeddings(sentences, batch_size=None):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Each batch is tokenized (padded to its longest item, truncated by the
    tokenizer), passed through the model without gradient tracking,
    mean-pooled with ``mean_pooling`` and L2-normalized along dimension 1.

    Args:
        sentences: A single string or a sequence of strings.
        batch_size: Optional maximum number of texts per forward pass.
            ``None`` (the default) encodes everything in one pass, which is
            the original behaviour; pass a small integer to bound memory use
            on large inputs.

    Returns:
        A 2-D tensor with one L2-normalized row per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is given and is smaller than 1.

    Note:
        ``truncation=True`` silently drops any text beyond the tokenizer's
        maximum length, so long transcripts are represented only by their
        beginning.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Wrap a lone string so slicing below works on texts, not characters.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    if batch_size is None:
        batches = [sentences]
    else:
        # `or [sentences]` keeps the empty-input path identical to the
        # single-pass case instead of concatenating zero tensors.
        batches = [
            sentences[start:start + batch_size]
            for start in range(0, len(sentences), batch_size)
        ] or [sentences]

    pooled_batches = []
    for batch in batches:
        # Tokenize sentences
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        pooled_batches.append(F.normalize(sentence_embeddings, p=2, dim=1))

    if len(pooled_batches) == 1:
        return pooled_batches[0]
    return torch.cat(pooled_batches, dim=0)

In [8]:
# Embed every conversation transcript; `convos` gets one embedding row per DataFrame row.
convo_texts = df['convos'].to_list()
convos = get_embeddings(convo_texts)

In [9]:
# Embed every recall transcript; `recalls` gets one embedding row per DataFrame row.
recall_texts = df['recalls'].to_list()
recalls = get_embeddings(recall_texts)

In [None]:
# Scratch check: view one recall embedding as a single-row batch.
# unsqueeze(0) adds the leading axis without hard-coding the embedding
# width (the original spelled it out as 768).
recalls[1].unsqueeze(0)

In [59]:
# Cosine similarity between each conversation embedding and the recall
# embedding in the same row position.
cos = torch.nn.CosineSimilarity(dim=1)

# One vectorized call over both embedding matrices replaces the per-row loop
# and the hard-coded (1, 768) reshape. `.tolist()` yields plain Python floats,
# so a DataFrame column built from this holds scalars rather than
# one-element arrays such as `[0.5797823]`.
# NOTE: despite the variable name these are similarities (higher = closer),
# not distances.
distances = cos(convos, recalls).tolist()

In [75]:
# Attach the per-row conversation/recall scores as a new column.
# This mutates `df` in place, so earlier displays of `df` are now stale.
# NOTE(review): despite the column name, `distances` is produced with
# CosineSimilarity, so higher values mean *more* similar, not more distant.
df['cosine_distances'] = distances
# Show one row to confirm the new column is present.
df.head(1)

Unnamed: 0,Date,Experimenter,Subject Initials,SubjectID,ConditionExpt,ConditionSub,Credit,Notes,convos,recalls,cosine_distances
0,2018-11-30 00:00:00,Evgeniia,JM,1,Incongruent,Happy,1,,I'm Spencer by the way I'm Jake Nice to meet y...,We can start with living in Nashville I live ...,[0.5797823]


# No correlation between subject ID and conversation-recall similarity

In [62]:
from scipy import stats

# Rank correlation between the subject identifier and the per-row
# conversation/recall score.
subject_ids = df['SubjectID']
recall_scores = df['cosine_distances']
stats.spearmanr(subject_ids, recall_scores)

SpearmanrResult(correlation=0.017745298901829312, pvalue=0.8487400392089366)

# Recall sentences with embeddings

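Matching each recall to its most similar conversation (by cosine similarity of the embeddings) retrieves the correct conversation for only about half of the recalls.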

This is not working as well as I had imagined.

In [73]:
# Nearest-neighbour retrieval: for every recall embedding, find the
# conversation embedding with the highest cosine similarity and check whether
# it belongs to the recall's own conversation.
cos = torch.nn.CosineSimilarity(dim=1)

hits = 0
for i, recall in enumerate(recalls):
    # `recall` is broadcast against every row of `convos`; argmax gives the
    # index of the closest conversation (an index, not a similarity value).
    best = int(cos(convos, recall).argmax(-1))

    # Rows 2k and 2k+1 are treated as the same conversation (the original
    # check accepted index (i // 2) * 2 for this reason). Comparing pair
    # indices accepts either member of the pair, so the result no longer
    # depends on which duplicate row argmax happens to return.
    # NOTE(review): assumes consecutive row pairs share a conversation - confirm.
    if best // 2 == i // 2:
        hits += 1
        print('Successful recall!')
    else:
        print(f'{i} was most similar to {best}')

# Report the hit rate explicitly instead of estimating it from the log above.
print(f'{hits}/{len(recalls)} recalls matched their own conversation')

0 was most similar to 32
Successful recall!
Successful recall!
Successful recall!
Successful recall!
Successful recall!
6 was most similar to 50
7 was most similar to 10
Successful recall!
9 was most similar to 32
10 was most similar to 92
11 was most similar to 32
12 was most similar to 92
13 was most similar to 106
14 was most similar to 44
15 was most similar to 8
Successful recall!
17 was most similar to 92
Successful recall!
19 was most similar to 88
Successful recall!
Successful recall!
Successful recall!
23 was most similar to 4
24 was most similar to 8
Successful recall!
26 was most similar to 32
27 was most similar to 50
28 was most similar to 44
29 was most similar to 44
30 was most similar to 4
Successful recall!
Successful recall!
Successful recall!
Successful recall!
35 was most similar to 88
36 was most similar to 10
37 was most similar to 52
38 was most similar to 32
39 was most similar to 32
40 was most similar to 48
41 was most similar to 48
42 was most similar to 32
S