### Install the model

In [19]:
#For reference: record the Python version of the kernel that executed this notebook.
from platform import python_version
print(python_version()) #saved output below reports 3.9.13 (an earlier note on this line read 3.11.4)

3.9.13


In [2]:
import numpy as np
from scipy import stats as ssp
import scipy as sp
import pandas as pd
#Let pandas display up to 100 columns and 100 rows before truncating DataFrame output
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows', 100)
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score



In [3]:
#Load a pretrained multilingual sentence-embedding model via sentence-transformers: https://www.sbert.net/index.html
#NOTE(review): the variables in later cells are named "roberta*", but the checkpoint loaded
#here is 'distiluse-base-multilingual-cased-v1' - confirm the naming / intended model.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

### Get and clean the data

In [20]:
#Load data from a CSV located relative to the notebook's working directory.
#The preview further down shows an 'Unnamed: 0' column - presumably a row index that was
#saved with the file; it is kept as an ordinary column here.
data = pd.read_csv("./activities_sample2.csv")

In [21]:
#Ensure exactly one space after each comma in the multiple-choice responses.
#The pattern also consumes any whitespace that already follows a comma, so running
#this cell again (or feeding it input that is already spaced) does not pile up extra
#spaces, which would otherwise change the phrases that get embedded later.
data['activity_mc'] = data['activity_mc'].str.replace(r",\s*", ", ", regex=True)

In [22]:
#Trim leading/trailing whitespace from both description columns
for text_column in ('activity_mc', 'activity_open'):
    data[text_column] = data[text_column].str.strip()

In [23]:
#Examine data: preview the first rows after the comma/whitespace clean-up
data.head()

Unnamed: 0,ppID,questionListName,timeStampStart,timeStampStop,activity_mc,activity_open,sample
0,302,Event-Based Morning Assessment 1,18-10-22 11:41,18-10-22 11:44,"studying, working","i had breakfast, got to uni and had an hour tu...",2
1,302,Event-Based Daily Assessment,18-10-22 14:39,18-10-22 14:41,"biking, engaging in a hobby, walking, working,...","finished my tutorial, went home, read my book ...",2
2,302,Event-Based Daily Assessment,18-10-22 19:10,18-10-22 19:11,"eating, engaging in a hobby, self-care, sleeping",lunch\r\ntherapy\r\nbook\r\nnap,2
3,302,Event-Based Daily Assessment,18-10-22 20:55,18-10-22 20:55,"self-care, studying",showered and got to my dutch class,2
4,302,Event-Based Evening Assessment,19-10-22 0:22,19-10-22 0:23,"eating, studying",dutch course and dinner,2


In [24]:
#Count missing values in each description column (multiple choice first, then open)
for text_column in ('activity_mc', 'activity_open'):
    print(data[text_column].isna().sum())

3
0


In [25]:
#Remove rows with a missing value in either description column.
#The columns are passed as a list (accepted by every pandas version) and handled in a
#single call. .copy() makes the result an independent frame, so the column assignments
#in later cells do not act on a filtered selection (avoids SettingWithCopyWarning).
data = data.dropna(subset=['activity_mc', 'activity_open']).copy()

### Get unique phrases and encode vector positions for open descriptions

In [26]:
#Collect each distinct open description once, in order of first appearance
data_unique_phrases_open = pd.unique(data['activity_open'])

In [27]:
#Total number of responses vs. number of distinct open descriptions
print(data.shape[0])
print(data_unique_phrases_open.shape[0])

2251
1889


In [28]:
#Embed every unique open description once, then index the vectors by their phrase
roberta_open = model.encode(data_unique_phrases_open, show_progress_bar=True)
roberta_dict_open = {
    phrase: vector
    for phrase, vector in zip(data_unique_phrases_open, roberta_open)
}

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

In [29]:
#Confirm the data have the expected size/shape:
#one row per unique phrase, one column per embedding dimension
roberta_open.shape

(1889, 512)

In [30]:
#Look up the embedding of each row's open description (a missing phrase raises KeyError)
data['robertaVec_open'] = data['activity_open'].map(roberta_dict_open.__getitem__)

### Get unique phrases and encode vector positions for multiple choice descriptions

In [31]:
#Collect each distinct multiple-choice response once, in order of first appearance
data_unique_phrases_mc = pd.unique(data['activity_mc'])

In [32]:
#Total number of responses vs. number of distinct multiple-choice responses
print(data.shape[0])
print(data_unique_phrases_mc.shape[0])

2251
1237


In [33]:
#Embed every unique multiple-choice response once, then index the vectors by their phrase
roberta_mc = model.encode(data_unique_phrases_mc, show_progress_bar=True)
roberta_dict_mc = {
    phrase: vector
    for phrase, vector in zip(data_unique_phrases_mc, roberta_mc)
}

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [34]:
#Confirm the data have the expected size/shape:
#one row per unique phrase, one column per embedding dimension
roberta_mc.shape

(1237, 512)

In [35]:
#Look up the embedding of each row's multiple-choice response (a missing phrase raises KeyError)
data['robertaVec_mc'] = data['activity_mc'].map(roberta_dict_mc.__getitem__)

### Get distances between multiple choice and open descriptions

In [36]:
#Euclidean distance between each row's open-description vector and its multiple-choice vector
data['distance'] = data.apply(
    lambda row: np.linalg.norm(row['robertaVec_open'] - row['robertaVec_mc']),
    axis=1,
)

In [37]:
def cosine(x,y):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors with the same number of elements. numpy arrays are used as-is;
        other sequences (e.g. plain lists) are converted with ``np.asarray``.

    Returns
    -------
    scalar
        The single entry of the 1x1 matrix that ``cosine_similarity`` returns
        for the two vectors.
    """
    # cosine_similarity expects 2-D (n_samples, n_features) input, so each
    # vector is presented as a one-row matrix.
    x_row = np.asarray(x).reshape(1, -1)
    y_row = np.asarray(y).reshape(1, -1)
    # Distinct local name: the result no longer shadows this function's own name.
    similarity = cosine_similarity(x_row, y_row)[0][0]
    return similarity

In [38]:
#Cosine similarity between each row's open-description vector and its multiple-choice vector
data['similarity'] = data.apply(
    lambda row: cosine(row['robertaVec_open'], row['robertaVec_mc']),
    axis=1,
)

In [39]:
#Save updated data to a new csv file (index=False leaves the DataFrame index out of the file)
#NOTE(review): the robertaVec_open / robertaVec_mc columns hold embedding arrays, which a CSV
#can only store as text - confirm that is intended, or drop those columns before saving.
data.to_csv('activities_sample2_distances.csv', index=False)