In [None]:
import json
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the vocabulary ids produced by one match what the other expects.
_CHECKPOINT_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_CHECKPOINT_NAME)
model = DistilBertModel.from_pretrained(_CHECKPOINT_NAME)

def bert_encode(text):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    The text is split into word pieces, truncated, wrapped in the
    tokenizer's ``[CLS]`` / ``[SEP]`` special tokens, and run through the
    module-level ``model`` without gradient tracking.

    Args:
        text: The string to embed. Any falsy value (``None``, ``""``) is
            treated as empty input.

    Returns:
        A NumPy array of shape ``(1, model.config.dim)``. It is all zeros
        when ``text`` is falsy or produces no tokens.
    """
    if not text:
        return np.zeros((1, model.config.dim))
    tokens = tokenizer.tokenize(text)
    if not tokens:
        return np.zeros((1, model.config.dim))

    # Keep at most 510 word pieces so that, once the two special tokens are
    # added below, the sequence fits in a 512-position input.
    token_ids = tokenizer.convert_tokens_to_ids(tokens[:510])
    # The encoder was pretrained on sequences framed by [CLS] ... [SEP];
    # feeding bare word pieces gives it input it never saw during training.
    token_ids = [tokenizer.cls_token_id, *token_ids, tokenizer.sep_token_id]
    input_ids = torch.tensor(token_ids).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids)
    # outputs[0] holds the per-token hidden states; average over the token
    # axis to get a single vector. No detach() is needed because the forward
    # pass ran under no_grad.
    return outputs[0].mean(dim=1).numpy()


def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded with ``bert_encode`` (``text1`` first), and the
    single entry of the resulting 1x1 similarity matrix is returned.
    """
    embeddings = [bert_encode(text) for text in (text1, text2)]
    similarity_matrix = cosine_similarity(embeddings[0], embeddings[1])
    return similarity_matrix[0][0]

# JSON files whose shared conversations are analysed.
file_paths = [
    "20230727_195927_pr_sharings.json",
    "20230727_195941_issue_sharings.json",
]

# Minimum cosine similarity between an answer and the following prompt for
# that prompt to be counted as a related follow-up rather than a new query.
SIMILARITY_THRESHOLD = 0.5

total_related_follow_ups = 0
total_new_queries = 0

for file_path in file_paths:
    # Read as UTF-8 explicitly: the platform default encoding may differ and
    # would then fail on, or mis-decode, non-ASCII conversation text.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)['Sources']

    # The file is fully parsed at this point, so the handle is already closed
    # before the (slow) model inference below starts.
    for source in data:
        for chatgpt_sharing in source.get('ChatgptSharing', []):
            conversations = chatgpt_sharing.get('Conversations', [])
            # Pair every turn with the one that follows it; conversations
            # with fewer than two turns yield no pairs.
            for current_interaction, next_interaction in zip(conversations, conversations[1:]):
                similarity = calculate_similarity(
                    current_interaction['Answer'],
                    next_interaction['Prompt'],
                )
                if similarity >= SIMILARITY_THRESHOLD:
                    total_related_follow_ups += 1
                else:
                    total_new_queries += 1

print(f"Total Related Follow-Ups: {total_related_follow_ups}")
print(f"Total New Queries: {total_new_queries}")


Total Related Follow-Ups: 1341
Total New Queries: 35


# New Section