In [1]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import pandas as pd
import nltk
import json
# Make sure the NLTK resources used later in the notebook are installed:
# 'punkt' is the tokenizer model behind word_tokenize and 'stopwords' is the
# corpus read by stopwords.words('english'). The return value of the last
# call is what the cell displays as its output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ataman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ataman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import os

# Directory containing the *_sharings.json snapshot files. It defaults to the
# original location, but can be redirected with the SOFT_ENG_DATA_DIR
# environment variable so the notebook is not tied to a single machine.
DATA_DIR = os.environ.get(
    "SOFT_ENG_DATA_DIR", "/Users/ataman/Downloads/soft_eng_data"
)

# Snapshot files to load, in the order they are processed by later cells.
SNAPSHOT_FILES = [
    "20230810_123110_pr_sharings.json",
    "20230810_123938_issue_sharings.json",
    "20230810_124048_discussion_sharings.json",
    "20230810_124807_commit_sharings.json",
    "20230810_133121_file_sharings.json",
    "20230810_134011_hn_sharings.json",
]

# Plain string paths (same type and order as before) consumed by later cells.
file_paths = [os.path.join(DATA_DIR, name) for name in SNAPSHOT_FILES]




In [3]:



# English stop words, loaded on the first preprocess() call and reused after
# that. stopwords.words() rebuilds its list on every call, so evaluating it
# once per token (as the comprehension condition used to) dominated runtime.
_ENGLISH_STOPWORDS = None


def preprocess(text):
    """Normalize a raw text string for TF-IDF vectorization.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, delete runs of digits, tokenize with
    ``word_tokenize``, drop English stop words, Porter-stem the remaining
    tokens, and join them back with single spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized text as one space-separated string (empty if no
        tokens survive).

    NOTE(review): punctuation is removed before stop-word filtering, so a
    contraction such as "don't" reaches the filter as "dont" -- confirm that
    this is the intended behavior.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives constant-time membership tests.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in _ENGLISH_STOPWORDS
    )


In [4]:
import os

# Sanity check: report, for every configured input path, whether it points to
# an existing regular file.
for file_path in file_paths:
    if os.path.isfile(file_path):
        print(f"File exists: {file_path}")
        continue
    print(f"File not found: {file_path}")


File exists: /Users/ataman/Downloads/soft_eng_data/20230810_123110_pr_sharings.json
File exists: /Users/ataman/Downloads/soft_eng_data/20230810_123938_issue_sharings.json
File exists: /Users/ataman/Downloads/soft_eng_data/20230810_124048_discussion_sharings.json
File exists: /Users/ataman/Downloads/soft_eng_data/20230810_124807_commit_sharings.json
File exists: /Users/ataman/Downloads/soft_eng_data/20230810_133121_file_sharings.json
File exists: /Users/ataman/Downloads/soft_eng_data/20230810_134011_hn_sharings.json


In [5]:

def _extract_conversation(chatgpt_sharing):
    """Return the usable turns of one 'ChatgptSharing' entry.

    Args:
        chatgpt_sharing: A dict that may carry a 'Conversations' list whose
            items are dicts with 'Prompt' and 'Answer' entries.

    Returns:
        A list of ``{'prompt': ..., 'answer': ...}`` dicts, one per
        interaction where both values are non-empty, in original order.
    """
    turns = []
    # `or []` also covers a key that is present but null.
    for interaction in chatgpt_sharing.get('Conversations') or []:
        prompt = interaction.get('Prompt', '')
        answer = interaction.get('Answer', '')
        if prompt and answer:
            turns.append({'prompt': prompt, 'answer': answer})
    return turns


# Every non-empty conversation found in the input files; each conversation is
# a list of {'prompt', 'answer'} dicts.
all_conversations = []

for file_path in file_paths:
    # Decode explicitly as UTF-8 so that loading does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)['Sources']

    for source in data:
        for chatgpt_sharing in source.get('ChatgptSharing') or []:
            conversation = _extract_conversation(chatgpt_sharing)
            if conversation:
                all_conversations.append(conversation)


In [6]:

# The opening prompt of each conversation is treated as its initial query.
initial_prompts = [turns[0]['prompt'] for turns in all_conversations]

# Fit TF-IDF on the initial queries only, using preprocess as the vectorizer's
# preprocessing step. Row i of initial_vectors corresponds to
# all_conversations[i].
vectorizer = TfidfVectorizer(preprocessor=preprocess)
initial_vectors = vectorizer.fit_transform(initial_prompts)


In [7]:

# A follow-up prompt whose cosine similarity to the conversation's initial
# prompt is at or above this value is counted as a related follow-up; anything
# below it is counted as a new query.
SIMILARITY_THRESHOLD = 0.5

for i, conversation in enumerate(all_conversations):
    # Every prompt after the first one is a follow-up.
    follow_up_texts = [interaction['prompt'] for interaction in conversation[1:]]
    related_follow_ups = 0
    new_queries = 0

    if follow_up_texts:
        # Slice (rather than index) to keep the initial prompt as a 1-row matrix.
        initial_vector = initial_vectors[i:i+1]
        follow_up_vectors = vectorizer.transform(follow_up_texts)
        similarities = cosine_similarity(follow_up_vectors, initial_vector).flatten()

        # Cast to plain int so the stored counts have the same type whether or
        # not the conversation has follow-ups (and remain JSON-serializable).
        related_follow_ups = int((similarities >= SIMILARITY_THRESHOLD).sum())
        new_queries = int((similarities < SIMILARITY_THRESHOLD).sum())

    # The per-conversation counts are stored on the first turn.
    conversation[0]['related_follow_ups'] = related_follow_ups
    conversation[0]['new_queries'] = new_queries


In [None]:

# Score the opening prompt of every conversation with TextBlob and store the
# polarity and subjectivity on the conversation's first turn.
for conversation in all_conversations:
    first_turn = conversation[0]
    # .sentiment is a (polarity, subjectivity) pair.
    sentiment, subjectivity = TextBlob(first_turn['prompt']).sentiment
    first_turn['sentiment'] = sentiment
    first_turn['subjectivity'] = subjectivity


In [None]:

# Output column name -> key it is read from on each conversation's first turn.
# Dict order determines the DataFrame's column order.
column_sources = {
    'initial_query': 'prompt',
    'sentiment': 'sentiment',
    'subjectivity': 'subjectivity',
    'related_follow_ups': 'related_follow_ups',
    'new_queries': 'new_queries',
}

# One list per column, with one entry per conversation.
data = {
    column: [conversation[0][key] for conversation in all_conversations]
    for column, key in column_sources.items()
}

df = pd.DataFrame(data)


In [None]:

# Mean of each numeric column across all conversations.
(
    average_sentiment,
    average_subjectivity,
    average_related_follow_ups,
    average_new_queries,
) = (
    df[column].mean()
    for column in ('sentiment', 'subjectivity', 'related_follow_ups', 'new_queries')
)

# Print the four summary lines, one per line.
print(
    f"Average Sentiment: {average_sentiment}",
    f"Average Subjectivity: {average_subjectivity}",
    f"Average Number of Related Follow-Ups: {average_related_follow_ups}",
    f"Average Number of New Queries: {average_new_queries}",
    sep="\n",
)
