# Utilities 

First, we define the common imports and helper functions used throughout the notebook.

In [5]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

# Module-level cache of parsed snapshot files; get_snapshots() fills it on
# first use and returns it directly on later calls.
snapshots = []

def read_json_file(file_path):
    """Load a UTF-8 encoded JSON document from disk.

    Prints the path being read (tab-indented) before opening the file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    print(f"\tReading file: {file_path}")
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def calculate_average(data):
    """Return the arithmetic mean of ``data``.

    Args:
        data: A sized collection of numbers (e.g. a list).

    Returns:
        ``sum(data) / len(data)``, or ``0.0`` when ``data`` is empty so that
        reporting on an empty result set does not raise ``ZeroDivisionError``.
    """
    if len(data) == 0:
        return 0.0
    return sum(data) / len(data)
    
def get_snapshots():
    """Return all parsed snapshot files, reading them from disk only once.

    When the module-level ``snapshots`` list already has entries it is returned
    as-is. Otherwise ``../snapshots`` is walked recursively, every ``*.json``
    file is parsed with ``read_json_file`` and appended to ``snapshots``.

    Returns:
        The module-level ``snapshots`` list (the same object on every call).
    """
    if snapshots:
        print("Returning existing snapshots")
        return snapshots

    print("Returning snapshots from OS")
    for directory, _subdirs, filenames in os.walk("../snapshots"):
        for filename in filenames:
            if not filename.endswith(".json"):
                continue
            snapshots.append(read_json_file(os.path.join(directory, filename)))

    return snapshots
    
def draw_pie_chart(sizes, labels, title):
    """Draw and show a pie chart with one-decimal percentage labels.

    Args:
        sizes: Iterable of numeric wedge sizes (lists, tuples, dict views and
            generators are all accepted).
        labels: Iterable of wedge labels, one per entry in ``sizes``.
        title: Title displayed above the chart.
    """
    # Callers pass dict views (``d.values()`` / ``d.keys()``). Those are not
    # sequences, so they cannot be coerced into the numeric array that
    # ``Axes.pie`` builds from its input; materialize them as lists first.
    sizes = list(sizes)
    labels = list(labels)

    _, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct='%1.1f%%')
    ax.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
    ax.set_title(title)
    plt.show()

# Research Question 1

What types of issues (bugs, feature requests, theoretical questions, etc.) do developers most commonly present to ChatGPT?

### Approach 1: Simple Issues Categorization

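In this approach, each conversation is categorized by simple keyword matching on the lowercased prompt and answer (for example, "bug" in the prompt or "error" in the answer marks the conversation as a Bug). Conversations that match no keyword are counted as "Other".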

In [None]:
def categorize_conversation(prompt, answer):
    """Assign a conversation to a category by case-insensitive keyword matching.

    Rules are evaluated in order and the first match wins. A rule matches when
    its prompt keyword occurs in the prompt or its answer keyword occurs in the
    answer.

    Args:
        prompt: Text of the developer's prompt.
        answer: Text of the reply.

    Returns:
        One of ``"Bug"``, ``"Feature Request"``, ``"Theoretical Question"`` or
        ``"Other"`` (when no rule matches).
    """
    prompt_lc, answer_lc = prompt.lower(), answer.lower()

    # (category, keyword searched in the prompt, keyword searched in the answer)
    rules = (
        ("Bug", "bug", "error"),
        ("Feature Request", "feature", "implement"),
        ("Theoretical Question", "how to", "why"),
    )
    for label, prompt_keyword, answer_keyword in rules:
        if prompt_keyword in prompt_lc or answer_keyword in answer_lc:
            return label

    return "Other"


def process_json_files_rq1_simple():
    """Count conversations per category using the keyword-based categorizer.

    Walks every snapshot returned by ``get_snapshots()`` through
    ``Sources`` -> ``ChatgptSharing`` -> ``Conversations`` and classifies each
    conversation's ``Prompt`` / ``Answer`` pair with
    ``categorize_conversation``. Missing keys are treated as empty.

    Returns:
        A dict mapping each category name to its number of conversations.
    """
    tally = dict.fromkeys(
        ("Bug", "Feature Request", "Theoretical Question", "Other"), 0
    )

    # Flatten the nested snapshot structure into a single stream of conversations.
    conversations = (
        conversation
        for snapshot in get_snapshots()
        for source in snapshot.get("Sources", [])
        for sharing in source.get("ChatgptSharing", [])
        for conversation in sharing.get("Conversations", [])
    )

    for conversation in conversations:
        label = categorize_conversation(
            conversation.get("Prompt", ""),
            conversation.get("Answer", ""),
        )
        tally[label] += 1

    return tally


# Run the keyword-based categorization over all snapshots; the resulting
# per-category counts are kept in result_rq1_simple.
print("Starting analysis...")
result_rq1_simple = process_json_files_rq1_simple()
print("\nAnalysis completed.\n")

### RQ1 Results: Simple Issues Categorization

In [None]:
# Report the per-category counts produced by the keyword-based categorization.
print("Results:")
print("\tTotal conversations:", sum(result_rq1_simple.values()))

for category, count in result_rq1_simple.items():
    print(f"\t{category}: {count}")

# Pass concrete lists rather than dict views: views are not sequences and
# cannot be converted into the numeric array the pie chart is built from.
draw_pie_chart(
    list(result_rq1_simple.values()),
    list(result_rq1_simple.keys()),
    "Results of RQ1: Simple Issue Categorization",
)

### Approach 2: Issues categorization via Trained Model

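In this approach, we train a Multinomial Naive Bayes classifier on TF-IDF features extracted from a labeled training set (`training_set.csv`) and use it to predict the category of each conversation from its combined prompt and answer text.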

In [6]:
# Shared TF-IDF vectorizer (configured with max_features=1000) and Multinomial
# Naive Bayes classifier. Both start unfitted and are fitted in place by
# train_model().
vectorizer = TfidfVectorizer(max_features=1000)
model = MultinomialNB()


def train_model():
    """Fit the module-level ``vectorizer`` and ``model`` on the training set.

    Reads ``training_set.csv`` (columns ``text`` and ``label``), keeps the
    training portion of an 80/20 split (``random_state=42``), fits the TF-IDF
    vectorizer on those texts and then fits the classifier on the resulting
    features. The held-out portion of the split is not used.
    """
    print("Training model...")

    training_frame = pd.read_csv("training_set.csv")
    split = train_test_split(
        training_frame["text"],
        training_frame["label"],
        test_size=0.2,
        random_state=42,
    )
    # train_test_split returns [X_train, X_test, y_train, y_test].
    train_texts, train_labels = split[0], split[2]

    train_features = vectorizer.fit_transform(train_texts)
    model.fit(train_features, train_labels)

    print("Training model complete.\n")


def combined_categorization(prompt, answer, model, vectorizer):
    """Predict the category of a single prompt/answer pair.

    Args:
        prompt: Prompt text.
        answer: Answer text.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        The first (and only) label predicted for the prompt and answer joined
        by a single space.
    """
    document = prompt + " " + answer
    features = vectorizer.transform([document])
    predictions = model.predict(features)
    return predictions[0]


def process_json_files_rq1_model(model, vectorizer):
    """Categorize every conversation with the trained classifier.

    Walks every snapshot returned by ``get_snapshots()`` through
    ``Sources`` -> ``ChatgptSharing`` -> ``Conversations`` and predicts a
    category for each conversation's ``Prompt`` / ``Answer`` pair via
    ``combined_categorization``. Missing keys are treated as empty.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A ``(categories_count, results)`` tuple. ``categories_count`` maps each
        category to its number of conversations; it always contains the four
        predefined categories and gains an entry for any other label the model
        predicts. ``results`` lists a ``{"prompt", "answer", "category"}`` dict
        for every conversation whose category is not ``"Other"``.
    """
    categories_count = {
        "Bug": 0,
        "Feature Request": 0,
        "Theoretical Question": 0,
        "Other": 0,
    }
    results = []

    for data in get_snapshots():
        for source in data.get("Sources", []):
            for sharing in source.get("ChatgptSharing", []):
                for conversation in sharing.get("Conversations", []):
                    prompt = conversation.get("Prompt", "")
                    answer = conversation.get("Answer", "")
                    category = combined_categorization(
                        prompt, answer, model, vectorizer
                    )

                    # Predicted labels come from the training data, so they are
                    # not guaranteed to be one of the predefined keys; count an
                    # unexpected label instead of raising KeyError.
                    categories_count[category] = (
                        categories_count.get(category, 0) + 1
                    )

                    # Only add to results if category is not 'Other'
                    if category != "Other":
                        results.append(
                            {
                                "prompt": prompt,
                                "answer": answer,
                                "category": category,
                            }
                        )

    return categories_count, results


# Fit the shared vectorizer and classifier, then categorize every conversation.
# The per-category counts and the list of non-"Other" conversations are kept
# for the results cell.
train_model()

print("Starting analysis...")
model_categories_count, model_categorized_results = process_json_files_rq1_model(model, vectorizer)
print("\nAnalysis completed.\n")

Training model...
Training model complete.

Starting analysis...


NameError: name 'vectorizer' is not defined

### RQ1 Results: Issues Categorization via Trained Model

In [None]:
# Report the per-category counts produced by the trained classifier.
print("Results:")
# model_categorized_results omits conversations labelled "Other", so its length
# is not the total; sum the per-category counts instead so the total matches
# the breakdown printed below.
print("\tTotal conversations:", sum(model_categories_count.values()))

for category, count in model_categories_count.items():
    print(f"\t{category}: {count}")

# Pass concrete lists rather than dict views: views are not sequences and
# cannot be converted into the numeric array the pie chart is built from.
draw_pie_chart(
    list(model_categories_count.values()),
    list(model_categories_count.keys()),
    "Results of RQ1: Issues Categorization via Trained Model",
)

# Research Question 2

What is the typical structure of conversations between developers and ChatGPT?   
How many turns does it take on average to reach a conclusion?

In [None]:
def process_json_files_rq2():
    """Collect structural statistics about the shared conversations.

    Walks every snapshot returned by ``get_snapshots()`` through
    ``Sources`` -> ``ChatgptSharing``. For each sharing, the values stored
    under ``NumberOfPrompts``, ``TokensOfPrompts`` and ``TokensOfAnswers`` are
    collected when present and non-blank. Each entry under ``Conversations``
    is counted, and counted again when its ``ListOfCode`` is non-empty.

    Returns:
        A dict with:
          * ``"NumberOfPrompts"``, ``"TokensOfPrompts"``, ``"TokensOfAnswers"``:
            lists of the values collected per sharing;
          * ``"ListOfCode"``: number of conversations containing code;
          * ``"TotalConversations"``: number of conversations seen.
    """
    structure = {
        "NumberOfPrompts": [],
        "TokensOfPrompts": [],
        "TokensOfAnswers": [],
        "ListOfCode": 0,
        "TotalConversations": 0,
    }
    per_sharing_metrics = ("NumberOfPrompts", "TokensOfPrompts", "TokensOfAnswers")

    for data in get_snapshots():
        for source in data.get("Sources", []):
            for sharing in source.get("ChatgptSharing", []):
                for metric in per_sharing_metrics:
                    value = sharing.get(metric)
                    # Skip both missing (None) and blank ("") entries. Note that
                    # `value is not None or ""` would only test for None, because
                    # the trailing `or ""` is always falsy.
                    if value is not None and value != "":
                        structure[metric].append(value)

                # Get number of conversations having code
                for conversation in sharing.get("Conversations", []):
                    structure["TotalConversations"] += 1
                    # Truthiness also covers an explicit null "ListOfCode".
                    if conversation.get("ListOfCode"):
                        structure["ListOfCode"] += 1

    return structure


# Gather the conversation-structure statistics from all snapshots; the result
# dict is kept in structure_results for the results cell.
print("Starting analysis...")
structure_results = process_json_files_rq2()
print("\nAnalysis completed.\n")

### RQ2 Results: Structure and statistics of conversations

In [None]:
# Print the RQ2 summary: raw counts first, then the per-sharing averages.
print("Results:")

# (label, key in structure_results) for values that are printed as-is.
for heading, metric in (
    ("\tNumber of total conversations: ", "TotalConversations"),
    ("\tNumber of conversations having code: ", "ListOfCode"),
):
    print(heading, structure_results[metric])

# (label, key in structure_results) for lists that are averaged before printing.
for heading, metric in (
    ("\tAverage number of prompts in a conversations: ", "NumberOfPrompts"),
    ("\tAverage number of words in a question of a conversation: ", "TokensOfPrompts"),
    ("\tAverage number of words in an answer of a conversation: ", "TokensOfAnswers"),
):
    print(heading, calculate_average(structure_results[metric]))