# ....


In [4]:
import json
import os
import pandas as pd


In [5]:
file_path = 'dataset_conversations.txt'

In [6]:
# Define a set of feedback keywords that indicate a successful conversation.
# You may expand or adjust this list based on your data.
SUCCESS_FEEDBACK_KEYWORDS = {
    "very satisfactory", "satisfactory", "neutral", "unsatisfactory", "very unsatisfactory"
}

def is_successful(conversation, last_n=5, feedback_length_threshold=50):
    """
    Determine if a conversation is successful by checking if:
      1. One of the last `last_n` messages contains the word "feedback"
      2. At least one user message among those last messages is short enough
         (assuming feedback is typically brief)
    Returns a tuple (success_flag, feedback_message).
    """
    messages = conversation.get("inputs", {}).get("messages", [])
    if len(messages) < 3:
        return False, None  # Too short to be a proper conversation

    # Check if any of the last `last_n` messages contain the word "feedback"
    last_messages = messages[-last_n:]
    if not any("feedback" in msg.get("content", "").lower() for msg in last_messages):
        return False, None

    # Extract user messages from the last `last_n` messages
    user_messages = [msg.get("content", "").strip() for msg in last_messages if msg.get("role", "").lower() == "user"]

    # Assume feedback messages are short; filter out longer messages
    feedback_candidates = [msg for msg in user_messages if len(msg) < feedback_length_threshold]

    if feedback_candidates:
        # Return the last short message as the feedback
        return True, feedback_candidates[-1]

    return False, None

# Example usage within the processing function:
def process_conversations(file_path):
    """
    Reads and processes the dataset file.
    Returns a DataFrame with one row per conversation including:
      - metadata (as JSON/dict)
      - conversation turns (list of messages)
      - final feedback (if any)
      - success flag
      - error_info (if any)
    """
    import json
    import pandas as pd

    conversations = []

    # Determine file type: JSONL or single JSON array
    with open(file_path, "r", encoding="utf-8") as f:
        first_char = f.read(1)
        f.seek(0)
        if first_char == '[':
            data = json.load(f)
        else:
            data = [json.loads(line) for line in f if line.strip()]

    print(f"Total conversations found: {len(data)}")

    for idx, conv in enumerate(data):
        metadata = conv.get("metadata", {})
        inputs = conv.get("inputs", {})
        messages = inputs.get("messages", [])
        outputs = conv.get("outputs", {})

        success, feedback = is_successful(conv, last_n=5, feedback_length_threshold=50)

        error_info = metadata.get("error", None)

        conversation_entry = {
            "conversation_id": idx,
            "metadata": metadata,
            "messages": messages,
            "final_feedback": feedback,
            "successful": success,
            "error_info": error_info
        }
        conversations.append(conversation_entry)

    df = pd.DataFrame(conversations)
    print("Feedback Summary:")
    print(df['final_feedback'].dropna().unique())
    print(f"Successful conversations: {df['successful'].sum()} out of {len(df)}")
    return df

    # Process each conversation
    for idx, conv in enumerate(data):
        metadata = conv.get("metadata", {})
        inputs = conv.get("inputs", {})
        messages = inputs.get("messages", [])
        outputs = conv.get("outputs", {})

        # Optionally, we could check if the system prompt is the same everywhere.
        # For now, we'll leave it in the conversation turns.

        success, feedback = is_successful(conv)

        # Optionally extract error messages or timestamps if available.
        # For this example, we assume that error messages (if any) might be present in metadata or as a separate field.
        error_info = metadata.get("error", None)  # Adjust field name if needed

        conversation_entry = {
            "conversation_id": idx,
            "metadata": metadata,
            "messages": messages,
            "final_feedback": feedback,
            "successful": success,
            "error_info": error_info
        }
        conversations.append(conversation_entry)

    df = pd.DataFrame(conversations)
    print(f"Successful conversations: {df['successful'].sum()} out of {len(df)}")
    return df



# Process the dataset and create a DataFrame of conversations
df_conversations = process_conversations(file_path)

# Save the DataFrame to a CSV file for further analysis if needed
output_csv = "processed_conversations.csv"
df_conversations.to_csv(output_csv, index=False)
print(f"Processed data saved to {output_csv}")


Total conversations found: 19
Feedback Summary:
['thanks!' 'good night!' 'I dont know what else to ask :) it was fine'
 'no thank youy' 'Nothing comes to mind right now.' 'extremely helpful'
 'no']
Successful conversations: 7 out of 19
Processed data saved to processed_conversations.csv


In [7]:
df_conversations.head()

Unnamed: 0,conversation_id,metadata,messages,final_feedback,successful,error_info
0,0,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,
1,1,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",thanks!,True,
2,2,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",good night!,True,
3,3,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,
4,4,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",I dont know what else to ask :) it was fine,True,


In [8]:
df_conversations['error_info'].value_counts()

Unnamed: 0_level_0,count
error_info,Unnamed: 1_level_1


In [9]:
# look at unique values in df_conversations['successful']
df_conversations['successful'].value_counts()


Unnamed: 0_level_0,count
successful,Unnamed: 1_level_1
False,12
True,7


In [10]:
# reimplemented inside is_successful() func !

# def contains_feedback_keyword(conversation, last_n=3):
#     """
#     Check if the word "feedback" appears in the content of the last `last_n` messages.
#     """
#     messages = conversation.get("inputs", {}).get("messages", [])
#     if not messages:
#         return False
#     last_messages = messages[-last_n:]
#     for msg in last_messages:
#         if "feedback" in msg.get("content", "").lower():
#             return True
#     return False

# # After processing conversations into the DataFrame, add a new column:
# df_conversations["has_feedback_prompt"] = df_conversations.apply(
#     lambda row: contains_feedback_keyword({"inputs": {"messages": row["messages"]}}), axis=1
# )

# # Print out how many conversations include the word "feedback" in the last 3 messages
# feedback_count = df_conversations["has_feedback_prompt"].sum()
# total_conversations = len(df_conversations)

# print(f"Conversations with 'feedback' in the last 3 messages: {feedback_count} out of {total_conversations}")
# print(df_conversations["has_feedback_prompt"].value_counts())


In [11]:
df_conversations = process_conversations(file_path)

# Quick check on successful conversation counts
print(df_conversations['successful'].value_counts())

Total conversations found: 19
Feedback Summary:
['thanks!' 'good night!' 'I dont know what else to ask :) it was fine'
 'no thank youy' 'Nothing comes to mind right now.' 'extremely helpful'
 'no']
Successful conversations: 7 out of 19
successful
False    12
True      7
Name: count, dtype: int64


In [12]:
# Filter the DataFrame for successful conversations
df_successful = df_conversations[df_conversations["successful"] == True]

# Write the filtered DataFrame to an Excel file
output_excel = "successful_conversations.xlsx"
df_successful.to_excel(output_excel, index=False)

print(f"Successfully wrote {len(df_successful)} successful conversations to {output_excel}")


Successfully wrote 7 successful conversations to successful_conversations.xlsx


### Filter and convert successful msgs into readable .txt

In [13]:
# Define the output text file name
output_txt = "successful_conversations.txt"

with open(output_txt, "w", encoding="utf-8") as f:
    for idx, row in df_successful.iterrows():
        # Write a conversation separator with the conversation id
        f.write(f"----- Conversation {row['conversation_id']} -----\n\n")

        # Extract messages (list of message dictionaries)
        messages = row["messages"]
        for msg in messages:
            role = msg.get("role", "").lower()
            # Skip system messages
            if role == "system":
                continue

            # Capitalize the role for display
            role_display = "Assistant" if role == "assistant" else "User" if role == "user" else role.capitalize()
            content = msg.get("content", "").strip()
            f.write(f"{role_display}: {content}\n\n")

        # Add extra blank lines to clearly separate conversations
        f.write("\n\n")

print(f"Successfully wrote successful conversations to {output_txt}")


Successfully wrote successful conversations to successful_conversations.txt


#### system prompt extraction

In [14]:
# Extract all system prompts from each conversation
system_prompts = set()
for idx, row in df_conversations.iterrows():
    messages = row["messages"]
    # Look for the first system message in each conversation (if any)
    for msg in messages:
        if msg.get("role", "").lower() == "system":
            system_prompts.add(msg.get("content", "").strip())
            break  # Assuming only the first system prompt is relevant

# Check if we have a unique system prompt or multiple variants
if len(system_prompts) == 1:
    system_prompt = system_prompts.pop()
    print("Unique system prompt found.")
else:
    # If there are multiple, join them with a separator for review
    system_prompt = "\n\n---\n\n".join(system_prompts)
    print("Multiple system prompts found. They are combined below:")

# Save the system prompt(s) to a text file
output_system_prompt_file = "system_prompt.txt"
with open(output_system_prompt_file, "w", encoding="utf-8") as f:
    f.write(system_prompt)

print(f"System prompt(s) extracted and saved to {output_system_prompt_file}")

# Optionally, store it in a variable for later use
print("Extracted system prompt(s):")
print(system_prompt)


Unique system prompt found.
System prompt(s) extracted and saved to system_prompt.txt
Extracted system prompt(s):
You are a coach specialized in difficult conversations, dedicated to guiding people towards clearer and more direct communication, offering practical advice and helping to express thoughts and feelings effectively.

Guide the discussion using these steps:

----- STEPS ----

- Have a series of 3 interactions with the user about: What is the context of the conversation, who are the people involved.

- Have a series of 3 interactions with the user about: What are the unexpressed thoughts and feelings (Left column: what we don't say but do think or feel).

- Have a series of 3 interactions with the user about: What is the deep truth and the fundamental values affected (The essence of the unexpressed thoughts and feelings).

- Have a series of 3 interactions with the user about: What is the difference between facts and thoughts, explain the concept of the ladder of inference: th

### Conversations analysis

In [15]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [16]:
# Ensure NLTK's required resources are available.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
def compute_dialogue_length(messages):
    """
    Compute the total number of words in a conversation's messages,
    excluding messages from the 'system' role.
    """
    total_words = 0
    for msg in messages:
        role = msg.get("role", "").lower()
        # Exclude system messages
        if role == "system":
            continue
        content = msg.get("content", "")
        total_words += len(word_tokenize(content))
    return total_words

In [18]:
# Add a new column 'dialogue_length' to df_conversations
df_conversations['dialogue_length'] = df_conversations['messages'].apply(compute_dialogue_length)

# Separate successful and non-successful conversations
df_success = df_conversations[df_conversations['successful'] == True]
df_non_success = df_conversations[df_conversations['successful'] == False]

# Compute metrics for successful conversations
success_lengths = df_success['dialogue_length'].tolist()
mean_success = np.mean(success_lengths) if success_lengths else 0
median_success = np.median(success_lengths) if success_lengths else 0

# Compute metrics for non-successful conversations
non_success_lengths = df_non_success['dialogue_length'].tolist()
mean_non_success = np.mean(non_success_lengths) if non_success_lengths else 0
median_non_success = np.median(non_success_lengths) if non_success_lengths else 0

print("Successful Conversations:")
print("Mean dialogue length (words):", mean_success)
print("Median dialogue length (words):", median_success)
print("All dialogue lengths:", success_lengths)

print("\nNon-Successful Conversations:")
print("Mean dialogue length (words):", mean_non_success)
print("Median dialogue length (words):", median_non_success)
print("All dialogue lengths:", non_success_lengths)


Successful Conversations:
Mean dialogue length (words): 2029.7142857142858
Median dialogue length (words): 1934.0
All dialogue lengths: [2214, 1644, 1587, 2919, 2211, 1934, 1699]

Non-Successful Conversations:
Mean dialogue length (words): 9408.833333333334
Median dialogue length (words): 2041.0
All dialogue lengths: [87303, 4543, 3044, 2843, 1882, 1413, 1489, 1220, 1838, 3762, 2200, 1369]


there's a weird very long conversation, let's look at it separately
it's not really meaningful so I pop it from all dialogue length to see more realistic picture  


In [19]:
non_success_lengths.pop(0)
non_success_lengths

[4543, 3044, 2843, 1882, 1413, 1489, 1220, 1838, 3762, 2200, 1369]

In [20]:
f'without outliers: mean dialogue length: {np.mean(non_success_lengths)}, median length: {np.median(non_success_lengths)}'

'without outliers: mean dialogue length: 2327.5454545454545, median length: 1882.0'

In [21]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m122.9/244.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [22]:
# write ALL conversations to WORD document
from docx import Document

doc = Document()
doc.add_heading('All Conversations', level=1)

# Loop through each conversation in the DataFrame (df_conversations)
for idx, row in df_conversations.iterrows():
    conv_id = row.get("conversation_id", idx)
    doc.add_heading(f'Conversation {conv_id}', level=2)

    # Retrieve the list of messages for the conversation
    messages = row.get("messages", [])

    # Loop through messages in their original order
    for msg in messages:
        role = msg.get("role", "").lower()
        # Skip system messages to keep the conversation between user and assistant
        if role == "system":
            continue
        # Ensure role is displayed in order: user, assistant, etc.
        role_display = "User" if role == "user" else "Assistant" if role == "assistant" else role.capitalize()
        content = msg.get("content", "").strip()
        doc.add_paragraph(f"{role_display}: {content}")

    # Add an extra blank paragraph to separate conversations
    doc.add_paragraph("\n")

# Save the document
output_docx = "all_conversations.docx"
doc.save(output_docx)
print(f"All conversations have been saved to {output_docx}")


All conversations have been saved to all_conversations.docx


In [32]:
df_conversations


Unnamed: 0,conversation_id,metadata,messages,final_feedback,successful,error_info,dialogue_length,turn_metrics
0,0,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,87303,"{'turn_count': 1325, 'user_turns': 663, 'assis..."
1,1,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",thanks!,True,,2214,"{'turn_count': 31, 'user_turns': 16, 'assistan..."
2,2,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",good night!,True,,1644,"{'turn_count': 25, 'user_turns': 13, 'assistan..."
3,3,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,4543,"{'turn_count': 41, 'user_turns': 21, 'assistan..."
4,4,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",I dont know what else to ask :) it was fine,True,,1587,"{'turn_count': 23, 'user_turns': 12, 'assistan..."
5,5,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",no thank youy,True,,2919,"{'turn_count': 39, 'user_turns': 20, 'assistan..."
6,6,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,3044,"{'turn_count': 33, 'user_turns': 17, 'assistan..."
7,7,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,2843,"{'turn_count': 41, 'user_turns': 21, 'assistan..."
8,8,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,1882,"{'turn_count': 25, 'user_turns': 13, 'assistan..."
9,9,"{'dataset_split': ['base'], 'ls_model_type': '...","[{'role': 'system', 'content': 'You are a coac...",,False,,1413,"{'turn_count': 19, 'user_turns': 10, 'assistan..."


In [23]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from docx import Document

# Ensure NLTK resources are downloaded.
nltk.download('punkt')
nltk.download('stopwords')

# --- Part 1: Write non‑successful conversations to a Word document ---

# Assume df_conversations is already loaded and processed with a "successful" flag
# and a "messages" column containing the conversation turns.
# We also assume that a column "dialogue_length" (total word count excluding system messages)
# is already computed as in our previous analysis.

# Filter non-successful conversations
df_non_success = df_conversations[df_conversations['successful'] == False]

# Create a Word document
doc = Document()
doc.add_heading('Non-Successful Conversations', level=1)

for idx, row in df_non_success.iterrows():
    conv_id = row['conversation_id']
    doc.add_heading(f'Conversation ID: {conv_id}', level=2)

    # Iterate through messages in original order, skipping system messages.
    messages = row['messages']
    for msg in messages:
        role = msg.get("role", "").lower()
        if role == "system":
            continue
        # Capitalize role for clarity.
        role_display = "User" if role == "user" else "Assistant" if role == "assistant" else role.capitalize()
        content = msg.get("content", "").strip()
        # Add each message as a paragraph.
        doc.add_paragraph(f"{role_display}: {content}")
    # Add a blank paragraph as separator between conversations.
    doc.add_paragraph("\n")

# Save the document.
doc_output = "non_successful_conversations.docx"
doc.save(doc_output)
print(f"Non-successful conversations saved to {doc_output}")

# --- Part 2: Extract and analyze the outlier conversation (87303 words) ---

# Find the outlier conversation in df_non_success.
outlier_df = df_non_success[df_non_success['dialogue_length'] == 87303]
if outlier_df.empty:
    print("No conversation with 87303 words found.")
else:
    # Assume there's only one outlier; extract it.
    outlier_conv = outlier_df.iloc[0]
    outlier_id = outlier_conv['conversation_id']
    print(f"Outlier conversation found with ID: {outlier_id}")

    # Extract full text of the conversation (excluding system messages)
    conv_text_parts = []
    for msg in outlier_conv['messages']:
        role = msg.get("role", "").lower()
        if role == "system":
            continue
        role_display = "User" if role == "user" else "Assistant" if role == "assistant" else role.capitalize()
        content = msg.get("content", "").strip()
        conv_text_parts.append(f"{role_display}: {content}")
    conv_text = "\n\n".join(conv_text_parts)

    # Print basic analysis:
    # Tokenize the text.
    tokens = word_tokenize(conv_text.lower())
    # Remove stopwords and punctuation.
    stop_words = set(stopwords.words('english'))
    punctuation = set(".,!?;:")
    filtered_tokens = [t for t in tokens if t not in stop_words and t not in punctuation]

    # Compute frequency distribution.
    freq_dist = Counter(filtered_tokens)
    most_common = freq_dist.most_common(10)

    print(f"\nOutlier Conversation (ID: {outlier_id}) Analysis:")
    print(f"Total words (after tokenization): {len(tokens)}")
    print("Top 10 most common words (excluding stopwords/punctuation):")
    for word, count in most_common:
        print(f"  {word}: {count}")

    # Optionally, you can save the outlier conversation text to a separate .txt file for further review.
    with open("outlier_conversation.txt", "w", encoding="utf-8") as f_out:
        f_out.write(conv_text)
    print("Outlier conversation text saved to outlier_conversation.txt")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Non-successful conversations saved to non_successful_conversations.docx
Outlier conversation found with ID: 0

Outlier Conversation (ID: 0) Analysis:
Total words (after tokenization): 89953
Top 10 most common words (excluding stopwords/punctuation):
  team: 2383
  *: 2332
  member: 1482
  's: 1182
  help: 1174
  thoughts: 1162
  situation: 1161
  conversation: 962
  review: 934
  user: 663
Outlier conversation text saved to outlier_conversation.txt


In [24]:
# Example DataFrame structure:
# Each row is a conversation with a 'messages' column containing a list of message dictionaries.
# Example message: {"role": "user", "content": "Hello, how are you?"}

def compute_turn_metrics(messages):
    turn_count = 0
    user_turns = 0
    assistant_turns = 0
    total_words = 0
    words_per_turn = []

    for msg in messages:
        role = msg.get("role", "").lower()
        # Optionally, skip system messages:
        if role == "system":
            continue
        turn_count += 1
        if role == "user":
            user_turns += 1
        elif role == "assistant":
            assistant_turns += 1

        content = msg.get("content", "")
        tokens = word_tokenize(content)
        word_count = len(tokens)
        total_words += word_count
        words_per_turn.append(word_count)

    avg_turn_length = total_words / turn_count if turn_count > 0 else 0
    return {
        "turn_count": turn_count,
        "user_turns": user_turns,
        "assistant_turns": assistant_turns,
        "total_words": total_words,
        "avg_turn_length": avg_turn_length,
        "words_per_turn": words_per_turn
    }

# Assuming df_conversations is your DataFrame with each conversation in a 'messages' column.
df_conversations["turn_metrics"] = df_conversations["messages"].apply(compute_turn_metrics)

# Example: Extract and display metrics for the 2nd conversation.
first_metrics = df_conversations.iloc[1]["turn_metrics"]
print('conversation #2')
print("Turn count:", first_metrics["turn_count"])
print("User turns:", first_metrics["user_turns"])
print("Assistant turns:", first_metrics["assistant_turns"])
print("Average turn length (words):", first_metrics["avg_turn_length"])


conversation #2
Turn count: 31
User turns: 16
Assistant turns: 15
Average turn length (words): 71.41935483870968


In [25]:
# Loop over all conversations and print turn metrics
for index, row in df_conversations.iterrows():
    # Calculate metrics for the current conversation
    metrics = compute_turn_metrics(row["messages"])
    conv_id = row.get("conversation_id", index)

    print(f"Conversation {conv_id}:")
    print(f"  Total Turns: {metrics['turn_count']}")
    print(f"  User Turns: {metrics['user_turns']}")
    print(f"  Assistant Turns: {metrics['assistant_turns']}")
    print(f"  Total Words: {metrics['total_words']}")
    print(f"  Average Turn Length: {metrics['avg_turn_length']:.2f} words")
    print("-" * 40)


Conversation 0:
  Total Turns: 1325
  User Turns: 663
  Assistant Turns: 662
  Total Words: 87303
  Average Turn Length: 65.89 words
----------------------------------------
Conversation 1:
  Total Turns: 31
  User Turns: 16
  Assistant Turns: 15
  Total Words: 2214
  Average Turn Length: 71.42 words
----------------------------------------
Conversation 2:
  Total Turns: 25
  User Turns: 13
  Assistant Turns: 12
  Total Words: 1644
  Average Turn Length: 65.76 words
----------------------------------------
Conversation 3:
  Total Turns: 41
  User Turns: 21
  Assistant Turns: 20
  Total Words: 4543
  Average Turn Length: 110.80 words
----------------------------------------
Conversation 4:
  Total Turns: 23
  User Turns: 12
  Assistant Turns: 11
  Total Words: 1587
  Average Turn Length: 69.00 words
----------------------------------------
Conversation 5:
  Total Turns: 39
  User Turns: 20
  Assistant Turns: 19
  Total Words: 2919
  Average Turn Length: 74.85 words
---------------------

**Turn Balance:**
Most conversations show a balanced turn distribution between the user and the assistant. For instance, in almost every dialogue, the number of user turns and assistant turns is almost equal, which may indicate that both parties are engaged in a back-and-forth exchange. This balance could be a positive indicator of a collaborative conversation.

**Conversation Length Variation:**
The total turn counts vary widely—from as few as 13 turns (Conversation 12) to as many as 43 turns (Conversation 14). Longer conversations (in terms of turns or total words) might suggest more in‑depth or complex discussions, while shorter ones could be more straightforward or focused.

**Average Turn Length Differences:**
Average turn lengths range from about 51 words (Conversation 17) to over 110 words (Conversation 3). Longer turns may *indicate more detailed or explanatory responses*, whereas shorter turns might be more to the point.

For example, **Conversation 3**’s higher average (110.80 words per turn) could reflect a more elaborative or detailed exchange, perhaps tackling a more nuanced issue.
In contrast, **Conversation 17**’s lower average (51.48 words per turn) might suggest a brisk, succinct conversation style.

**Implications for Dialogue Quality:**
A balanced turn-taking structure generally indicates that both sides are contributing. However, whether longer or shorter turns lead to higher user satisfaction **might depend on the context**:

In some situations, concise answers (shorter turns) could be preferred for efficiency.
In other contexts, more detailed answers (longer turns) might be necessary to cover complex topics.

**Potential for Further Analysis:**
It could be useful to correlate these turn metrics with other factors (e.g., user feedback or outcome measures) to see if, for example, conversations with a particular range of average turn lengths tend to be rated more highly. Also, analyzing whether longer conversations tend to be more engaging or if they sometimes indicate over-elaboration could provide deeper insights.

#### **Analyzing conversation #3**
with the highest average words per turn

**Summary**:

In this conversation, the user prepares for a difficult discussion with a client. The context is that the user's team is undergoing a restructuring—because the business is moving to a more cost‑effective country, the team needs to meet a specific Leader-to-team member ratio, which now renders one Leader surplus. The user explains that one Leader, who has demonstrated strong technical skills and contributed significantly to the team, might be re‑assigned to a new role. However, the client is known to be extremely cost‑focused and numbers‑oriented and is new to this line of business, meaning that trust hasn’t been fully established yet.

Throughout the conversation, the assistant guides the user in clarifying not only the factual background (such as the current ratio and the observed technical proficiency) but also the deeper concerns and unexpressed feelings. The user reveals internal conflicts: on one hand, a sense of responsibility to advocate for the team and explore all options for retaining valuable talent; on the other, a worry about being perceived as misaligned with the client’s cost‑cutting strategy or as incompetent. The conversation delves into these layers, touching on the user's values around human‑centric leadership, professional integrity, and the importance of business continuity. The assistant helps the user outline both the context and an example dialogue, and later assists in outlining a pilot proposal to test the new role’s value while addressing cost concerns.

**Conclusions and Relevance:**

**Depth of Analysis:**
The conversation is highly relevant because it not only captures the logistical challenges (such as the need to adjust team ratios) but also deeply explores the emotional and value‑based dimensions of the issue. The assistant’s probing questions help the user articulate both the observable facts and the underlying concerns—key elements when preparing for a strategic conversation with a cost‑focused client.

**Balanced Perspective**:
The dialogue shows that the user is trying to balance two important objectives: honoring the client's financial priorities while ensuring that valuable team capabilities aren’t lost. This balance is essential for maintaining business continuity and strategic alignment. The conversation’s structure highlights the importance of empathy, active listening, and finding common ground—critical skills in challenging negotiations.

**Actionable Outcome**:
The discussion culminates in concrete next steps, such as drafting a pilot proposal with clear KPIs. By outlining a proposal that includes an initial assessment, a pilot phase, and regular reviews, the conversation directly supports the user in taking a data‑driven, measured approach that could mitigate risks associated with both talent loss and client cost concerns.

**Relevance to Business Strategy**:
Given that the client’s priorities are sharply focused on cost reduction, the conversation’s detailed exploration of how to demonstrate value (e.g., through efficiency gains and strategic role re‑assignment) is particularly pertinent. It ensures that the user's approach is aligned with both the emotional needs of the team and the practical, financial demands of the client.

**Overall**, Conversation #3 is an excellent example of using a structured, reflective approach to prepare for a difficult conversation. It demonstrates how combining objective data with an exploration of personal values and concerns can lead to a more effective, nuanced strategy that addresses both internal and external stakeholder needs.

## LangChain

In [26]:
!pip install langchain openai pandas



In [27]:
!pip install --upgrade langchain langchain-community langchain-openai


Collecting langchain
  Downloading langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.9-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.7 (from langchain)
  Downloading langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting openai<2.0.0,>=1.66.3 (from langchain-openai)
  Downloading openai-1.66.5-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (fro

In [28]:
import langchain
print(dir(langchain))



In [29]:
import os
import json
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

import re

In [30]:
os.environ["OPENAI_API_KEY"] = "" # Replace with actual key!

In [None]:
conversations = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        try:
            conversations.append(json.loads(line.strip()))
        except json.JSONDecodeError:
            continue  # Skip invalid lines

# Extract relevant data

data_list = []
for convo in conversations:
    messages = convo.get("inputs", {}).get("messages", [])

    # Extract user messages
    user_messages = [msg["content"] for msg in messages if msg["role"] == "user"]

    # Extract feedback correctly
    feedback_messages = [msg for msg in user_messages if any(
        kw in msg.lower() for kw in ["satisfactory", "helpful", "useful", "positive", "negative"]
    )]

    # Store extracted information
    data_list.append({
        "user_messages": " ".join(user_messages),  # Combine all user messages for LLM input
        "feedback_messages": " ".join(feedback_messages),
    })


# Convert to DataFrame
df = pd.DataFrame(data_list)

# Initialize LangChain LLM
llm = ChatOpenAI(model_name="gpt-4o")


In [None]:
# Function to categorize conversations and extract trends
def analyze_conversation(convo_text):
    prompt = f"""
    Analyze the following conversation and categorize it into a main theme (e.g., "Conflict Resolution", "Leadership Coaching", "Feedback Handling").
    Also, determine the user's satisfaction level based on their feedback.

    Conversation:
    {convo_text}

    Return the result in JSON format with:
    - "category": The conversation theme
    - "satisfaction": "High", "Medium", or "Low"
    """

    response = llm.invoke([HumanMessage(content=prompt)]).content

    # Fix: Remove markdown formatting (backticks) from JSON response
    clean_response = re.sub(r"```json\n|\n```", "", response).strip()

    # Handle empty or malformed responses
    try:
        return json.loads(clean_response)  # Convert response to dictionary
    except json.JSONDecodeError:
        print(f"Warning: LLM returned an unparseable response:\n{response}\n")
        return {"category": "Unknown", "satisfaction": "Unknown"}  # Default fallback


# Apply AI analysis to each conversation
df["analysis"] = df["user_messages"].apply(analyze_conversation)

# Extract structured results
df["category"] = df["analysis"].apply(lambda x: x.get("category", "Unknown"))
df["satisfaction"] = df["analysis"].apply(lambda x: x.get("satisfaction", "Unknown"))

# Save processed data
df.to_csv("conversation_analysis_results.csv", index=False)

# Display results
print(df.head())

# Group by category to analyze trends
category_trends = df.groupby("category")["satisfaction"].value_counts(normalize=True).unstack()

# Display trends in satisfaction across categories
print("\nUser Satisfaction Trends by Conversation Type:")
print(category_trends)


                                       user_messages  \
0  Review a difficult conversation I already had ...   
1  Prepare for a difficult conversation I am goin...   
2  Review a difficult conversation I already had ...   
3  Prepare for a difficult conversation I am goin...   
4  Review a difficult conversation I already had ...   

                                   feedback_messages  \
0  THere is a team member in my team who is alway...   
1  The goal of the training is the leadership dev...   
2                                  very satisfactory   
3                very satisfactory extremely helpful   
4                                  very satisfactory   

                                            analysis             category  \
0  {'category': 'Conflict Resolution', 'satisfact...  Conflict Resolution   
1  {'category': 'Conflict Resolution', 'satisfact...  Conflict Resolution   
2  {'category': 'Feedback Handling', 'satisfactio...    Feedback Handling   
3  {'category': 'C

In [None]:
category_trends.head()

NameError: name 'category_trends' is not defined

##### further attempts to play with it

In [None]:
from langchain.evaluation import l.oad_evaluator
from langchain_openai import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage

In [None]:
llm = ChatOpenAI(model_name="gpt-4o")

evaluator = load_evaluator(
    "criteria",
    llm=llm,
    criteria={
        "conciseness": "Is the response short and to the point while still being informative?",
        "relevance": "Does the response directly address the user's question?",
        "helpfulness": "Does the response provide meaningful and useful information?",
        "completeness": "Is the response sufficiently detailed and does it cover all aspects of the query?"
    }
)



In [None]:
evaluation_results = []
for convo in conversations:
    messages = convo.get("inputs", {}).get("messages", [])

    # Extract user & assistant messages
    user_messages = [msg["content"] for msg in messages if msg.get("role") == "user"]
    assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]

    # Skip evaluation if assistant response is missing
    if not assistant_messages:
        continue

    # Select last assistant response for evaluation
    ai_response = assistant_messages[-1]
    user_input = user_messages[-1] if user_messages else ""

    # Evaluate AI response using LangChain
    eval_result = evaluator.evaluate_strings(
        prediction=ai_response,
        input=user_input
    )

    # Store structured results
    evaluation_results.append({
        "user_input": user_input,
        "ai_response": ai_response,
        "evaluation": eval_result
    })

# Convert to DataFrame for analysis
df_evaluation = pd.DataFrame(evaluation_results)
df_evaluation.head()

Unnamed: 0,user_input,ai_response,evaluation
0,THere is a team member in my team who is alway...,Thank you for sharing more about the situation...,{'reasoning': 'To assess whether the submissio...
1,thanks!,Thank you for your kind feedback! I'm glad you...,{'reasoning': 'To assess whether the submissio...
2,good night!,No worries at all! I'm glad to hear that your ...,{'reasoning': 'To evaluate whether the submiss...
3,looks good. Thank you again,Great to hear that the conversation example wo...,{'reasoning': '**Conciseness:** 1. The submi...
4,I dont know what else to ask :) it was fine,I'm really pleased to hear that it helped! \n\...,{'reasoning': 'To evaluate whether the submiss...


In [None]:
# Extract key evaluation details into a structured format
evaluation_data = []

for index, row in df_evaluation.iterrows():
    eval_result = row["evaluation"]

    # Extract details if available
    reasoning = eval_result.get("reasoning", "N/A")  # Full explanation
    value = eval_result.get("value", "N/A")  # Pass/Fail/Other criteria
    score = eval_result.get("score", None)  # Numeric Score

    # Store structured data
    evaluation_data.append({
        "user_input": row["user_input"],
        "ai_response": row["ai_response"],
        "reasoning": reasoning,
        "value": value,
        "score": score
    })

# Convert to DataFrame for easy visualization
df_structured_eval = pd.DataFrame(evaluation_data)



In [None]:
df_structured_eval['value'].value_counts(), df_structured_eval['score'].value_counts() # score is 0-1 (binary classified)

(value
 N    17
 Y     2
 Name: count, dtype: int64,
 score
 0    17
 1     2
 Name: count, dtype: int64)

In [None]:
#switching to 10-scale
evaluator = load_evaluator(
    "criteria",
    llm=llm,
    criteria={
        "conciseness": "Rate on a scale of 0-10. Is the response short and to the point while still being informative?",
        "relevance": "Rate on a scale of 0-10. Does the response directly address the user's question?",
        "helpfulness": "Rate on a scale of 0-10. Does the response provide meaningful and useful information?",
        "completeness": "Rate on a scale of 0-10. Is the response sufficiently detailed and does it cover all aspects of the query?"
    }
)


In [None]:
evaluation_results = []

for convo in conversations:
    messages = convo.get("inputs", {}).get("messages", [])

    # Extract user & assistant messages
    user_messages = [msg["content"] for msg in messages if msg.get("role") == "user"]
    assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]

    # Skip evaluation if assistant response is missing
    if not assistant_messages:
        continue

    # Select last assistant response for evaluation
    ai_response = assistant_messages[-1]
    user_input = user_messages[-1] if user_messages else ""

    # Evaluate AI response using LangChain
    eval_result = evaluator.evaluate_strings(
        prediction=ai_response,
        input=user_input
    )

    # Extract structured evaluation details
    evaluation_results.append({
        "user_input": user_input,
        "ai_response": ai_response,
        "conciseness_score": eval_result.get("conciseness", {}).get("score", "N/A"),
        "relevance_score": eval_result.get("relevance", {}).get("score", "N/A"),
        "helpfulness_score": eval_result.get("helpfulness", {}).get("score", "N/A"),
        "completeness_score": eval_result.get("completeness", {}).get("score", "N/A"),
        "reasoning": eval_result.get("reasoning", "N/A")  # Store evaluation explanation
    })

df_evaluation = pd.DataFrame(evaluation_results)




In [None]:
# Extract key evaluation details into a structured format
evaluation_data = []

for index, row in df_evaluation.iterrows():
    eval_result = row["evaluation"]

    # Extract main reasoning, score, and value
    reasoning = eval_result.get("reasoning", "N/A")
    value = eval_result.get("value", "N/A")
    overall_score = eval_result.get("score", "N/A")  # Numeric score

    # Extract individual criterion scores (if available in reasoning)
    conciseness_score = None
    relevance_score = None
    helpfulness_score = None
    completeness_score = None

    # Manually parse reasoning to extract scores
    if "conciseness" in reasoning.lower():
        conciseness_score = overall_score  # Placeholder (we need a better way to parse this)
    if "relevance" in reasoning.lower():
        relevance_score = overall_score
    if "helpfulness" in reasoning.lower():
        helpfulness_score = overall_score
    if "completeness" in reasoning.lower():
        completeness_score = overall_score

    # Store structured data
    evaluation_data.append({
        "user_input": row["user_input"],
        "ai_response": row["ai_response"],
        "conciseness_score": conciseness_score,
        "relevance_score": relevance_score,
        "helpfulness_score": helpfulness_score,
        "completeness_score": completeness_score,
        "overall_score": overall_score,
        "value": value,
        "reasoning": reasoning
    })

# Convert to DataFrame for easy visualization
df_structured_eval = pd.DataFrame(evaluation_data)


In [None]:
df_structured_eval.head()

Unnamed: 0,user_input,ai_response,conciseness_score,relevance_score,helpfulness_score,completeness_score,overall_score,value,reasoning
0,THere is a team member in my team who is alway...,Thank you for sharing more about the situation...,0,0,0,0,0,N,To assess whether the submission meets all the...
1,thanks!,Thank you for your kind feedback! I'm glad you...,0,0,0,0,0,N,To assess whether the submission meets the cri...
2,good night!,No worries at all! I'm glad to hear that your ...,0,0,0,0,0,N,To evaluate whether the submission meets all t...
3,looks good. Thank you again,Great to hear that the conversation example wo...,1,1,1,1,1,Y,**Conciseness:** \n1. The submission is relat...
4,I dont know what else to ask :) it was fine,I'm really pleased to hear that it helped! \n\...,0,0,0,0,0,N,To evaluate whether the submission meets the c...


In [None]:
df_structured_eval['overall_score'].value_counts()

Unnamed: 0_level_0,count
overall_score,Unnamed: 1_level_1
0,17
1,2


In [None]:
# Check evaluator response for multiple cases
for i in range(3):  # Print first 3 evaluations
    print(f"\n🔹 Evaluation {i+1}:")
    print(json.dumps(df_evaluation['evaluation'][i], indent=2))



🔹 Evaluation 1:
{
  "reasoning": "To assess whether the submission meets all the criteria listed, let's evaluate each criterion step-by-step:\n\n1. **Conciseness**: \n   - The response isn't particularly concise. It includes a detailed suggestion to explore unexpressed thoughts and feelings, which, while potentially useful, might be more elaborate than necessary given the input. The response could have been shortened by omitting some exploratory questions or directly suggesting a simple solution.\n\n2. **Relevance**: \n   - The response attempts to address the broader issue of managing negativity within a team. However, the original input does not explicitly ask for assistance or advice, but rather describes a situation. While relevance could be interpreted broadly in terms of addressing the challenge posed, the response doesn't directly address any specific query, because none was stated in the input.\n\n3. **Helpfulness**: \n   - The response offers a somewhat helpful approach by en

## Ideas for future analysis
1.   **RAGAS for Conversational AI:**

  RAGAS helps you evaluate how relevant the responses are by comparing the context, user intent, and factual accuracy.

  It measures the **semantic relevance of the response**, making sure the chatbot answers **in line with the user's needs and expectations**.

  RAGAS evaluates how well the system maintains **conversation continuity**, ensuring that each response makes sense based on previous exchanges.

  Retrieval relevance is also considered—whether the **assistant pulls the right information** from the right sources and presents it in an **effective, coherent way.**

  **Evaluating User Satisfaction**: Some advanced evaluation systems, including RAGAS, may allow for incorporating user feedback to improve the chatbot’s performance. If users indicate that responses were not relevant or helpful, this feedback can be used to adjust the relevance assessment and further fine-tune the chatbot's behavior.


2. **Proper usage of langchain**

For now it seems useless or I cannot leverage the power of it

3. ...

4. ...

In [None]:
!pip freeze > requirements.txt


In [None]:
!pip install openpyxl




In [None]:
import openpyxl # Corrected the import statement
print(openpyxl.__version__)

3.1.5
