<a href="https://colab.research.google.com/github/jerichosy/THS-STX_user-idea-similarity-for-fixation/blob/main/Similarity_Metric_on_AIdea_Map.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For context, these are the ideas the users came up with throughout the session.

We want to know how the ideas evolved/progressed across iterations; that is, how similar are consecutive ideas to each other as the users iterated with our tool? A higher similarity score between iterations of an idea may indicate that the user remained fixated on the idea and didn't explore/branch out.

In [None]:
# Install the required libraries: boto3 (used below to create the Bedrock
# runtime client) and cohere (used below to create a Cohere client).
# NOTE(review): numpy and scipy are imported below but not installed here —
# presumably they are preinstalled in the notebook runtime; confirm.
!pip install boto3 cohere



In [None]:
import cohere
import numpy as np
from scipy.spatial.distance import cosine
import os
import boto3
import json

# Initialize Cohere client (the key is a placeholder to be replaced).
# NOTE(review): `co` is not referenced by any code visible in this notebook;
# embeddings are requested through the Bedrock client created below.
co = cohere.Client('your_api_key_here')

# Set AWS credentials and region.
# The key values are intentionally blank here and must be filled in before
# running; presumably boto3 picks them up from the environment when the
# client below is created — confirm against your credential setup.
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

# Initialize the Bedrock client
bedrock = boto3.client(service_name='bedrock-runtime')
# Model identifier passed as `modelId` on every invoke_model call in
# get_embeddings.
model_id = "cohere.embed-english-v3"

def get_embeddings(texts, batch_size=96):
    """Embed texts with the Cohere embedding model served through Bedrock.

    Texts are sent in batches rather than one request per text, which cuts
    the number of network round trips from len(texts) to
    ceil(len(texts) / batch_size).

    Args:
        texts: Iterable of strings to embed.
        batch_size: Maximum number of texts sent per request. The default
            of 96 is the documented per-call limit for Cohere Embed on
            Amazon Bedrock.

    Returns:
        A list with one embedding per input text, in input order (the
        embeddings of each response are appended in the order returned).
        An empty input yields an empty list without calling the service.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so generators and other one-shot iterables can be sliced.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        body = json.dumps({
            "texts": texts[start:start + batch_size],
            "input_type": "clustering"
        })
        response = bedrock.invoke_model(
            body=body,
            modelId=model_id,
            accept="application/json",
            contentType="application/json"
        )
        response_body = json.loads(response['body'].read())
        # One embedding per text of this batch.
        embeddings.extend(response_body['embeddings'])
    return embeddings

def calculate_similarity_scores(data):
    """Summarize consecutive-idea similarity for every person and idea.

    Args:
        data: Mapping of person -> mapping of idea key -> list of branches,
            where each branch is handed to process_branch.

    Returns:
        Mapping of person -> list of summary dicts, one per idea that
        produced at least one similarity score. Each summary holds the
        idea key, the mean and standard deviation of the scores, and the
        raw scores. Ideas without scores are omitted, so a person may map
        to an empty list.
    """
    results = {}

    for person, ideas in data.items():
        # DEBUG
        print("--", person, "--")

        summaries = []
        for idea_key, idea_branches in ideas.items():
            # DEBUG
            print(" -", idea_key, "- ")

            # Flatten the scores of all branches of this idea, in branch order.
            scores = [
                score
                for branch in idea_branches
                for score in (process_branch(branch) or ())
            ]
            if not scores:
                continue

            summaries.append({
                'idea_key': idea_key,
                'mean_similarity': np.mean(scores),
                'std_similarity': np.std(scores),
                'similarities': scores
            })

        results[person] = summaries

    return results

def process_branch(branch):
    """Return similarity scores of consecutive ideas along a branch's paths.

    The branch is a nested list: strings are appended to the current path in
    order, and every nested list is explored as a sub-branch that continues
    from a copy of the path built so far. Only paths that end at a leaf and
    contain at least two ideas are kept.

    Each distinct (idea, next idea) pair is scored once even when several
    paths share it, and each distinct idea text is embedded only once per
    call, so shared path prefixes do not trigger repeated embedding requests.

    Args:
        branch: Nested list of idea strings as described above.

    Returns:
        List of cosine similarities (1 - cosine distance), ordered by the
        first appearance of each pair across the collected paths. Empty when
        the branch yields no path with two or more ideas.
    """
    all_paths = []

    def recurse(sub_branch, current_path):
        is_leaf = True  # Assume it's a leaf node until proven otherwise

        for item in sub_branch:
            if isinstance(item, list):
                # Copy the path so sibling sub-branches do not share state.
                recurse(item, current_path[:])
                is_leaf = False  # It's not a leaf node if it has sub-branches
            else:
                current_path.append(item)

        # Add valid paths (with at least two nodes and ending at a leaf)
        if is_leaf and len(current_path) > 1:
            all_paths.append(current_path)

    recurse(branch, [])

    # Collect every consecutive pair once, keeping first-seen order.
    unique_pairs = []
    seen_pairs = set()
    for path in all_paths:
        for pair in zip(path, path[1:]):
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                unique_pairs.append(pair)

    all_similarities = []
    if unique_pairs:  # Skip the embedding request entirely when nothing to score
        # dict.fromkeys de-duplicates the texts while preserving order.
        unique_texts = list(
            dict.fromkeys(text for pair in unique_pairs for text in pair)
        )
        embedding_by_text = dict(zip(unique_texts, get_embeddings(unique_texts)))
        all_similarities = [
            1 - cosine(embedding_by_text[first], embedding_by_text[second])
            for first, second in unique_pairs
        ]

    # DEBUG
    print(f"{all_paths=}")
    print(f"{all_similarities=}")

    return all_similarities

# New dataset
# Structure: user -> idea key -> list of branches. Each branch is a nested
# list consumed by process_branch: strings are consecutive subideas on the
# same path (in order), and a nested list holds the sub-branches that
# continue from the subideas listed before it. A branch containing a single
# string produces no path of two or more ideas and therefore no score.
data = {
    "user 2": {
        "idea 1": [
            [
                "Easily accessible big groups to find people to communicate with",
                [
                    [
                        "Social media persona that isn't bound to specific directions"
                    ],
                    [
                        "Differentiate personal and professional social media presence"
                    ]
                ]
            ],
            [
                "Cater to occasional posting about job opportunities and projects"
            ]
        ],
        "idea 2": [
            [
                "A social media account different from the user's personal account would be good"
            ]
        ],
        "idea 3": [
            [
                "Not intending to replace social media platforms since that isn't the goal"
            ],
            [
                "Professional-focused social media presence that can be a central source to branch off from",
                [
                    [
                        "Branching out to focus on accessibility"
                    ],
                    [
                        "Researching about several different companies", "Show experience and work-related hobbies"
                    ]
                ]
            ]
        ]
    },
    "user 3": {
        "idea 1": [
            ["Promote fun interactions between hosts and viewers in the social media platform"]
        ]
    },
    "user 4": {
        "idea 1": [
            [
                "Simple, familiar menu items that will easily catch their eye",
                "Simple, user-friendly tech like an order status monitor",
                [
                    ["App with ordering feature and rewards program"],
                    ["Cozy, comfortable space for students. But, concerned about students staying too long and running out of tables for new customers."]
                ]
            ]
        ]
    },
    "user 5": {
        "idea 1": [
            ["Loyalty card program offering discounts and free items on the menu"]
        ],
        "idea 2": [
            ["After-hours transformation into a bar for a social drinking setting"]
        ]
    },
    "user 7": {
        "idea 1": [
            ["Allowing users to discover and connect with potential mentors in the user's field is more efficient than the traditional method of in-person interaction", "Exclusive paid subscriptions for premium content and services", "Allows users to see a sneak peek before selecting and paying for a subscription"]
        ],
        "idea 2": [
            ["Add related posts or ads in the user's news feed in line with the usual content they post or view", "Interrupt the user every now and then to rate their experience within the app through feedback surveys"]
        ]
    },
    "user 8": {
        "idea 1": [
            [
                "Cafe for college students to work and study",
                "Arcade, bar, and other recreational activities for college students",
                "Where can college students go when they need to take time off studies",
                "Which is better paired with a restaurant: arcade or bar?",
                "Menu offerings that are nutritious and lower stress levels to keep college students healthy",
                [
                    ["Build-your-own meal options to cater to picky eaters while still providing healthy options", "Buffet-style dining?"],
                    ["Comfort foods and desserts to bring in more people"]
                ]
            ]
        ]
    },
    "user 9": {
        "idea 1": [
            ["Should the food be pricey and fancy or cheap and locally sourced?"]
        ],
        "idea 2": [
            ["Thoughts on poor students?"]
        ]
    },
    "user 10": {
        "idea 1": [
            ["Examples of features"]
        ]
    },
    "user 11": {
        "idea 1": [
            ["Social media monetization", "Balancing user experience with revenue generation to avoid deterring and frustrating users"],
            ["Project/task management and tracking, and team collaboration is unnecessary"]
        ],
        "idea 2": [
            ["Mentorship and project collaboration features may not cater to professions not interested in catering to neophytes", "Modular design and different monetization schemas for different user segments"],
            ["Students unable to access premium features due to financial constraints"]
        ],
        "idea 3": [
            ["Gamifying the platform might be distracting, which is not the theme of the platform", "Concrete examples of modular gamification?"],
            ["How might users connect with one another to form accountability groups?"]
        ]
    },
    "user 12": {
        "idea 1": [
            ["Provide entertainment, drinks, and additions like extra rice, extra sauce, etc."],
            ["Combat abuse of the loyalty program by requiring valid government ID"]
        ],
        "idea 2": [
            ["Offer the most unique and tasty dishes of a certain cuisine (Chinese)"]
        ],
        "idea 3": [
            ["Design a lively and colorful Chinese restaurant"]
        ]
    }
}

# Calculate similarity scores
# (calculate_similarity_scores also prints DEBUG lines while it runs.)
similarity_results = calculate_similarity_scores(data)

# Print results
# For every person, list each idea that produced at least one similarity
# score; a person without any such idea prints only their name.

# Per-idea mean similarities collected across all persons.
all_mean_similarities = []
for person, results in similarity_results.items():
    print(f"{person}:")
    for result in results:
        print(f"  {result['idea_key']}:")
        print(f"    Mean similarity: {result['mean_similarity']:.4f} (std: {result['std_similarity']:.4f})")
        print(f"    Similarity scores: {result['similarities']}")

        all_mean_similarities.append(result['mean_similarity'])
    print()

# Overall figure: unweighted mean of the per-idea means, so an idea with one
# comparison counts as much as an idea with many.
print(len(all_mean_similarities), all_mean_similarities)
print(f"{np.mean(all_mean_similarities)=}")

-- user 2 --
 - idea 1 - 
all_paths=[['Easily accessible big groups to find people to communicate with', "Social media persona that isn't bound to specific directions"], ['Easily accessible big groups to find people to communicate with', 'Differentiate personal and professional social media presence']]
all_similarities=[0.34887001878807, 0.26483153769254975]
all_paths=[]
all_similarities=[]
 - idea 2 - 
all_paths=[]
all_similarities=[]
 - idea 3 - 
all_paths=[]
all_similarities=[]
all_paths=[['Professional-focused social media presence that can be a central source to branch off from', 'Branching out to focus on accessibility'], ['Professional-focused social media presence that can be a central source to branch off from', 'Researching about several different companies', 'Show experience and work-related hobbies']]
all_similarities=[0.4071668173774108, 0.3084838746143186, 0.3418373566249575]
-- user 3 --
 - idea 1 - 
all_paths=[]
all_similarities=[]
-- user 4 --
 - idea 1 - 
all_paths=[[

### Analysis Methodology

1. **Idea Representation**:
   We convert each subidea into a numerical representation (embedding) to capture its semantic meaning. Each subidea is treated as a point in a high-dimensional space, where semantically similar ideas are closer together.

2. **Pairwise Comparisons**:
   For each user, we compare consecutive subideas within the same branch. Cosine similarity is used to measure the closeness of these ideas, reflecting how similar or different they are in meaning.

3. **Similarity Distribution**:
   For each user and idea, we calculate a collection of similarity scores between consecutive subideas. This distribution represents the spread of similarity among the subideas, giving insights into how ideas evolve and refine.

4. **Statistical Summary**:
   - **Mean Similarity**: Represents the average similarity between consecutive subideas. A higher mean suggests that ideas are generally more cohesive and focused.
   - **Standard Deviation (std)**: Represents the variability in similarity scores. A higher std indicates more variability in how similar or different the subideas are from each other.

5. **Comparing Distributions**:
   We interpret the mean similarities and standard deviations to understand the user's approach to idea refinement. A smaller std suggests consistent refinement, while a larger std indicates diverse refinement paths.

6. **Interpretation**:
   - Higher mean similarity indicates less diversity or variation among ideas, suggesting a focused thought process.
   - Higher standard deviation indicates more variability, suggesting a wider exploration of ideas.

### Results and Discussion

#### User Analysis

**User 2**:
- **Idea 1**:
  - **Mean similarity**: 0.3070
  - **Standard deviation**: 0.0419
  - Interpretation: Moderate similarity with low variability, showing a reasonable consistency in refining the subideas.
  
- **Idea 3**:
  - **Mean similarity**: 0.3526
  - **Standard deviation**: 0.0411
  - Interpretation: Slightly higher mean similarity indicates coherent subideas with low variability, reflecting a structured refinement process.

**User 3**:
- No data available for interpretation.

**User 4**:
- **Idea 1**:
  - **Mean similarity**: 0.3840
  - **Standard deviation**: 0.1003
  - Interpretation: Higher mean similarity with notable variability suggests the user refines ideas closely but explores divergent paths at certain points.

**User 5**:
- No data available for interpretation.

**User 7**:
- **Idea 1**:
  - **Mean similarity**: 0.3474
  - **Standard deviation**: 0.1774
  - Interpretation: Moderate mean similarity with high variability, indicating a tendency to explore diverse idea refinements but with a coherent underlying theme.

- **Idea 2**:
  - **Mean similarity**: 0.3643
  - **Standard deviation**: 0.0000
  - Interpretation: Single comparison suggests consistency, but limited data for further insights.

**User 8**:
- **Idea 1**:
  - **Mean similarity**: 0.4400
  - **Standard deviation**: 0.1434
  - Interpretation: Higher mean similarity with moderate variability indicates a structured yet flexible refinement process, with coherent evolution of ideas and occasional diverse explorations.

**User 9**:
- No data available for interpretation.

**User 10**:
- No data available for interpretation.

**User 11**:
- **Idea 1**:
  - **Mean similarity**: 0.3815
  - **Standard deviation**: 0.0000
  - Interpretation: Single comparison indicates consistency in refinement but limited divergence.

- **Idea 2**:
  - **Mean similarity**: 0.3099
  - **Standard deviation**: 0.0000
  - Interpretation: Single comparison indicates consistent refinement, with no variability observed.

- **Idea 3**:
  - **Mean similarity**: 0.4758
  - **Standard deviation**: 0.0000
  - Interpretation: Single comparison, showing consistent refinement without divergence.

**User 12**:
- No data available for interpretation.

### Interpretation Summary
- **Mean Similarity**: Higher mean similarity generally reflects more focused and cohesive idea development, suggesting the user followed a more structured refinement process.
- **Standard Deviation**: Higher variability (standard deviation) indicates more diverse paths taken during the refinement process, with the user exploring different dimensions of the ideas.

### Conclusions
1. **Individual Trends**:
   - Users with higher mean similarities typically show focused thought processes, indicating coherent refinement of ideas.
   - Users with higher standard deviations show a mix of focused and diverse refinements, suggesting a flexible approach to idea iteration.

2. **Consistency and Divergence**:
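   - Single comparisons (std = 0) arise when an idea has only one pair of consecutive subideas. The zero standard deviation is then a by-product of having a single data point rather than evidence of consistent refinement, so these cases mainly indicate a linear refinement path without divergence.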

3. **Variability**:
   - Moderate to high variability (std > 0) indicates diverse exploration paths, where users iteratively refine and explore different aspects of their ideas.

### Limitations
- **Sample Size**: The number of comparisons may vary significantly among users, impacting the consistency of the results.
- **Divergence Handling**: Not all users have divergent paths, limiting the interpretation of variability for users with linear idea refinement.

Overall, the analysis provides insights into individual refinement processes, highlighting the balance between focused vs. diverse idea evolution. Future studies might consider equalizing sample sizes and exploring deeper refinement stages for more comprehensive insights.

### Scratch code

In [None]:
import cohere
import numpy as np
from scipy.spatial.distance import cosine

# Replace with your actual API key
API_KEY = 'your_api_key_here'
# Initialize Cohere client
# NOTE(review): `co` is not referenced by the code visible in this cell.
co = cohere.Client(API_KEY)

# Set AWS credentials and region (key values are blank placeholders).
# NOTE: `os` is not imported in this cell, so these lines rely on the import
# performed in the main cell above.
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

def get_embeddings(texts, batch_size=96):
    """Embed texts with the Cohere embedding model served through Bedrock.

    Texts are sent in batches rather than one request per text, which cuts
    the number of network round trips from len(texts) to
    ceil(len(texts) / batch_size). Relies on `json`, `bedrock` and
    `model_id` already being defined in the notebook session, since this
    cell does not define them itself.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Maximum number of texts sent per request. The default
            of 96 is the documented per-call limit for Cohere Embed on
            Amazon Bedrock.

    Returns:
        A list with one embedding per input text, in input order (the
        embeddings of each response are appended in the order returned).
        An empty input yields an empty list without calling the service.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so generators and other one-shot iterables can be sliced.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        body = json.dumps({
            "texts": texts[start:start + batch_size],
            "input_type": "clustering"
        })
        response = bedrock.invoke_model(
            body=body,
            modelId=model_id,
            accept="application/json",
            contentType="application/json"
        )
        response_body = json.loads(response['body'].read())
        # One embedding per text of this batch.
        embeddings.extend(response_body['embeddings'])
    return embeddings

def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's `cosine` yields a distance, so the similarity is its complement.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Define the texts for user 2's idea 3
# In the dataset these form two paths from the same root:
#   Text 1 -> Text 2   and   Text 1 -> Text 3 -> Text 4
texts = [
    "Professional-focused social media presence that can be a central source to branch off from",
    "Branching out to focus on accessibility",
    "Researching about several different companies",
    "Show experience and work-related hobbies"
]

# Get embeddings for these texts
embeddings = get_embeddings(texts)

# Print embeddings for verification
for i, (text, embedding) in enumerate(zip(texts, embeddings)):
    print(f"Text {i+1}: {text}")
    print(f"Embedding {i+1}: {embedding[:10]}...")  # Print first 10 values for brevity

# Compute similarities
# These are the consecutive pairs along the two paths above, i.e. the same
# pairs the main code scores for this idea.
similarity1 = calculate_cosine_similarity(embeddings[0], embeddings[1])
similarity2 = calculate_cosine_similarity(embeddings[0], embeddings[2])
similarity3 = calculate_cosine_similarity(embeddings[2], embeddings[3])

# Print similarities
print(f"\nSimilarity between Text 1 and Text 2: {similarity1}")
print(f"Similarity between Text 1 and Text 3: {similarity2}")
print(f"Similarity between Text 3 and Text 4: {similarity3}")
print(f"{np.mean([similarity1, similarity2, similarity3])=}")

Text 1: Professional-focused social media presence that can be a central source to branch off from
Embedding 1: [0.0625, -0.007850647, 0.014640808, -0.06298828, -0.0043945312, 0.0062026978, -0.03942871, -0.0053863525, -0.042663574, 0.042022705]...
Text 2: Branching out to focus on accessibility
Embedding 2: [-0.00073862076, -0.031341553, -0.04559326, -0.06677246, -0.02268982, 0.00014734268, -0.040924072, 0.033691406, 0.016525269, 0.038024902]...
Text 3: Researching about several different companies
Embedding 3: [-0.017791748, -0.0055007935, -0.048339844, -0.004802704, -0.023239136, -0.030181885, -0.032928467, 0.0035152435, -0.043518066, 0.020614624]...
Text 4: Show experience and work-related hobbies
Embedding 4: [0.041290283, -0.004535675, -0.04647827, -0.051940918, -0.0020198822, -0.027618408, -0.0046958923, -0.010696411, -0.035705566, 0.022872925]...

Similarity between Text 1 and Text 2: 0.40690857978381967
Similarity between Text 1 and Text 3: 0.30876363310541044
Similarity betwee

The output checks out with our main code above

In [None]:
import cohere
import numpy as np
from scipy.spatial.distance import cosine

# Replace with your actual API key
API_KEY = 'your_api_key_here'
# Initialize Cohere client
# NOTE(review): `co` is not referenced by the code visible in this cell.
co = cohere.Client(API_KEY)

# Set AWS credentials and region (key values are blank placeholders).
# NOTE: `os` is not imported in this cell, so these lines rely on the import
# performed in the main cell above.
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

def get_embeddings(texts, batch_size=96):
    """Embed texts with the Cohere embedding model served through Bedrock.

    Texts are sent in batches rather than one request per text, which cuts
    the number of network round trips from len(texts) to
    ceil(len(texts) / batch_size). Relies on `json`, `bedrock` and
    `model_id` already being defined in the notebook session, since this
    cell does not define them itself.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Maximum number of texts sent per request. The default
            of 96 is the documented per-call limit for Cohere Embed on
            Amazon Bedrock.

    Returns:
        A list with one embedding per input text, in input order (the
        embeddings of each response are appended in the order returned).
        An empty input yields an empty list without calling the service.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so generators and other one-shot iterables can be sliced.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        body = json.dumps({
            "texts": texts[start:start + batch_size],
            "input_type": "clustering"
        })
        response = bedrock.invoke_model(
            body=body,
            modelId=model_id,
            accept="application/json",
            contentType="application/json"
        )
        response_body = json.loads(response['body'].read())
        # One embedding per text of this batch.
        embeddings.extend(response_body['embeddings'])
    return embeddings

def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's `cosine` yields a distance, so the similarity is its complement.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Define the texts for user 2's idea 3
# In the dataset these form two paths from the same root:
#   Text 1 -> Text 2   and   Text 1 -> Text 3 -> Text 4
texts = [
    "Professional-focused social media presence that can be a central source to branch off from",
    "Branching out to focus on accessibility",
    "Researching about several different companies",
    "Show experience and work-related hobbies"
]

# Get embeddings for these texts
embeddings = get_embeddings(texts)

# Print embeddings for verification
for i, (text, embedding) in enumerate(zip(texts, embeddings)):
    print(f"Text {i+1}: {text}")
    print(f"Embedding {i+1}: {embedding[:10]}...")  # Print first 10 values for brevity

# Compute first and last node similarities
# i.e. the shared root (Text 1) against the last node of each path
# (Text 2 and Text 4), skipping the intermediate Text 3.
similarity1 = calculate_cosine_similarity(embeddings[0], embeddings[1])
similarity2 = calculate_cosine_similarity(embeddings[0], embeddings[3])

print(similarity1)
print(similarity2)
print(f"{np.mean([similarity1, similarity2])=}")

Text 1: Professional-focused social media presence that can be a central source to branch off from
Embedding 1: [0.062408447, -0.008201599, 0.014541626, -0.06317139, -0.004760742, 0.005874634, -0.03942871, -0.004764557, -0.04260254, 0.042053223]...
Text 2: Branching out to focus on accessibility
Embedding 2: [-0.00073862076, -0.031341553, -0.04559326, -0.06677246, -0.02268982, 0.00014734268, -0.040924072, 0.033691406, 0.016525269, 0.038024902]...
Text 3: Researching about several different companies
Embedding 3: [-0.017791748, -0.0055007935, -0.048339844, -0.004802704, -0.023239136, -0.030181885, -0.032928467, 0.0035152435, -0.043518066, 0.020614624]...
Text 4: Show experience and work-related hobbies
Embedding 4: [0.04055786, -0.004638672, -0.047058105, -0.052886963, -0.0012283325, -0.027404785, -0.0050315857, -0.010345459, -0.036346436, 0.022354126]...
0.40718770264421533
0.4019415696354778
np.mean([similarity1, similarity2])=0.40456463613984656


In [None]:
import cohere
import numpy as np
from scipy.spatial.distance import cosine

# Replace with your actual API key
API_KEY = 'your_api_key_here'
# Initialize Cohere client
# NOTE(review): `co` is not referenced by the code visible in this cell.
co = cohere.Client(API_KEY)

# Set AWS credentials and region (key values are blank placeholders).
# NOTE: `os` is not imported in this cell, so these lines rely on the import
# performed in the main cell above.
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'

def get_embeddings(texts, batch_size=96):
    """Embed texts with the Cohere embedding model served through Bedrock.

    Texts are sent in batches rather than one request per text, which cuts
    the number of network round trips from len(texts) to
    ceil(len(texts) / batch_size). Relies on `json`, `bedrock` and
    `model_id` already being defined in the notebook session, since this
    cell does not define them itself.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Maximum number of texts sent per request. The default
            of 96 is the documented per-call limit for Cohere Embed on
            Amazon Bedrock.

    Returns:
        A list with one embedding per input text, in input order (the
        embeddings of each response are appended in the order returned).
        An empty input yields an empty list without calling the service.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so generators and other one-shot iterables can be sliced.
    texts = list(texts)
    embeddings = []
    for start in range(0, len(texts), batch_size):
        body = json.dumps({
            "texts": texts[start:start + batch_size],
            "input_type": "clustering"
        })
        response = bedrock.invoke_model(
            body=body,
            modelId=model_id,
            accept="application/json",
            contentType="application/json"
        )
        response_body = json.loads(response['body'].read())
        # One embedding per text of this batch.
        embeddings.extend(response_body['embeddings'])
    return embeddings

def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    SciPy's `cosine` yields a distance, so the similarity is its complement.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Define the texts for user 4's idea 1 (copied verbatim from the dataset).
# In the dataset these form two paths that share their first two nodes:
#   Text 1 -> Text 2 -> Text 3   and   Text 1 -> Text 2 -> Text 4
texts = [
    "Simple, familiar menu items that will easily catch their eye",
    "Simple, user-friendly tech like an order status monitor",
    "App with ordering feature and rewards program",
    "Cozy, comfortable space for students. But, concerned about students staying too long and running out of tables for new customers."
]

# Get embeddings for these texts
embeddings = get_embeddings(texts)

# Print embeddings for verification
for i, (text, embedding) in enumerate(zip(texts, embeddings)):
    print(f"Text {i+1}: {text}")
    print(f"Embedding {i+1}: {embedding[:10]}...")  # Print first 10 values for brevity

# Compute first and last node similarities: the shared root (Text 1) against
# the last node of each path (Text 3 and Text 4). Text 2 is an intermediate
# node on both paths, so it is not a "last node".
similarity1 = calculate_cosine_similarity(embeddings[0], embeddings[2])
similarity2 = calculate_cosine_similarity(embeddings[0], embeddings[3])

print(similarity1)
print(similarity2)
print(f"{np.mean([similarity1, similarity2])=}")

Text 1: Simple familiar menu items that will easily catch their eye
Embedding 1: [0.0552063, 0.019454956, 0.029647827, 0.02609253, 0.02658081, -0.010726929, 0.072265625, -0.05999756, 0.035614014, 0.03503418]...
Text 2: Simple, user-friendly tech like an order status monitor
Embedding 2: [0.061309814, -0.0051574707, -0.011161804, -0.0703125, -0.00088882446, -0.019897461, -0.009391785, -0.008720398, 0.040100098, 0.047729492]...
Text 3: App with ordering feature and rewards program
Embedding 3: [0.015625, -0.0121536255, -0.011520386, -0.020370483, 0.007911682, -0.0035629272, 0.0064697266, 0.008331299, -0.020874023, 0.032806396]...
Text 4: Cozy, comfortable space for students. But, concerned about students staying too long and running out of tables for new customers.
Embedding 4: [0.06384277, 0.027694702, 0.037231445, 0.027236938, 0.0041122437, -0.016677856, 0.04849243, -0.088378906, -0.029220581, 0.030578613]...
0.3748746819730524
0.3427397501995475
np.mean([similarity1, similarity2])=0.3