In [40]:
from langchain_community.llms import Ollama
from langchain.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.schema import SystemMessage, HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import itertools

llm = Ollama(model="mistral")

# Setup

In [41]:
big5_characteristics = {
    "Openness": "creative, perceptual, curious and philosophical",
    "Conscientiousness": "organized, responsible, disciplined and prudent",
    "Neuroticism": "anxious, depressed, insecure and emotional",
    "Agreeableness": "cooperative, compasionnate, trustworthy and humble",
    "Extraversion": "friendly, positive, assertive and energetic"
}

In [42]:
role = "a helpful, expert personality methodologist who generates items for each of the Big Five Personality traits. You know how to carefully craft succinct items that meaningfully target the many aspects of a given trait."

# Génération des items

In [43]:
chat = ChatOllama(model="mistral")
responses = {}
for trait, characteristics in big5_characteristics.items():
    instructions = f"""
Generate a total of eight NEW items that assess one’s level of {trait}. Here are the
characteristics of {trait}: {characteristics}. Generate exactly TWO items per
characteristic. It is VERY important that you generate EXACTLY eight TOTAL items in
the CORRECT formatting. FOLLOW this format EXACTLY for each item: :: Do NOT
add any characteristics. Do NOT leave any out. The stem ‘I am someone who’ MUST
be explicitly written out at the beginning of each item. Separate items using \n. Do NOT
number the items. Lastly, be creative and explore novel ideas!

example1:
I am someone who is compassionate, has a soft heart.

example2:
I am someone who starts arguments with others.
"""
    # Set role and instructions
    messages = [
        SystemMessage(content=role),
        HumanMessage(content=instructions)
    ]

    response = chat.invoke(messages)
    responses[trait] = response.content.split("\n  ")
    break

In [44]:
responses

{'Openness': ['1. I am someone who often finds myself daydreaming about abstract concepts or future possibilities.\n',
  ' 2. I am someone who enjoys exploring new ideas and perspectives, even if they challenge my current beliefs.\n',
  ' 3. I am someone who appreciates artistic and cultural activities like attending concerts, visiting museums, or reading books from different genres.\n',
  ' 4. I am someone who tends to see things from multiple angles and considers various viewpoints before making decisions.\n',
  " 5. I am someone who enjoys learning new skills, whether it's a foreign language, playing an instrument, or mastering a new software.\n",
  ' 6. I am someone who is open-minded about new experiences and opportunities, frequently seeking out novel adventures.\n',
  ' 7. I am someone who tends to be imaginative and creative in problem-solving, often coming up with unique solutions that others might not consider.\n',
  ' 8. I am someone who enjoys engaging in philosophical disc

# Encodage des items

In [45]:
# setting up the embedding model
embedding_model = OllamaEmbeddings(model="mistral")

trait_embeddings = dict()

for trait, items in responses.items():
    # Get the embedding
    items_embeddings = [embedding_model.embed_query([item]) for item in items]
    trait_embeddings[trait] = items_embeddings

embeddings_list = [embedding for embedding_list in trait_embeddings.values() for embedding in embedding_list]

similarity_matrix = cosine_similarity(embeddings_list)

In [46]:
similarity_matrix

array([[1.        , 0.96187124, 0.9553633 , 0.96090042, 0.91264115,
        0.95929188, 0.95834501, 0.87563361],
       [0.96187124, 1.        , 0.97041953, 0.97634778, 0.94892492,
        0.97590125, 0.97815568, 0.90686605],
       [0.9553633 , 0.97041953, 1.        , 0.97759384, 0.92040837,
        0.98115128, 0.96030357, 0.88351191],
       [0.96090042, 0.97634778, 0.97759384, 1.        , 0.92199883,
        0.98906317, 0.9697    , 0.89266981],
       [0.91264115, 0.94892492, 0.92040837, 0.92199883, 1.        ,
        0.92077411, 0.94332399, 0.90954205],
       [0.95929188, 0.97590125, 0.98115128, 0.98906317, 0.92077411,
        1.        , 0.9636793 , 0.89842896],
       [0.95834501, 0.97815568, 0.96030357, 0.9697    , 0.94332399,
        0.9636793 , 1.        , 0.9011643 ],
       [0.87563361, 0.90686605, 0.88351191, 0.89266981, 0.90954205,
        0.89842896, 0.9011643 , 1.        ]])

# TMFG

## Finding the base tetrahedron (4 closest items)

In [47]:
dim = len(embeddings_list)

labels = ["A", "B", "C", "D", "E", "F", "G", "H"]

quaduplets = list(itertools.combinations(labels, 4))

best_quaduplet = None
best_similarity = -1

for quaduplet in quaduplets:
    i, j, k, l= [labels.index(t) for t in quaduplet]
    similarity = [
        similarity_matrix[i,j],
        similarity_matrix[i,k],
        similarity_matrix[i,l],
        similarity_matrix[j,k],
        similarity_matrix[j,l],
        similarity_matrix[k,l]
    ]

    min_similarity = min(similarity)

    if min_similarity > best_similarity:
        best_similarity = min_similarity
        best_quaduplet = (i, j, k, l)

In [93]:
Clique1 = best_quaduplet
Triangles = list(itertools.combinations(best_quaduplet, 3))
Vertices = set(range(dim)).difference(set(best_quaduplet))
Separators = list()
cliques = [Clique1]
P = similarity_matrix[np.ix_(best_quaduplet, best_quaduplet)]

In [94]:
Maxgain = {triangle:None for triangle in Triangles}


In [95]:
Maxgain = list()
Bestvertices = list()
for triangle in Triangles:
    best_vertex = np.argmax(np.sum(similarity_matrix[np.ix_(triangle, list(Vertices))], axis=0))
    Bestvertices.append(best_vertex)
    Maxgain.append(np.sum(similarity_matrix[np.ix_(triangle, list(Vertices))], axis=0)[best_vertex])

In [96]:
while Vertices != set():
    maxgain_index = np.argmax(Maxgain)
    best_vertex = Bestvertices.pop(maxgain_index)
    triangle = Triangles.pop(maxgain_index)
    break
    

In [97]:
best_vertex

np.int64(2)

In [98]:
best_vertices = [vertex for vertex in triangle] + [int(best_vertex)]
best_triangles = list(itertools.combinations(best_vertices,3))
best_triangles.remove(triangle)
best_triangles
Triangles.append(best_triangles)


In [99]:
Separators.append(triangle)

In [100]:
Separators

[(1, 3, 5)]