In [8]:
from gensim.models import word2vec
import numpy as np

In [5]:
# Trait profile of the driver we want to match against the teams.
hassan = [
    "speed", "aggression",
    "adaptability", "technical_skill", "teamwork",
    "risk_taking", "consistency",
]

# Trait profiles of the candidate teams (same length and ordering as the driver's).
redbull = [
    "speed", "aggression",
    "adaptability", "technical_skill", "teamwork",
    "risk_taking", "inconsistency",
]
ferrari = [
    "passion", "emotion",
    "adaptability", "technical_skill", "teamwork",
    "risk_taking", "inconsistency",
]
mercedes = [
    "precision", "discipline",
    "adaptability", "technical_skill", "teamwork",
    "control", "consistency",
]

# Lookup table used by the later cells; keys are the team identifiers.
teams = dict(Red_Bull=redbull, Ferrari=ferrari, Mercedes=mercedes)

# Echo every profile, with labels padded so the lists line up.
for label, traits in (
    ("Hassan:   ", hassan),
    ("Red Bull: ", teams['Red_Bull']),
    ("Ferrari:  ", teams['Ferrari']),
    ("Mercedes: ", teams['Mercedes']),
):
    print(f"{label}{traits}")


Hassan:   ['speed', 'aggression', 'adaptability', 'technical_skill', 'teamwork', 'risk_taking', 'consistency']
Red Bull:  ['speed', 'aggression', 'adaptability', 'technical_skill', 'teamwork', 'risk_taking', 'inconsistency']
Ferrari:  ['passion', 'emotion', 'adaptability', 'technical_skill', 'teamwork', 'risk_taking', 'inconsistency']
Mercedes: ['precision', 'discipline', 'adaptability', 'technical_skill', 'teamwork', 'control', 'consistency']


In [29]:
# Small hand-written corpus for the model to learn from.
# Each trait maps to two groups of related words; every group becomes one
# training sentence that starts with the trait itself.
related_words = {
    "speed": (
        ["fast", "quick", "racing", "velocity", "rapid"],
        ["acceleration", "swift", "performance"],
    ),
    "aggression": (
        ["aggressive", "bold", "fierce", "attacking"],
        ["intensity", "forceful", "dynamic"],
    ),
    "adaptability": (
        ["flexible", "adjustable", "versatile"],
        ["responsive", "adaptive", "changeable"],
    ),
    "technical_skill": (
        ["expertise", "precision", "engineering"],
        ["knowledge", "competence", "proficiency"],
    ),
    "teamwork": (
        ["collaboration", "cooperation", "unity"],
        ["collective", "partnership", "together"],
    ),
    "risk_taking": (
        ["bold", "daring", "brave", "adventurous"],
        ["courage", "fearless", "gambling"],
    ),
    "consistency": (
        ["reliable", "steady", "dependable", "stable"],
        ["regular", "uniform", "predictable"],
    ),
    # Opposite of consistency
    "inconsistency": (
        ["unpredictable", "erratic", "variable"],
        ["irregular", "unreliable", "unstable"],
    ),
    "passion": (
        ["emotion", "love", "enthusiasm", "dedication"],
        ["fervor", "intensity", "commitment"],
    ),
    "emotion": (
        ["feeling", "passionate", "expressive", "heart"],
        ["sentiment", "emotional", "intense"],
    ),
    "precision": (
        ["accuracy", "exact", "meticulous", "careful"],
        ["detailed", "systematic", "methodical"],
    ),
    "discipline": (
        ["control", "order", "systematic", "structured"],
        ["focus", "dedication", "commitment"],
    ),
    "control": (
        ["management", "regulation", "command", "mastery"],
        ["restraint", "discipline", "governance"],
    ),
}

# Flatten the mapping into sentences, keeping the order in which it was written.
training = [
    [trait, *context]
    for trait, contexts in related_words.items()
    for context in contexts
]

print(f"Created {len(training)} training sentences")

Created 26 training sentences


In [30]:

# Train a small Word2Vec model on the hand-written corpus.
# The notebook imports the *module* `word2vec`, so the class has to be reached
# through it; a bare `Word2Vec` is undefined on a fresh kernel.
model = word2vec.Word2Vec(
    sentences=training,
    vector_size=100,
    window=5,
    min_count=1,  # the corpus is tiny, so no word may be dropped for being rare
    workers=1,
    epochs=100,
)

print(f"Model vocab studied: {len(model.wv.key_to_index)} words")
print(f"Vector dimension: {model.vector_size}")

# Sanity check: similarity of a few word pairs we expect to be related.
test_pairs = [
    ("speed", "fast"),
    ("consistency", "reliable"),
    ("teamwork", "collaboration"),
    ("consistency", "inconsistency"),
    ("passion", "emotion"),
]

for w1, w2 in test_pairs:
    if w1 in model.wv and w2 in model.wv:
        sim = model.wv.similarity(w1, w2)
        print(f"Similarity between the {w1} and {w2}: {sim:.4f}")
    else:
        # Report the pair instead of skipping it silently.
        print(f"Skipping ({w1}, {w2}): not in vocabulary")


Model vocab studied: 92 words
Vector dimension: 100
Similarity between the speed and fast: -0.0085
Similarity between the consistency and reliable: 0.1176
Similarity between the teamwork and collaboration: 0.2122
Similarity between the consistency and inconsistency: 0.2947
Similarity between the passion and emotion: 0.2184


In [36]:
def get_embedding(traits, wv=None):
    """Average the word vectors of ``traits`` into one profile vector.

    Parameters
    ----------
    traits : iterable of str
        Words to look up.
    wv : optional
        Object supporting ``word in wv``, ``wv[word]`` and ``wv.vector_size``.
        Defaults to the vectors of the notebook-level ``model`` (``model.wv``).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the looked-up vectors. A word missing from the
        vocabulary is reported and contributes a zero vector to the mean.
        An empty ``traits`` yields a zero vector of length ``vector_size``.
    """
    vectors = model.wv if wv is None else wv
    dim = vectors.vector_size

    embeddings = []
    for trait in traits:
        if trait in vectors:
            embeddings.append(vectors[trait])
        else:
            print(f"{trait} not found in vocabulary")
            embeddings.append(np.zeros(dim))

    # np.mean([]) would warn and give a scalar NaN; return a vector instead.
    if not embeddings:
        return np.zeros(dim)

    profile_vector = np.mean(embeddings, axis=0)
    return profile_vector

# Turn each trait list into a single averaged embedding vector.
hassan_embedding = get_embedding(hassan)
redbull_embedding, ferrari_embedding, mercedes_embedding = (
    get_embedding(teams[team_key])
    for team_key in ('Red_Bull', 'Ferrari', 'Mercedes')
)

# Every profile should have the same shape.
for label, vector in (
    ("Hassan", hassan_embedding),
    ("Red Bull", redbull_embedding),
    ("Ferrari", ferrari_embedding),
    ("Mercedes", mercedes_embedding),
):
    print(f"{label} embedding shape: {vector.shape}")

# Peek at the first few components of the driver's vector.
print(f"Sample from hassan's embedded vector: {hassan_embedding[:5]}")

Hassan embedding shape: (100,)
Red Bull embedding shape: (100,)
Ferrari embedding shape: (100,)
Mercedes embedding shape: (100,)
Sample from hassan's embedded vector: [ 0.00204716 -0.00146923  0.0009835  -0.00185944  0.00180215]


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

def _similarity_to_hassan(team_embedding):
    """Cosine similarity between Hassan's profile vector and one team's vector."""
    return cosine_similarity([hassan_embedding], [team_embedding])[0][0]


hassan_vs_redbull = _similarity_to_hassan(redbull_embedding)
hassan_vs_ferrari = _similarity_to_hassan(ferrari_embedding)
hassan_vs_mercedes = _similarity_to_hassan(mercedes_embedding)

# Scores keyed by team identifier, in the same order as `teams`.
embedding_results = dict(
    Red_Bull=hassan_vs_redbull,
    Ferrari=hassan_vs_ferrari,
    Mercedes=hassan_vs_mercedes,
)

separator = "-" * 40

print("SEMANTIC SIMILARITY RESULTS:")
print(separator)
for team in embedding_results:
    print(f"Hassan and {team} similarity: {embedding_results[team]:.4f}")

# Pick the team with the highest score (the first one wins a tie).
best_team_semantic, best_score_semantic = max(
    embedding_results.items(), key=lambda entry: entry[1]
)

print(separator)
print(f"Best semantic match: {best_team_semantic}")
print(f"Similarity score: {best_score_semantic:.4f}")



SEMANTIC SIMILARITY RESULTS:
----------------------------------------
Hassan and Red_Bull similarity: 0.9166
Hassan and Ferrari similarity: 0.5944
Hassan and Mercedes similarity: 0.7291
----------------------------------------
Best semantic match: Red_Bull
Similarity score: 0.9166


In [39]:
def _display_name(team):
    """Turn a team key such as 'Red_Bull' into the printable label 'Red Bull'."""
    return team.replace("_", " ")


def _print_method_winners(results_by_method):
    """Print the top-scoring team of every method and return {method: team}."""
    winners = {}
    for method, results in results_by_method.items():
        winner = max(results, key=results.get)
        winners[method] = winner
        print(f"{method:<20}: {winner} ({results[winner]:.4f})")
    return winners


print("\n🔄 STEP 6: COMPARING NUMERICAL VS SEMANTIC APPROACHES")
print("="*70)

# Let's compare results from both approaches
print("COMPARISON: NUMERICAL vs SEMANTIC RESULTS")
print("="*70)

# Scores typed in by hand; the cells that produced them are not part of this
# section. NOTE(review): presumably from an earlier numerical step - confirm.
numerical_results = {
    'Cosine Similarity': {'Red_Bull': 0.9918, 'Ferrari': 0.9973, 'Mercedes': 0.9564},
    'Euclidean Similarity': {'Red_Bull': 0.2743, 'Ferrari': 0.3333, 'Mercedes': 0.1412},
    'Manhattan Similarity': {'Red_Bull': 0.1250, 'Ferrari': 0.2000, 'Mercedes': 0.0625}
}

# Current semantic results
semantic_results = {
    'Word2Vec Embeddings': embedding_results
}

print("NUMERICAL APPROACH RESULTS:")
print("-" * 50)
numerical_winners = _print_method_winners(numerical_results)

print("\nSEMANTIC APPROACH RESULTS:")
print("-" * 50)
semantic_winners = _print_method_winners(semantic_results)

# The winner summary is derived from the dictionaries above so that it cannot
# drift from the scores when the model is retrained.
print("\nWINNER COMPARISON:")
print("-" * 50)
distinct_numerical_winners = set(numerical_winners.values())
if len(distinct_numerical_winners) == 1:
    numerical_winner = next(iter(distinct_numerical_winners))
    print(
        f"Numerical Methods → {_display_name(numerical_winner)} "
        f"(all {len(numerical_results)} methods agreed)"
    )
else:
    print("Numerical Methods → no single winner:")
    for method, winner in numerical_winners.items():
        print(f"   - {method}: {_display_name(winner)}")
semantic_winner = semantic_winners['Word2Vec Embeddings']
print(f"Semantic Method   → {_display_name(semantic_winner)} (word embeddings)")

# The explanation below is the author's interpretation of the recorded run;
# revisit the wording if the winners printed above change.
print("\nWHY THE DIFFERENCE?")
print("-" * 50)
print("\n1.NUMERICAL: Compares arbitrary numbers")
print("   - Ferrari had closest numerical pattern")
print("   - No real meaning behind the numbers")

print("\n2.SEMANTIC: Compares actual word meanings")
print("   -  Red Bull shares more traits with Hassan:")
print("   -  Hassan:   speed, aggression, consistency")
print("   -  Red Bull: speed, aggression, inconsistency")
print("   -  Ferrari: passion, emotion (very different from Hassan)")
print("   -  Mercedes: precision, discipline (more systematic)")

# Per-team commentary; the numeric score itself is read from embedding_results.
score_notes = {
    'Red_Bull': "Very High - shares speed, aggression",
    'Mercedes': "Moderate - some overlap",
    'Ferrari': "Lower - passion/emotion vs speed/aggression",
}

print("\nSEMANTIC SIMILARITY SCORES ANALYSIS:")
ranked_scores = sorted(embedding_results.items(), key=lambda item: item[1], reverse=True)
for team, score in ranked_scores:
    note = score_notes.get(team)
    suffix = f" ({note})" if note else ""
    print(f"Hassan ↔ {_display_name(team)}: {score:.4f}{suffix}")


🔄 STEP 6: COMPARING NUMERICAL VS SEMANTIC APPROACHES
COMPARISON: NUMERICAL vs SEMANTIC RESULTS
NUMERICAL APPROACH RESULTS:
--------------------------------------------------
Cosine Similarity   : Ferrari (0.9973)
Euclidean Similarity: Ferrari (0.3333)
Manhattan Similarity: Ferrari (0.2000)

SEMANTIC APPROACH RESULTS:
--------------------------------------------------
Word2Vec Embeddings : Red_Bull (0.9166)

WINNER COMPARISON:
--------------------------------------------------
Numerical Methods → Ferrari (all 3 methods agreed)
Semantic Method   → Red Bull (word embeddings)

WHY THE DIFFERENCE?
--------------------------------------------------

1.NUMERICAL: Compares arbitrary numbers
   - Ferrari had closest numerical pattern
   - No real meaning behind the numbers

2.SEMANTIC: Compares actual word meanings
   -  Red Bull shares more traits with Hassan:
   -  Hassan:   speed, aggression, consistency
   -  Red Bull: speed, aggression, inconsistency
   -  Ferrari: passion, emotion (very 

We can see that the numerical approach only works on number patterns, so it is quite common for its result to differ from the embedding-based similarity measure, which tracks similarities between the words' vectors. Maybe if we used a pre-trained model, instead of the one we trained on a small hand-written dataset, we would get different results and better similarities between words.