# Embedding Models: Steam Game Descriptions

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util, evaluation
from pyvi.ViTokenizer import tokenize

import random
RANDOM_STATE = 42
random.seed(RANDOM_STATE)

  from tqdm.autonotebook import tqdm, trange


In [2]:
model = SentenceTransformer("all-mpnet-base-v2")



## Loading Data

In [3]:
game_df = pd.read_csv("steam_games.csv")
game_df.head(5)

Unnamed: 0,Title,Original Price,Discounted Price,Release Date,Link,Game Description,Recent Reviews Summary,All Reviews Summary,Recent Reviews Number,All Reviews Number,Developer,Publisher,Supported Languages,Popular Tags,Game Features,Minimum Requirements
0,Baldur's Gate 3,$29.99,$29.99,"3 Aug, 2023",https://store.steampowered.com/app/1086940/Bal...,"Baldur’s Gate 3 is a story-rich, party-based R...",Overwhelmingly Positive,Very Positive,"- 96% of the 128,900 user reviews in the last ...","- 94% of the 188,617 user reviews for this gam...",Larian Studios,Larian Studios,"['English', 'French', 'German', 'Spanish - Spa...","['RPG', 'Choices Matter', 'Character Customiza...","['Single-player', 'Online Co-op', 'LAN Co-op',...",Requires a 64-bit processor and operating syst...
1,Counter-Strike: Global Offensive,$14.99,$14.99,"21 Aug, 2012",https://store.steampowered.com/app/730/Counter...,Counter-Strike: Global Offensive (CS: GO) expa...,Very Positive,Very Positive,"- 89% of the 75,284 user reviews in the last 3...","- 88% of the 7,428,921 user reviews for this g...","Valve, Hidden Path Entertainment",Valve,"['English', 'Czech', 'Danish', 'Dutch', 'Finni...","['FPS', 'Shooter', 'Multiplayer', 'Competitive...","['Steam Achievements', 'Full controller suppor...",OS: | Windows® 7/Vista/XP | Processor: | Int...
2,Apex Legends™,Free,Free,"4 Nov, 2020",https://store.steampowered.com/app/1172470/Ape...,"Apex Legends is the award-winning, free-to-pla...",Mixed,Very Positive,"- 65% of the 18,581 user reviews in the last 3...","- 80% of the 701,597 user reviews for this gam...",Respawn Entertainment,Electronic Arts,"['English', 'French', 'Italian', 'German', 'Sp...","['Free to Play', 'Multiplayer', 'Battle Royale...","['Online PvP', 'Online Co-op', 'Steam Achievem...",Requires a 64-bit processor and operating syst...
3,Forza Horizon 5,$34.78,$17.39,"8 Nov, 2021",https://store.steampowered.com/app/1551360/For...,Your Ultimate Horizon Adventure awaits! Explor...,Very Positive,Very Positive,"- 87% of the 4,120 user reviews in the last 30...","- 88% of the 123,162 user reviews for this gam...",Playground Games,Xbox Game Studios,"['English', 'French', 'Italian', 'German', 'Sp...","['Racing', 'Open World', 'Driving', 'Multiplay...","['Single-player', 'Online PvP', 'Online Co-op'...",Requires a 64-bit processor and operating syst...
4,Call of Duty®,Free,Free,"27 Oct, 2022",https://store.steampowered.com/app/1938090/Cal...,"Welcome to Call of Duty® HQ, the home of Call ...",Mixed,Mixed,"- 49% of the 8,257 user reviews in the last 30...","- 57% of the 236,876 user reviews for this gam...","Infinity Ward, Raven Software, Beenox, Treyarc...",Activision,"['English', 'French', 'Italian', 'German', 'Sp...","['FPS', 'Multiplayer', 'Shooter', 'Action', 'S...","['Single-player', 'Online PvP', 'Online Co-op'...",Requires a 64-bit processor and operating syst...


In [4]:
game_df.drop_duplicates("Title", inplace=True)
game_df["Game Description"] = game_df["Game Description"].astype(str)

sample_descriptions = game_df[["Title", "Game Description", "Popular Tags"]].head(100).reset_index()

In [5]:
def deacronym(tags):
    new_tags = []
    for tag in tags:
        match tag.split():
            case ["MMORPG"]:
                new_tag = "Massively Multiplayer Online Role-Playing Game"
            case [s1, "RPG", s2]:
                new_tag = s1.strip() + " Role-Playing Game " + s2.strip()
            case ["RPG"]:
                new_tag = "Role-Playing Game"
            case ["FPS"]:
                new_tag = "First Person Shooter"
            case ["Sci-fi"]:
                new_tag = "Science Fiction"
            case ["PvP"]:
                new_tag = "Player versus Player"
            case ["PvE"]:
                new_tag = "Player versus Entity"
            case ["MOBA"]:
                new_tag = "Multiplayer Online Battle Arena Video Games"
            case _:
                new_tag = tag
        new_tags.append(new_tag)
    return new_tags

In [6]:
from ast import literal_eval

# Combines descriptions and tags for each game
for idx, (tag, desc) in enumerate(zip(sample_descriptions["Popular Tags"], sample_descriptions["Game Description"])):
    tag = deacronym(literal_eval(tag))
    if "Free to Play" in tag:
        tag.remove("Free to Play")
    if "Early Access" in tag:
        tag.remove("Early Access")
    if "Online Co-Op" in tag:
        tag.remove("Online Co-Op")
    if "Co-op" in tag:
        tag.remove("Co-op")

    tag = ", ".join(tag[:7])

    sample_descriptions.loc[idx, "Popular Tags"] = tag
    sample_descriptions.loc[idx, "Description and Tags"] = desc + " " + tag

In [7]:
# Ideas:
# ngrams
# train better lmao

pd.set_option('display.max_colwidth', None)
sample_descriptions

Unnamed: 0,index,Title,Game Description,Popular Tags,Description and Tags
0,0,Baldur's Gate 3,"Baldur’s Gate 3 is a story-rich, party-based RPG set in the universe of Dungeons & Dragons, where your choices shape a tale of fellowship and betrayal, survival and sacrifice, and the lure of absolute power.","Role-Playing Game, Choices Matter, Character Customization, Story Rich, Adventure, CRPG, Multiplayer","Baldur’s Gate 3 is a story-rich, party-based RPG set in the universe of Dungeons & Dragons, where your choices shape a tale of fellowship and betrayal, survival and sacrifice, and the lure of absolute power. Role-Playing Game, Choices Matter, Character Customization, Story Rich, Adventure, CRPG, Multiplayer"
1,1,Counter-Strike: Global Offensive,"Counter-Strike: Global Offensive (CS: GO) expands upon the team-based action gameplay that it pioneered when it was launched 19 years ago. CS: GO features new maps, characters, weapons, and game modes, and delivers updated versions of the classic CS content (de_dust2, etc.).","First Person Shooter, Shooter, Multiplayer, Competitive, Action, Team-Based, eSports","Counter-Strike: Global Offensive (CS: GO) expands upon the team-based action gameplay that it pioneered when it was launched 19 years ago. CS: GO features new maps, characters, weapons, and game modes, and delivers updated versions of the classic CS content (de_dust2, etc.). First Person Shooter, Shooter, Multiplayer, Competitive, Action, Team-Based, eSports"
2,2,Apex Legends™,"Apex Legends is the award-winning, free-to-play Hero Shooter from Respawn Entertainment. Master an ever-growing roster of legendary characters with powerful abilities, and experience strategic squad play and innovative gameplay in the next evolution of Hero Shooter and Battle Royale.","Multiplayer, Battle Royale, Shooter, First Person Shooter, First-Person, Player versus Player, Action","Apex Legends is the award-winning, free-to-play Hero Shooter from Respawn Entertainment. Master an ever-growing roster of legendary characters with powerful abilities, and experience strategic squad play and innovative gameplay in the next evolution of Hero Shooter and Battle Royale. Multiplayer, Battle Royale, Shooter, First Person Shooter, First-Person, Player versus Player, Action"
3,3,Forza Horizon 5,"Your Ultimate Horizon Adventure awaits! Explore the vibrant open world landscapes of Mexico with limitless, fun driving action in the world’s greatest cars. Conquer the rugged Sierra Nueva in the ultimate Horizon Rally experience. Requires Forza Horizon 5 game, expansion sold separately.","Racing, Open World, Driving, Multiplayer, Automobile Sim, Realistic, Adventure","Your Ultimate Horizon Adventure awaits! Explore the vibrant open world landscapes of Mexico with limitless, fun driving action in the world’s greatest cars. Conquer the rugged Sierra Nueva in the ultimate Horizon Rally experience. Requires Forza Horizon 5 game, expansion sold separately. Racing, Open World, Driving, Multiplayer, Automobile Sim, Realistic, Adventure"
4,4,Call of Duty®,"Welcome to Call of Duty® HQ, the home of Call of Duty®: Modern Warfare® III, Call of Duty®: Modern Warfare® II and Warzone™.","First Person Shooter, Multiplayer, Shooter, Action, Singleplayer, Military, First-Person","Welcome to Call of Duty® HQ, the home of Call of Duty®: Modern Warfare® III, Call of Duty®: Modern Warfare® II and Warzone™. First Person Shooter, Multiplayer, Shooter, Action, Singleplayer, Military, First-Person"
...,...,...,...,...,...
95,95,Valheim,"A brutal exploration and survival game for 1-10 players, set in a procedurally-generated purgatory inspired by viking culture. Battle, build, and conquer your way to a saga worthy of Odin’s patronage!","Open World Survival Craft, Survival, Open World, Multiplayer, Building, Crafting, Exploration","A brutal exploration and survival game for 1-10 players, set in a procedurally-generated purgatory inspired by viking culture. Battle, build, and conquer your way to a saga worthy of Odin’s patronage! Open World Survival Craft, Survival, Open World, Multiplayer, Building, Crafting, Exploration"
96,96,Guild Wars 2,"Guild Wars 2 is an award-winning online roleplaying game with fast-paced action combat, deep character customization, and no subscription fee required. Choose from an arsenal of professions and weapons, explore a vast open world, compete in PVP modes and more. Join over 16 million players now!","Massively Multiplayer Online Role-Playing Game, Adventure, Role-Playing Game, Fantasy, Character Customization, Third Person, 3D","Guild Wars 2 is an award-winning online roleplaying game with fast-paced action combat, deep character customization, and no subscription fee required. Choose from an arsenal of professions and weapons, explore a vast open world, compete in PVP modes and more. Join over 16 million players now! Massively Multiplayer Online Role-Playing Game, Adventure, Role-Playing Game, Fantasy, Character Customization, Third Person, 3D"
97,97,Dying Light,"First-person action survival game set in a post-apocalyptic open world overrun by flesh-hungry zombies. Roam a city devastated by a mysterious virus epidemic. Scavenge for supplies, craft weapons, and face hordes of the infected.","Zombies, Survival Horror, Horror, Open World, Parkour, First-Person, Survival","First-person action survival game set in a post-apocalyptic open world overrun by flesh-hungry zombies. Roam a city devastated by a mysterious virus epidemic. Scavenge for supplies, craft weapons, and face hordes of the infected. Zombies, Survival Horror, Horror, Open World, Parkour, First-Person, Survival"
98,98,BlazBlue Entropy Effect,Experience the brilliant action combat gameplay! The pinnacle of roguelite action games!,"Action Roguelike, Action-Adventure, Platformer, Metroidvania, Pixel Graphics, Replay Value, Procedural Generation","Experience the brilliant action combat gameplay! The pinnacle of roguelite action games! Action Roguelike, Action-Adventure, Platformer, Metroidvania, Pixel Graphics, Replay Value, Procedural Generation"


In [8]:
sample_descriptions["tokenize_desc"] = sample_descriptions["Game Description"].apply(tokenize)

sample_descriptions["tokenize_tag"] = sample_descriptions["Popular Tags"].apply(tokenize)

sample_descriptions["tokenize_desc_tag"] = sample_descriptions["Description and Tags"].apply(tokenize)

## Visualization

Checking cosine similarity of tags.

In [9]:
# for i in range(10):
#     s1, t1 = sample_descriptions.loc[i, ["Title", "tokenize_tag"]]
#     s2, t2 = sample_descriptions.loc[i + 1, ["Title", "tokenize_tag"]]
#     print(s1 + " | " + s2 + " | " + str(int(util.cos_sim(model.encode(t1), model.encode(t2)))))
#     print()

In [10]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import plotly.express as px

In [11]:
def clustering(embeddings, num_clusters):
    model = KMeans(n_clusters=num_clusters, random_state=RANDOM_STATE)
    model.fit(embeddings)
    return model.labels_

def visualization(descriptions, embeddings, n_cluster=6):
    X = np.array(embeddings)
    pca = PCA(n_components=3, random_state=RANDOM_STATE)
    result = pca.fit_transform(X)
    df = pd.DataFrame({
    'sent': descriptions["Title"],
    'cluster': clustering(embeddings, n_cluster).astype(str),
    'x': result[:, 0],
    'y': result[:, 1],
    'z': result[:, 2]
    })
    fig = px.scatter_3d(df, x='x', y='y', z='z',
              color='cluster', hover_name='sent',
              range_x = [df.x.min()-0.5, df.x.max()+0.5],
              range_y = [df.y.min()-0.5, df.y.max()+0.5],
              range_z = [df.z.min()-0.5, df.z.max()+0.5])
    fig.update_layout(
    width=1280,   # Set the desired width
    height=720   # Set the desired height
    )
    fig.update_traces(hovertemplate= '<b>%{hovertext}</b>')
    fig.show()

def visualize(model, df, col, n_cluster=6):
    embeddings = model.encode(df[col].to_numpy())
    visualization(df, embeddings, n_cluster)

### Visualizing Descriptions

In [12]:
visualize(model, sample_descriptions, "tokenize_desc", 8)

  incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
  super()._check_params_vs_input(X, default_n_init=10)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Visualizing Tags

In [13]:
visualize(model, sample_descriptions, "tokenize_tag", 8)





### Visualizing Combination: Description + Tags

In [14]:
visualize(model, sample_descriptions, "tokenize_desc_tag", 7)





## Fine Tuning

In [15]:
import datasets
import torch
from sentence_transformers import(
    losses,
    evaluation,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments, 
)
from tqdm import tqdm

Create the training data

In [16]:
train_df = game_df.sample(1000, random_state=RANDOM_STATE).reset_index()

for idx, (tag, desc) in enumerate(zip(train_df["Popular Tags"], train_df["Game Description"])):
    tag = deacronym(literal_eval(tag))
    if "Free to Play" in tag:
        tag.remove("Free to Play")
    if "Early Access" in tag:
        tag.remove("Early Access")
    if "Online Co-Op" in tag:
        tag.remove("Online Co-Op")
    if "Co-Op" in tag:
        tag.remove("Co-Op")

    tag = ", ".join(tag[:7])

    train_df.loc[idx, "Popular Tags"] = tag
    train_df.loc[idx, "Description and Tags"] = tag + ". " + desc + " " + tag

train_df["tokenize_tag"] = train_df["Description and Tags"].apply(tokenize)
train_df["enc_tag"] = train_df["tokenize_tag"].apply(model.encode)

In [17]:
# Uses Contrastive Loss
# train_examples = []
# for i in tqdm(range(len(train_df)), desc="Creating Training Examples"):
#         for j in random.sample(range(len(train_df)), 50):
#         # Make an input with the similarity of their tags as the label
#                 example = InputExample(
#                         texts=[train_df.loc[i, "Game Description"], train_df.loc[j, "Game Description"]], 
#                         label=int(util.cos_sim(train_df.loc[i, "enc_tag"], train_df.loc[j, "enc_tag"])> 0.75))
#                 train_examples.append(example)

train_cols = ["desc1", "desc2", "score"]
train_examples = []
for i in tqdm(range(len(train_df)), desc="Creating Training Examples"):
        for j in random.sample(range(len(train_df)), 50):
        # Make an input with the similarity of their tags as the label
                example = [train_df.loc[i, "Game Description"], train_df.loc[j, "Game Description"], util.cos_sim(train_df.loc[i, "enc_tag"], train_df.loc[j, "enc_tag"]).item()]
                train_examples.append(example)

train_examples = pd.DataFrame(train_examples, columns=train_cols)
train_dataset =  datasets.Dataset.from_pandas(train_examples)

Creating Training Examples: 100%|██████████| 1000/1000 [00:02<00:00, 465.90it/s]


Creates arguments for training/tuning the model and specifies the loss function.

In [18]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/mpnet-base",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    run_name="mpnet-base",
)

train_loss = losses.CoSENTLoss(model)

Creates an evaluator for the training/tuning.

In [19]:
sentences1 = [
    "Super Mario Galaxy 2, the sequel to the galaxy-hopping original game, includes the gravity-defying, physics-based exploration from the first game, but is loaded with entirely new galaxies and features to challenge players. On some stages, Mario can pair up with his dinosaur buddy Yoshi and use his tongue to grab items and spit them back at enemies. Players can also have fun with new items such as a drill that lets our hero tunnel through solid rock.",
    "Going beyond 'run and gun corridors,' 'monster-closet AIs' and static worlds, BioShock creates a living, unique and unpredictable FPS experience. After your plane crashes into icy uncharted waters, you discover a rusted bathysphere and descend into Rapture, a city hidden beneath the sea. Constructed as an idealistic society for a hand picked group of scientists, artists and industrialists, the idealism is no more. Now the city is littered with corpses, wildly powerful guardians roam the corridors as little girls loot the dead, and genetically mutated citizens ambush you at every turn. Take control of your world by hacking mechanical devices, commandeering security turrets and crafting unique items critical to your very survival. Upgrade your weapons with ionic gels, explosives and toxins to customize them to the enemy and environment. Genetically modify your body through dozens of Plasmid Stations scattered throughout the city, empowering you with fantastic and often grotesque abilities. Explore a living world powered by Ecological A.I., where the inhabitants have interesting and consequential relationships with one another that impact your gameplay experience. Experience truly next generation graphics that vividly illustrate the forlorn art deco city, highlighted by the most detailed and realistic water effects ever developed in a video game. Make meaningful choices and mature decisions, ultimately culminating in the grand question: do you exploit the innocent survivors of Rapture...or save them?",
    "Forget everything you know about The Legend of Zelda games. Step into a world of discovery, exploration and adventure in The Legend of Zelda: Breath of the Wild, a boundary-breaking new game in the acclaimed series. Travel across fields, through forests and to mountain peaks as you discover what has become of the ruined kingdom of Hyrule in this open-air adventure. Explore the wilds of Hyrule any way you like - Climb up towers and mountain peaks in search of new destinations, then set your own path to get there and plunge into the wilderness. Along the way, you'll battle towering enemies, hunt wild beasts and gather ingredients for the food and elixirs you'll need to sustain you on your journey. More than 100 Shrines of Trials to discover and explore - Shrines dot the landscape, waiting to be discovered in any order you want. Search for them in various ways, and solve a variety of puzzles inside. Work your way through the traps and devices inside to earn special items and other rewards that will help you on your adventure.",
    "Developed by Rockstar San Diego, as a follow up to the 2004 hit game Red Dead Revolver, Red Dead Redemption is a Western epic, set at the turn of the 20th century when the lawless and chaotic badlands began to give way to the expanding reach of government and the spread of the Industrial Age. The story of former outlaw, John Marston, Red Dead Redemption takes players on a great adventure across the American frontier. Utilizing Rockstar's proprietary Rockstar Advanced Game Engine (RAGE), Red Dead Redemption features an open-world environment for players to explore, including frontier towns, rolling prairies teeming with wildlife, and perilous mountain passes - each packed with an endless flow of varied distractions. Along the way, players experience the heat of gunfights and battles, meet a host of unique characters, struggle against the harshness of one of the world’s last remaining wildernesses, and ultimately pick their own precarious path through an epic story about the death of the Wild West and the gunslingers that inhabited it.",
    "Take Your Game Online: Access an online community where you can meet, chat, and play head-to-head against other gamers. Audio Dream Team: Dynamic play-by-play and analysis by Madden and Michaels. All-New Mini-camp: Hop on John Madden's Cruiser Tour Bus and travel to all NFL cities to complete skilled tasks. Create-A-Playbook: Customize receiver routes, player formations, and your team's entire playbook. Deepest Franchise Mode Ever: Play 30 years of Franchise mode and draft players each season with tips from your scouts.",
    "The Covenant alien race threatens to destroy all humankind, and the only thing standing in its way is Master Chief, a genetically enhanced supersoldier. Master Chief returns in Halo 2, which features new vehicles, weapons, environments, and more. This time, you can interact with your environment, wield two weapons at the same time, board opponents' vehicles, and even switch sides to play the role of a Covenant Elite. Halo 2 also supports broadband multiplayer action via Xbox Live.",
    "First Person Shooter", "Building", "Shooter", "Horror", "First Person",
    "Shooter", "Zombie", "Farming", "Blood", "Horror", "Card",
    "Survival", "Horror", "Life Sim", "Puzzle", "Fighting", "Magic"
]
sentences2 = [
    "The ultimate Nintendo hero is taking the ultimate step ... out into space. Join Mario as he ushers in a new era of video games, defying gravity across all the planets in the galaxy. When some creature escapes into space with Princess Peach, Mario gives chase, exploring bizarre planets all across the galaxy. Mario, Peach and enemies new and old are here. Players run, jump and battle enemies as they explore all the planets in the galaxy. Since this game makes full use of all the features of the Wii Remote, players have to do all kinds of things to succeed: pressing buttons, swinging the Wii Remote and the Nunchuk, and even pointing at and dragging things with the pointer. Since he's in space, Mario can perform mind-bending jumps unlike anything he's done before. He'll also have a wealth of new moves that are all based around tilting, pointing and shaking the Wii Remote. Shake, tilt and point! Mario takes advantage of all the unique aspects of the Wii Remote and Nunchuk controller, unleashing new moves as players shake the controller and even point at and drag items with the pointer.",
    "By taking the suspense, challenge and visceral charge of the original, and adding startling new realism and responsiveness, Half-Life 2 opens the door to a world where the player's presence affects everything around him, from the physical environment to the behaviors -- even the emotions -- of both friends and enemies. The player again picks up the crowbar of research scientist Gordon Freeman, who finds himself on an alien-infested Earth being picked to the bone, its resources depleted, its populace dwindling. Freeman is thrust into the unenviable role of rescuing the world from the wrong he unleashed back at Black Mesa. And a lot of people -- people he cares about -- are counting on him.",
    "As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to gain access to the Sacred Realm, where he places his tainted hands on Triforce and transforms the beautiful Hyrulean landscape into a barren wasteland. Link is determined to fix the problems he helped to create, so with the help of Rauru he travels through time gathering the powers of the Seven Sages.",
    "An ancient evil force has reawakened, attacking in secret and feeding on the souls of mighty warriors. To lure it out of hiding will take the greatest fighting contest the world has ever seen...Tekken 3. Some are fighting for revenge, some for honor, Ultimately, all are fighting for their lives and the fate of all mankind.",
    "Prepare for an all-new RPG experience in Persona 5 Royal based in the universe of the award-winning series, Persona! Don the mask of Joker and join the Phantom Thieves of Hearts. Break free from the chains of modern society and stage grand heists to infiltrate the minds of the corrupt and make them change their ways! Persona 5 Royal is packed with new characters, confidants, story depth, new locations to explore, and a new grappling hook mechanic for stealthy access to new areas. With a new semester at Shujin Academy, get ready to strengthen your abilities in the metaverse and in your daily life. Persona 5 Royal presents a unique visual style and award nominated composer Shoji Meguro returns with an all-new soundtrack. Explore Tokyo, unlock new Personas, customize your own personal Thieves Den, discover a never-before-seen story arc, cutscenes, alternate endings, and more! Even for the most seasoned Phantom Thieves among us, Persona 5 Royal is a new challenge to defy conventions, discover the power within, and fight for justice. Wear the mask. Reveal your truth.",
    "With the addition of two new courses (a downtown Tokyo circuit and the Monte Carlo Grand Prix course), an impressive range of cars (from Vipers to Cooper Minis), and a soundtrack featuring everything from Lenny Kravitz to Jimi Hendrix, this highly anticipated title is finally ready to be driven.",
    "Racing","Exploration","Open World","Sandbox","Third Person","Sports",
    "Horror","Military","Gore","Casual","Shooter","Sports",
    "Card","Fighting","Battle Royale","Shooter", "Fantasy"
]
scores = [0.99, 0.7, 0.9, 0.2, 0.1, 0.1, 0, 0.6, 0.3, 0.2, 0.7, 0.1, 0.9, 0.1, 0.99, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.8]

eval_cols = ["sentence1", "sentence2", "score"]
eval_examples = []
for s1, s2, score in zip(sentences1, sentences2, scores):
    example = [s1, s2, score]
    eval_examples.append(example)
eval_df = pd.DataFrame(eval_examples, columns=eval_cols)
eval_dataset = datasets.Dataset.from_pandas(eval_df)

evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=evaluation.SimilarityFunction.COSINE,
    name="sts-dev",
)

Trains the model.

In [20]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

trainer.train()

  0%|          | 0/3125 [00:00<?, ?it/s]

{'loss': 4.6724, 'grad_norm': 10.783743858337402, 'learning_rate': 4.6674964438122335e-05, 'epoch': 0.16}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.005044460296631, 'eval_sts-dev_pearson_cosine': 0.5738086752182747, 'eval_sts-dev_spearman_cosine': 0.44114845191199525, 'eval_sts-dev_pearson_manhattan': 0.5986259072215355, 'eval_sts-dev_spearman_manhattan': 0.48521264860125307, 'eval_sts-dev_pearson_euclidean': 0.587066584474308, 'eval_sts-dev_spearman_euclidean': 0.44114845191199525, 'eval_sts-dev_pearson_dot': 0.5738078912842106, 'eval_sts-dev_spearman_dot': 0.44114845191199525, 'eval_sts-dev_pearson_max': 0.5986259072215355, 'eval_sts-dev_spearman_max': 0.48521264860125307, 'eval_runtime': 3.3285, 'eval_samples_per_second': 6.91, 'eval_steps_per_second': 0.601, 'epoch': 0.16}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'loss': 4.3131, 'grad_norm': 11.479540824890137, 'learning_rate': 3.778449502133713e-05, 'epoch': 0.32}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.256532192230225, 'eval_sts-dev_pearson_cosine': 0.46326514502009253, 'eval_sts-dev_spearman_cosine': 0.36821460911598225, 'eval_sts-dev_pearson_manhattan': 0.5018310194442815, 'eval_sts-dev_spearman_manhattan': 0.42646038634890926, 'eval_sts-dev_pearson_euclidean': 0.4896913852943567, 'eval_sts-dev_spearman_euclidean': 0.36821460911598225, 'eval_sts-dev_pearson_dot': 0.4632648097879877, 'eval_sts-dev_spearman_dot': 0.36821460911598225, 'eval_sts-dev_pearson_max': 0.5018310194442815, 'eval_sts-dev_spearman_max': 0.42646038634890926, 'eval_runtime': 1.623, 'eval_samples_per_second': 14.171, 'eval_steps_per_second': 1.232, 'epoch': 0.32}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'loss': 4.1948, 'grad_norm': 9.571090698242188, 'learning_rate': 2.889402560455192e-05, 'epoch': 0.48}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.7077131271362305, 'eval_sts-dev_pearson_cosine': 0.46008893813674584, 'eval_sts-dev_spearman_cosine': 0.27096948538796495, 'eval_sts-dev_pearson_manhattan': 0.48679889108005336, 'eval_sts-dev_spearman_manhattan': 0.3150336820772227, 'eval_sts-dev_pearson_euclidean': 0.4756745653828944, 'eval_sts-dev_spearman_euclidean': 0.27096948538796495, 'eval_sts-dev_pearson_dot': 0.4600889191434579, 'eval_sts-dev_spearman_dot': 0.27096948538796495, 'eval_sts-dev_pearson_max': 0.48679889108005336, 'eval_sts-dev_spearman_max': 0.3150336820772227, 'eval_runtime': 1.6211, 'eval_samples_per_second': 14.188, 'eval_steps_per_second': 1.234, 'epoch': 0.48}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'loss': 4.1296, 'grad_norm': 11.812318801879883, 'learning_rate': 2.0003556187766715e-05, 'epoch': 0.64}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.4049577713012695, 'eval_sts-dev_pearson_cosine': 0.6038004730753506, 'eval_sts-dev_spearman_cosine': 0.5490297610477644, 'eval_sts-dev_pearson_manhattan': 0.6355996042424832, 'eval_sts-dev_spearman_manhattan': 0.5778994071545196, 'eval_sts-dev_pearson_euclidean': 0.6184797364282462, 'eval_sts-dev_spearman_euclidean': 0.5490297610477644, 'eval_sts-dev_pearson_dot': 0.6038005150883305, 'eval_sts-dev_spearman_dot': 0.5490297610477644, 'eval_sts-dev_pearson_max': 0.6355996042424832, 'eval_sts-dev_spearman_max': 0.5778994071545196, 'eval_runtime': 2.0505, 'eval_samples_per_second': 11.217, 'eval_steps_per_second': 0.975, 'epoch': 0.64}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'loss': 4.032, 'grad_norm': 19.08639144897461, 'learning_rate': 1.1113086770981508e-05, 'epoch': 0.8}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.9160261154174805, 'eval_sts-dev_pearson_cosine': 0.5792408225250146, 'eval_sts-dev_spearman_cosine': 0.5049655643585066, 'eval_sts-dev_pearson_manhattan': 0.6083991209095803, 'eval_sts-dev_spearman_manhattan': 0.518640659882759, 'eval_sts-dev_pearson_euclidean': 0.5997451095036523, 'eval_sts-dev_spearman_euclidean': 0.5049655643585066, 'eval_sts-dev_pearson_dot': 0.5792404302245836, 'eval_sts-dev_spearman_dot': 0.5049655643585066, 'eval_sts-dev_pearson_max': 0.6083991209095803, 'eval_sts-dev_spearman_max': 0.518640659882759, 'eval_runtime': 1.9053, 'eval_samples_per_second': 12.072, 'eval_steps_per_second': 1.05, 'epoch': 0.8}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'loss': 3.9797, 'grad_norm': 17.48697280883789, 'learning_rate': 2.2226173541963018e-06, 'epoch': 0.96}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 4.761734485626221, 'eval_sts-dev_pearson_cosine': 0.6273173196225443, 'eval_sts-dev_spearman_cosine': 0.5292768452905109, 'eval_sts-dev_pearson_manhattan': 0.6539363027576667, 'eval_sts-dev_spearman_manhattan': 0.5383935756400126, 'eval_sts-dev_pearson_euclidean': 0.6390840229245605, 'eval_sts-dev_spearman_euclidean': 0.5292768452905109, 'eval_sts-dev_pearson_dot': 0.6273172474785175, 'eval_sts-dev_spearman_dot': 0.5292768452905109, 'eval_sts-dev_pearson_max': 0.6539363027576667, 'eval_sts-dev_spearman_max': 0.5383935756400126, 'eval_runtime': 1.9588, 'eval_samples_per_second': 11.742, 'eval_steps_per_second': 1.021, 'epoch': 0.96}


Computing widget examples:   0%|          | 0/5 [00:00<?, ?example/s]

{'train_runtime': 2673.6807, 'train_samples_per_second': 18.701, 'train_steps_per_second': 1.169, 'train_loss': 4.206983974609375, 'epoch': 1.0}


TrainOutput(global_step=3125, training_loss=4.206983974609375, metrics={'train_runtime': 2673.6807, 'train_samples_per_second': 18.701, 'train_steps_per_second': 1.169, 'total_flos': 0.0, 'train_loss': 4.206983974609375, 'epoch': 1.0})

## Visualization Pt. 2

### Visualizing Descriptions

In [21]:
visualize(model, sample_descriptions, "tokenize_desc", 7)





### Visualizing Tags

In [22]:
visualize(model, sample_descriptions, "tokenize_tag", 7)





### Visualizing Combination: Description + Tags

In [23]:
visualize(model, sample_descriptions, "tokenize_desc_tag", 7)





In [24]:
old_model = SentenceTransformer("all-mpnet-base-v2")

game1_desc = sample_descriptions.loc[0, "tokenize_desc"]
game2_desc = sample_descriptions.loc[1, "tokenize_desc"]

print(game1_desc)
print(game2_desc)

old_enc_game1 = old_model.encode(game1_desc)
old_enc_game2 = old_model.encode(game2_desc)
new_enc_game1 = model.encode(game1_desc)
new_enc_game2 = model.encode(game2_desc)

print(util.cos_sim(old_enc_game1, old_enc_game2))
print(util.cos_sim(new_enc_game1, new_enc_game2))


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



Baldur ’ s Gate 3 is a story - rich , party - based RPG set in the universe of Dungeons & Dragons , where your choices shape a tale of fellowship and betrayal , survival and sacrifice , and the lure of absolute power .
Counter - Strike : Global_Offensive ( CS : GO ) expands upon the team - based action gameplay that it pioneered when it was launched 19 years ago . CS : GO features new maps , characters , weapons , and game modes , and delivers updated versions of the classic CS content ( de_dust2 , etc . ) .
tensor([[0.1416]])
tensor([[0.4916]])


In [25]:
# for i in range(len(sample_descriptions) - 1):
#     s1, t1 = sample_descriptions.loc[i, ["Title", "tokenize_tag"]]
#     s2, t2 = sample_descriptions.loc[i + 1, ["Title", "tokenize_tag"]]
#     print(s1, s2, (score := util.cos_sim(model.encode(t1), model.encode(t2))), score > 0.75)
#     print()

In [26]:
# from openTSNE import TSNE
# import matplotlib.pyplot as plt

In [27]:
# tsne = TSNE(
#     perplexity=6.33,
#     metric=cosine_similarity,
#     random_state=RANDOM_STATE,
#     verbose=True,
# )

# embeddings_tsne = tsne.fit(sample_descriptions["tokenize_desc"])

# res = tsne.transform(sample_descriptions["tokenize_desc"])

In [28]:
# plt.scatter(res[:, 0], res[:, 1])