In [1]:
# General
import pandas as pd
import math
import json
import os
import glob

# XLM-RoBERTa (Hugging Face Transformers) and PyTorch
from transformers import XLMRobertaTokenizer, XLMRobertaModel
import torch

In [13]:
# Load the cleaned metadata table and discard the two requirement columns,
# which are not used anywhere below.
metadata_csv = 'metadata-clean-extracted.csv'
unused_columns = ['min_requirements', 'rec_requirements']
df = pd.read_csv(metadata_csv).drop(columns=unused_columns)
df

Unnamed: 0,name,steam_appid,descriptions,is_free,total_recommendations,num_languages,num_developers,num_publishers,num_genres,num_categories,...,developers_12,developers_13,developers_14,developers_15,day,month,year,age,era,price
0,Clash of Warlords,1430720,Join the Epic Battles in this most addicting t...,False,0,2,1,1,1,3,...,1.0,0.0,1.0,1.0,7,2,2021,4,2020,699.0
1,Mine Crazy: The Korean Grinder,1430740,This is the essence of Korean grinder MMO. Thi...,False,0,1,1,1,4,2,...,1.0,0.0,0.0,0.0,8,10,2020,5,2020,199.0
2,Fade,1430100,Set in a small town in the Mid-west. You are a...,False,0,1,1,1,2,2,...,1.0,1.0,0.0,1.0,29,10,2020,5,2020,199.0
3,Clash: Artifacts of Chaos,1430680,ZENO EDITION About the Game You play as Pseudo...,False,759,13,1,1,3,6,...,0.0,1.0,0.0,1.0,9,3,2023,2,2020,2999.0
4,Astatos,1430970,"Astatos, a card-battling adventure filled with...",False,243,3,1,1,4,9,...,1.0,1.0,0.0,0.0,16,12,2021,4,2020,1499.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,Survive Avalon,1617600,The big 1.5 update is coming in next month!We ...,False,0,1,1,1,2,3,...,0.0,0.0,1.0,0.0,31,5,2021,4,2020,1299.0
91279,Cam Circle VR,1617770,Cam Circle VR is a mixed reality (MR) tool tha...,False,0,1,1,1,1,2,...,1.0,0.0,1.0,0.0,16,11,2021,4,2020,299.0
91280,Neon Light,1617780,Neon Light is a 2D platformer with interesting...,False,0,1,1,1,3,3,...,1.0,0.0,1.0,0.0,17,5,2021,4,2020,199.0
91281,G for Gravity,1617790,Humankind is in danger! Asteroids hit and dest...,True,0,1,1,1,3,2,...,0.0,0.0,1.0,1.0,10,5,2021,4,2020,0.0


In [3]:
# JSON file that records the index of the last batch that was fully vectorized.
progress_path = 'vectorization_progress.json'

def read_progress():
    """Return the index of the last completed batch, or -1 if none is recorded.

    Returns -1 when the progress file does not exist or has no 'batch' entry
    (or the entry is null). A stored index of 0 is returned as 0: it means the
    first batch is already done, so it must not be confused with "no progress".
    """
    if not os.path.exists(progress_path):
        return -1

    with open(progress_path, 'r') as file:
        progress = json.load(file)

    batch = progress.get('batch', None)
    # Compare against None explicitly; a plain truthiness test would discard
    # batch index 0 and cause the first batch to be recomputed on resume.
    if batch is None:
        return -1
    return batch
        
def write_progress(batch):
    """Overwrite the progress file with `batch` stored under the 'batch' key."""
    record = {'batch': batch}
    with open(progress_path, mode='w') as progress_file:
        json.dump(record, progress_file)

In [14]:
# Cache of (tokenizer, model, device) keyed by model name, so the pretrained
# weights are loaded at most once per kernel instead of on every call.
_XLMR_CACHE = {}


def _load_xlmr(model_name="xlm-roberta-base"):
    """Return (tokenizer, model, device) for `model_name`, loading them on first use."""
    if model_name not in _XLMR_CACHE:
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        model = XLMRobertaModel.from_pretrained(model_name)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # Inference only: make evaluation mode explicit so dropout is disabled.
        model.eval()
        _XLMR_CACHE[model_name] = (tokenizer, model, device)
    return _XLMR_CACHE[model_name]


def get_text_embedding(text):
    """Embed `text` with xlm-roberta-base and return the vectors as a DataFrame.

    Parameters
    ----------
    text : str or list of str
        Passed straight to the tokenizer with padding and truncation enabled.

    Returns
    -------
    pd.DataFrame
        One row per input text; the columns are the hidden-state values at the
        first token position of the model's last hidden layer.
    """
    tokenizer, model, device = _load_xlmr()

    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Move every input tensor to the same device as the model.
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}
    with torch.no_grad():
        output = model(**encoded_input)
    # Keep only position 0 of the sequence dimension for each input.
    embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
    return pd.DataFrame(embeddings)

In [None]:
def get_batch_embedding(series: pd.Series, batch_size=500):
    """Embed `series` batch by batch, writing one CSV of vectors per batch.

    Processing resumes after the batch index returned by `read_progress()`;
    after each batch is written, its index is stored with `write_progress()`.
    Batch `i` covers rows `i * batch_size` up to `(i + 1) * batch_size` of
    `series` (by position) and is saved to 'Text Vectors/vec{i}.csv'.

    Parameters
    ----------
    series : pd.Series
        Texts to embed; each batch is converted to a list and tokenized with
        padding and truncation enabled.
    batch_size : int, default 500
        Number of texts sent through the model at once.
    """
    output_dir = 'Text Vectors'
    # Make sure the output directory exists before the first batch is written.
    os.makedirs(output_dir, exist_ok=True)

    # Size the loop from the series that is actually being embedded rather
    # than from an unrelated global DataFrame.
    n_batch = math.ceil(len(series) / batch_size)
    next_batch_index = read_progress()

    model_name = "xlm-roberta-base"
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    model = XLMRobertaModel.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for i in range(next_batch_index + 1, n_batch):
        start = i * batch_size
        end = start + batch_size
        batch = series.iloc[start:end].to_list()

        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}
        with torch.no_grad():
            output = model(**encoded_input)
        # Keep only position 0 of the sequence dimension for each text.
        embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()

        df_emb = pd.DataFrame(embeddings)
        df_emb.to_csv(f'{output_dir}/vec{i}.csv', index=False)
        # Record progress only after the batch file has been written.
        write_progress(i)
        print(f'Vectorized batch {i}')

In [11]:
def _vector_batch_index(path):
    """Return the integer batch index encoded in a file name like 'vec12.csv'."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem[len('vec'):])


# glob returns paths in no guaranteed order, and plain alphabetical order would
# put vec10 before vec2. Sort by the numeric batch index so that row k of
# df_vectors corresponds to row k of the text series that was embedded.
vector_paths = sorted(glob.glob('Text Vectors/*.csv'), key=_vector_batch_index)
if not vector_paths:
    raise FileNotFoundError("No vector files found in 'Text Vectors/'.")

# Read every batch file and concatenate once instead of growing the frame
# inside a loop.
df_vectors = pd.concat(
    [pd.read_csv(path) for path in vector_paths],
    ignore_index=True,
)

df_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.120230,0.032435,0.072783,-0.007580,0.093149,-0.029818,0.052937,0.010010,0.110298,-0.111588,...,0.044189,0.008990,0.037291,0.093946,0.068796,-0.014621,0.158587,-0.118073,0.070505,-0.006734
1,0.068001,0.068762,0.064851,0.016615,0.057755,-0.035969,0.025135,-0.025863,0.092729,-0.119969,...,0.032085,0.007106,0.072068,0.074086,0.099032,-0.022591,0.149205,-0.071686,0.079684,-0.027101
2,0.128737,0.058814,0.064431,-0.006308,0.100606,-0.004734,0.016509,-0.017240,0.128302,-0.144646,...,0.059402,0.015776,0.053784,0.073522,0.099779,-0.015478,0.108809,-0.095704,0.088155,-0.010766
3,0.119251,0.037627,0.088111,-0.007414,0.055154,-0.051587,0.041245,0.000535,0.111813,-0.130479,...,0.032469,0.027759,0.061394,0.065656,0.083638,-0.000995,0.122059,-0.096047,0.087325,-0.036018
4,0.144066,0.031486,0.082451,-0.010148,0.050963,-0.037090,0.022749,-0.004857,0.113908,-0.156980,...,0.031386,0.017251,0.064261,0.061784,0.095770,-0.002748,0.133868,-0.117349,0.086446,-0.025368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,0.145327,0.075877,0.090401,0.017259,0.075890,-0.017609,0.023992,-0.052036,0.079123,-0.140810,...,0.091528,0.001114,0.127067,0.017323,0.151991,-0.017150,0.192407,-0.080582,0.097609,-0.004026
91279,0.117025,0.057656,0.058227,0.002681,0.074998,-0.038512,0.020128,-0.018914,0.117137,-0.112692,...,0.023619,0.020511,0.079572,0.074883,0.084747,0.001139,0.128561,-0.078396,0.074881,0.018819
91280,0.112214,0.075753,0.061717,0.004959,0.114069,-0.006993,0.017646,-0.015965,0.085327,-0.118103,...,0.080847,-0.002328,0.075592,0.029822,0.063645,0.004751,0.114775,-0.085941,0.106455,-0.000475
91281,0.131596,0.027756,0.096019,-0.007007,0.066382,-0.069157,0.029493,-0.006060,0.083698,-0.105396,...,0.040040,-0.009075,0.070354,0.069413,0.104669,-0.033823,0.136881,-0.058575,0.060078,-0.020842


In [12]:
# Persist the combined description vectors, without the row index.
df_vectors.to_csv(path_or_buf='des_vec.csv', index=False)

In [21]:
# Spot check: embed one description on its own and display the resulting vector.
# NOTE(review): this text looks like the 'Astatos' description (row 4 of the
# metadata table), so the output is presumably meant to be compared with row 4
# of df_vectors - confirm.
text = 'Astatos, a card-battling adventure filled with gods, heroes, and the instability of yourself and those around you. Engaging narrative meets tabletop-inspired card action about the influence you have on others. THE TRIAL BEGINS Experience narrative battles, explore solo, or play online— with strategic 1v1 battles, and chaotic co-op action for up to 8 players. Match cards, cast spells, summon heroes, and turn the opposing team over to your side to come out victorious! CALL FORTH YOUR HEROES Summon a wide range of Heroes to win the Trials ahead! Each with stunning artwork and unique abilities that lead to endless combinations. Just be sure your opponent doesn’t turn them against you! THE FUTURE OF THE CROWN MAGES As rebels approach, Vitus and Volesus must flee the academy and leave their order behind. Unleashing an ancient god and devil, the two adventure to uncover their past, influence the land around them, and find a place to call home.FREEDOM OR ORDER With the country in chaos, everyone has their own thoughts to share. Meet a wide range of characters who each have their own morals and desires. Make decisions and convince others to follow your lead, no matter the consequences.TIME TO TAKE A BREAK Collect mementos of your adventure and head to the bathhouse. Interludes between chapters let you spend mementos and get closer to your companions, which may in turn, influence later events.'

get_text_embedding(text)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.144067,0.031486,0.082451,-0.010148,0.050963,-0.03709,0.022749,-0.004857,0.113908,-0.15698,...,0.031386,0.017251,0.064261,0.061784,0.09577,-0.002748,0.133868,-0.117349,0.086446,-0.025369


In [17]:
# Give every vector column a 'description_<position>' name so the columns stay
# identifiable once they sit next to the metadata columns.
df_vectors.columns = [
    "description_" + str(position)
    for position, _ in enumerate(df_vectors.columns)
]
df_vectors

Unnamed: 0,description_0,description_1,description_2,description_3,description_4,description_5,description_6,description_7,description_8,description_9,...,description_758,description_759,description_760,description_761,description_762,description_763,description_764,description_765,description_766,description_767
0,0.120230,0.032435,0.072783,-0.007580,0.093149,-0.029818,0.052937,0.010010,0.110298,-0.111588,...,0.044189,0.008990,0.037291,0.093946,0.068796,-0.014621,0.158587,-0.118073,0.070505,-0.006734
1,0.068001,0.068762,0.064851,0.016615,0.057755,-0.035969,0.025135,-0.025863,0.092729,-0.119969,...,0.032085,0.007106,0.072068,0.074086,0.099032,-0.022591,0.149205,-0.071686,0.079684,-0.027101
2,0.128737,0.058814,0.064431,-0.006308,0.100606,-0.004734,0.016509,-0.017240,0.128302,-0.144646,...,0.059402,0.015776,0.053784,0.073522,0.099779,-0.015478,0.108809,-0.095704,0.088155,-0.010766
3,0.119251,0.037627,0.088111,-0.007414,0.055154,-0.051587,0.041245,0.000535,0.111813,-0.130479,...,0.032469,0.027759,0.061394,0.065656,0.083638,-0.000995,0.122059,-0.096047,0.087325,-0.036018
4,0.144066,0.031486,0.082451,-0.010148,0.050963,-0.037090,0.022749,-0.004857,0.113908,-0.156980,...,0.031386,0.017251,0.064261,0.061784,0.095770,-0.002748,0.133868,-0.117349,0.086446,-0.025368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,0.145327,0.075877,0.090401,0.017259,0.075890,-0.017609,0.023992,-0.052036,0.079123,-0.140810,...,0.091528,0.001114,0.127067,0.017323,0.151991,-0.017150,0.192407,-0.080582,0.097609,-0.004026
91279,0.117025,0.057656,0.058227,0.002681,0.074998,-0.038512,0.020128,-0.018914,0.117137,-0.112692,...,0.023619,0.020511,0.079572,0.074883,0.084747,0.001139,0.128561,-0.078396,0.074881,0.018819
91280,0.112214,0.075753,0.061717,0.004959,0.114069,-0.006993,0.017646,-0.015965,0.085327,-0.118103,...,0.080847,-0.002328,0.075592,0.029822,0.063645,0.004751,0.114775,-0.085941,0.106455,-0.000475
91281,0.131596,0.027756,0.096019,-0.007007,0.066382,-0.069157,0.029493,-0.006060,0.083698,-0.105396,...,0.040040,-0.009075,0.070354,0.069413,0.104669,-0.033823,0.136881,-0.058575,0.060078,-0.020842


In [23]:
# The two frames are matched purely by row index. An inner merge would silently
# drop any rows that do not line up, so refuse to continue unless both frames
# carry exactly the same index.
if not df.index.equals(df_vectors.index):
    raise ValueError(
        f"Row index mismatch: metadata has {len(df)} rows, "
        f"description vectors have {len(df_vectors)} rows."
    )
df = df.merge(df_vectors, left_index=True, right_index=True)

In [25]:
# The raw description text is no longer needed now that its vectors are in the
# frame. Reassign rather than mutate in place.
df = df.drop(columns=['descriptions'])

In [26]:
# Display the frame after the merge and the removal of the 'descriptions' column.
df

Unnamed: 0,name,steam_appid,is_free,total_recommendations,num_languages,num_developers,num_publishers,num_genres,num_categories,lang_Afrikaans,...,description_758,description_759,description_760,description_761,description_762,description_763,description_764,description_765,description_766,description_767
0,Clash of Warlords,1430720,False,0,2,1,1,1,3,False,...,0.044189,0.008990,0.037291,0.093946,0.068796,-0.014621,0.158587,-0.118073,0.070505,-0.006734
1,Mine Crazy: The Korean Grinder,1430740,False,0,1,1,1,4,2,False,...,0.032085,0.007106,0.072068,0.074086,0.099032,-0.022591,0.149205,-0.071686,0.079684,-0.027101
2,Fade,1430100,False,0,1,1,1,2,2,False,...,0.059402,0.015776,0.053784,0.073522,0.099779,-0.015478,0.108809,-0.095704,0.088155,-0.010766
3,Clash: Artifacts of Chaos,1430680,False,759,13,1,1,3,6,False,...,0.032469,0.027759,0.061394,0.065656,0.083638,-0.000995,0.122059,-0.096047,0.087325,-0.036018
4,Astatos,1430970,False,243,3,1,1,4,9,False,...,0.031386,0.017251,0.064261,0.061784,0.095770,-0.002748,0.133868,-0.117349,0.086446,-0.025368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91278,Survive Avalon,1617600,False,0,1,1,1,2,3,False,...,0.091528,0.001114,0.127067,0.017323,0.151991,-0.017150,0.192407,-0.080582,0.097609,-0.004026
91279,Cam Circle VR,1617770,False,0,1,1,1,1,2,False,...,0.023619,0.020511,0.079572,0.074883,0.084747,0.001139,0.128561,-0.078396,0.074881,0.018819
91280,Neon Light,1617780,False,0,1,1,1,3,3,False,...,0.080847,-0.002328,0.075592,0.029822,0.063645,0.004751,0.114775,-0.085941,0.106455,-0.000475
91281,G for Gravity,1617790,True,0,1,1,1,3,2,False,...,0.040040,-0.009075,0.070354,0.069413,0.104669,-0.033823,0.136881,-0.058575,0.060078,-0.020842


In [27]:
# Write the final feature table to disk, without the row index.
df.to_csv(path_or_buf='features_extracted.csv', index=False)