In [1]:
import os
import sys
import networkx as nx
import numpy as np
import pandas as pd
import torch

In [2]:
# Work from the repository root so that the relative "data" and "src" paths
# used by the following cells resolve.
# Guarded: re-running this cell (or starting the kernel in the root already)
# must not climb one directory further each time.
if not os.path.isdir('src'):
    os.chdir('..')
# Make the project modules (e.g. `params`) importable, without adding the
# same entry to sys.path on every re-run.
if 'src' not in sys.path:
    sys.path.append('src')

In [3]:
# All raw inputs are Parquet files stored in the "data" directory
data_dir = "data"

# Read the three tables one after the other: descriptions, recommendations, games
descriptions, recommendations, games = (
    pd.read_parquet(os.path.join(data_dir, parquet_name))
    for parquet_name in (
        "descriptions.parquet",
        "recommendations.parquet",
        "games.parquet",
    )
)

In [4]:
# Give the descriptions table the same key column name as the games table,
# then keep only the games that have a matching description row.
descriptions = descriptions.rename({"AppID": "app_id"}, axis="columns")
games = games.merge(descriptions, how="inner", on="app_id")

In [5]:
# Utils
# Registry of every categorical mapping built in this notebook
MAPPINGS = {}


def make_map(categories):
    """Return a dict that assigns each category its position in `categories`.

    `categories` can be any iterable of hashable values. If a value occurs
    more than once, its last position wins.
    """
    index_of = {}
    for position, category in enumerate(categories):
        index_of[category] = position
    return index_of

In [6]:
def pad_and_truncate(sequences, max_len, padding_value=-1):
    """Turn a list of variable-length id lists into one fixed-width tensor.

    Every sequence is padded with `padding_value` or cut so that it is exactly
    `max_len` wide, and all values are then shifted by +1. With the default
    `padding_value` of -1 the padded positions therefore hold 0.

    Returns:
        (padded, lengths): `padded` has one row per input sequence and
        `max_len` columns; `lengths` is an int column vector holding
        min(len(sequence), max_len) for each row.
    """
    clipped_lengths = [min(len(seq), max_len) for seq in sequences]

    as_tensors = []
    for seq in sequences:
        converted = torch.Tensor(seq)
        # A zero-dimensional tensor cannot be padded: replace it by a single pad
        if converted.dim() == 0:
            converted = torch.tensor([padding_value])
        as_tensors.append(converted)

    # Sentinel row of width max_len: guarantees the padded width is >= max_len
    as_tensors.append(torch.zeros(max_len))

    padded = torch.nn.utils.rnn.pad_sequence(
        as_tensors, batch_first=True, padding_value=padding_value
    )

    # Drop the sentinel row, cut to max_len columns, then shift by one so that
    # the padding value -1 becomes 0
    padded = padded[:-1, :max_len] + 1

    return padded, torch.tensor(clipped_lengths).int().view(-1, 1)

# Data Preprocessing

In [7]:
def preprocess(recommendations, games):
    """Filter the recommendations and games tables down to the modelling subset.

    Both inputs are copied, so the caller's dataframes are left untouched.

    Args:
        recommendations: dataframe with at least the columns "app_id",
            "user_id", "is_recommended" and "helpful".
        games: dataframe with at least the columns "app_id" and "date_release".

    Returns:
        (recommendations, games): the filtered recommendations, with the
        game's "date_release" attached, and the games that still appear in
        those recommendations.
    """
    recommendations = recommendations.copy()
    games = games.copy()

    games["app_id"] = games["app_id"].astype("int")
    recommendations["app_id"] = recommendations["app_id"].astype("int")
    recommendations["user_id"] = recommendations["user_id"].astype("int")

    # Keep only the recommendations whose game is present in the games dataframe
    recommendations = recommendations[recommendations["app_id"].isin(games["app_id"])]

    # When this function is applied to its own output (e.g. the calling cell is
    # re-run), "date_release" is already present and the merge below would
    # produce "date_release_x" / "date_release_y" instead. Drop it first.
    recommendations = recommendations.drop(columns=["date_release"], errors="ignore")

    # Attach the release date of each game
    recommendations = pd.merge(
        recommendations, games[["app_id", "date_release"]], on="app_id"
    )

    # Keep only positive recommendations whose helpfulness is greater than 1
    recommendations = recommendations.query("is_recommended == True & helpful > 1")

    # Keep only the users with more than 1 remaining review
    s = recommendations.groupby("user_id").count()["app_id"]
    filt = s[s > 1]
    recommendations = recommendations[recommendations["user_id"].isin(filt.index)]

    # Keep only the games with more than 5 remaining recommendations
    s = recommendations.groupby("app_id").count()["user_id"]
    filt = s[s > 5]
    recommendations = recommendations[recommendations["app_id"].isin(filt.index)]

    # Keep only the games that still appear in the recommendations
    games = games[games["app_id"].isin(recommendations["app_id"])]

    return recommendations, games

In [8]:
# Replace the raw frames by their filtered versions; every later cell works on these
recommendations, games = preprocess(recommendations, games)

# Adjacency matrix creation

In [9]:
# Map ids to contiguous ranges.
# Games are numbered in order of first appearance once the reviews are sorted
# by release date; users are numbered in order of first appearance.
MAPPINGS["app"] = make_map(
    list(recommendations.sort_values("date_release")["app_id"].unique())
)
MAPPINGS["user"] = make_map(list(recommendations["user_id"].unique()))

# Attach the mapped ids to both frames
recommendations["mapped_appid"] = recommendations["app_id"].map(MAPPINGS["app"])
recommendations["mapped_userid"] = recommendations["user_id"].map(MAPPINGS["user"])
games["mapped_appid"] = games["app_id"].map(MAPPINGS["app"])

In [10]:
def create_app_user_pairs(recommendations):
    """Build the sparse user x game review-count matrix.

    Rows follow the sorted "mapped_userid" values and columns the sorted
    "mapped_appid" values; each cell counts the rows of `recommendations`
    for that (user, game) pair. The result is a SciPy COO matrix.
    """
    pair_counts = recommendations.groupby(["mapped_userid", "mapped_appid"]).size()
    count_table = pair_counts.unstack(fill_value=0)

    # Switch to a sparse dtype first so that the frame can be exported as COO
    sparse_table = count_table.astype(pd.SparseDtype("int", 0))
    return sparse_table.sparse.to_coo()


# Sparse (users x games) matrix of review counts for the filtered recommendations
user_app_pairs = create_app_user_pairs(recommendations)

In [11]:
# Co-review counts: (games x users) times (users x games), so entry (i, j)
# sums, over all users, the product of their review counts for games i and j
edges = user_app_pairs.T @ user_app_pairs

In [12]:
# Convert the sparse co-review matrix to a dense numpy array
edges = edges.toarray()
# Remove self references: zero the diagonal (each game paired with itself), in place
np.fill_diagonal(edges, 0)

In [13]:
# Peek at the raw co-review counts (numpy abbreviates the printed array)
edges

array([[  0, 266,  68, ...,   0,   0,   0],
       [266,   0,  70, ...,   1,   0,   0],
       [ 68,  70,   0, ...,   0,   0,   0],
       ...,
       [  0,   1,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [14]:
# Sanity check of the matrix product above: the number of users having more
# than one review row among the games with mapped ids 0 and 1 must equal
# edges[0, 1].
# NOTE(review): this only matches the co-review count if a user never has two
# review rows for the same game — confirm against the data.
assert (
    recommendations.query("mapped_appid <2").copy().groupby("user_id").count()["app_id"]
    > 1
).sum() == edges[0, 1]

In [15]:
# Entry (i, j) of the share matrix is edges[i, j] divided by the total
# co-review count of game j. With a threshold of 0.005 (i.e. 0.5%), about 8%
# of all (i, j) entries end up connected.
# Broadcasting replaces the dense diagonal-matrix product, and games without
# any co-review get a share of 0 instead of a division by zero.
(
    np.divide(
        edges,
        edges.sum(axis=1),
        out=np.zeros(edges.shape),
        where=edges.sum(axis=1) > 0,
    )
    > 0.005
).mean()

0.07919158675532093

In [16]:
# Binarize the co-review counts: entry (i, j) becomes True when edges[i, j]
# exceeds 0.5% of the total co-review count of game j.
# This overwrites `edges`, so refuse to run on an already-binarized matrix:
# re-thresholding it would silently produce a different graph.
if edges.dtype == bool:
    raise RuntimeError(
        "edges is already thresholded; rebuild the co-review counts before "
        "re-running this cell"
    )
# Broadcasting instead of a dense diagonal-matrix product; games without any
# co-review get a share of 0 instead of a division by zero.
edges = (
    np.divide(
        edges,
        edges.sum(axis=1),
        out=np.zeros(edges.shape),
        where=edges.sum(axis=1) > 0,
    )
    > 0.005
)

# Features Creation

In [17]:
# Columns taken over from the games table; "mapped_appid" becomes the index
features_cols = [
    "mapped_appid",
    "title",
    "date_release",
    "price_original",
    "Developers",
    "Genres",
    "Tags",
    "About the game",
]
features = games.loc[:, features_cols].set_index("mapped_appid")

In [18]:
# Order the rows by mapped_appid.
# NOTE(review): presumably so that row k of `features` describes the same game
# as row/column k of `edges` — confirm.
features = features.sort_index()

In [19]:
# Show a few rows; seeded so that the displayed sample is the same on every run
features.sample(n=5, random_state=0)

Unnamed: 0_level_0,title,date_release,price_original,Developers,Genres,Tags,About the game
mapped_appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
341,Shop Titans,2020-05-05,0.0,"Kabam Games, Inc.","Adventure,Casual,Free to Play,RPG,Simulation","Free to Play,Simulation,RPG,Adventure,Casual,M...",As a thriving new shop owner in a bustling adv...
467,FOREWARNED,2021-09-10,12.99,Dreambyte Games,"Action,Adventure,Indie,Early Access","Horror,Online Co-Op,Survival Horror,Co-op,Mult...","Having long studied ancient Egyptian lore, you..."
244,The Jackbox Party Pack 5,2018-10-17,29.99,"Jackbox Games, Inc.",Casual,"Casual,Local Multiplayer,Comedy,Trivia,Funny,M...",It’s the biggest Party Pack yet with five part...
427,NieR Replicant™ ver.1.22474487139...,2021-04-23,59.99,"Square Enix,Toylogic Inc.","Action,Adventure,RPG","Great Soundtrack,Story Rich,Action,RPG,Adventu...",A thousand-year lie that would live on for ete...
82,Total War: ATTILA,2015-02-17,44.99,CREATIVE ASSEMBLY,Strategy,"Strategy,Historical,War,Turn-Based Strategy,Gr...","Against a darkening background of famine, dise..."


## Price

Divide the price by 5 and round to nearest integer

In [20]:
# Integer price bucket: the original price in units of 5, rounded
features["_price"] = features["price_original"].div(5).round().astype(int)

## Developer

Categorical

In [21]:
# One index per distinct value of the "Developers" column.
# The key keeps its existing spelling ("developper") because it ends up in mappings.json.
MAPPINGS["developper"] = make_map(features["Developers"].unique())

In [22]:
# Encode each game's "Developers" value by its index in the mapping built above
features["_developer"] = features["Developers"].map(MAPPINGS["developper"])

## Genres

In [23]:
# Imperfect: remove the "Early Access" genre from games that were released since.
# Only occurrences preceded by a comma are removed, so a genre list that starts
# with "Early Access" keeps it.
# regex=False makes the literal match explicit instead of relying on the
# default of str.replace, which differs between pandas versions.
features["Genres"] = features["Genres"].str.replace(",Early Access", "", regex=False)

In [24]:
def get_tensor_from_category_enumerations(col, seq_len):
    """Encode a comma-separated categorical column of `features` as padded ids.

    The distinct categories found in `features[col]` are numbered in sorted
    order, the numbering is stored in `MAPPINGS[col]`, and every row is
    converted to the list of ids of its categories.

    Sorting makes the numbering reproducible: iterating over a set of strings
    directly can give a different order in every Python process, which would
    change the saved mapping and tensors from one kernel run to the next.

    Args:
        col: name of a string column of the module-level `features` frame
            whose values are comma-separated category names.
        seq_len: number of id columns in the result, passed on to
            `pad_and_truncate` as its `max_len`.

    Returns:
        The `(sequences, lengths)` tuple produced by `pad_and_truncate`.
    """
    # Split once and reuse it for both the vocabulary and the encoding
    split_rows = features[col].str.split(",")

    categories = set()
    for row in split_rows:
        categories.update(row)

    map_categories = make_map(sorted(categories))
    # Item assignment mutates the module-level dict, so no `global` is needed
    MAPPINGS[col] = map_categories

    categories_vec = [[map_categories[x] for x in row] for row in split_rows]

    return pad_and_truncate(categories_vec, seq_len)

In [25]:
from params import GENRES_SEQ_LEN

# genres_tensor is the (padded ids, lengths) pair returned by pad_and_truncate,
# with GENRES_SEQ_LEN id columns per game
genres_tensor = get_tensor_from_category_enumerations("Genres", GENRES_SEQ_LEN)

## Tags

Very similar to genres

In [26]:
# Give games without tags the placeholder category "NoTags", so that every row
# is a string that can be split on "," by get_tensor_from_category_enumerations
features["Tags"] = features["Tags"].fillna("NoTags")

In [27]:
from params import TAGS_SEQ_LEN

# tags_tensor is the (padded ids, lengths) pair returned by pad_and_truncate,
# with TAGS_SEQ_LEN id columns per game
tags_tensor = get_tensor_from_category_enumerations("Tags", TAGS_SEQ_LEN)

## "About the game" section

Each description is embedded as the TF-IDF-weighted average of the fastText vectors of its words.

In [28]:
# These frames are not used by any cell below; drop the names before loading
# the embedding model
del descriptions, games, recommendations

In [29]:
import gensim

In [30]:
# Load a fastText model stored in Facebook's binary format.
# NOTE(review): the path is relative to the current working directory, and the
# file name suggests the English Wikipedia vectors — confirm where this file
# is expected to come from.
embedding = gensim.models.fasttext.load_facebook_model("wiki.en/wiki.en.bin")

In [32]:
# One description per game, in the row order of `features`
documents = features["About the game"].tolist()

In [33]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel


In [36]:
# Replace missing descriptions by the literal string "None" so that every
# entry is a string.
# NOTE(review): only `None` is handled here; a NaN entry would pass through
# unchanged — confirm that the column cannot contain NaN.
documents = ["None" if doc is None else doc for doc in documents]

In [37]:
# Lower-case each description and split it on whitespace; punctuation stays
# attached to the neighbouring token
tokenized_docs = [doc.lower().split() for doc in documents]

In [38]:
# Vocabulary (token <-> id) built over all tokenized descriptions
dictionary = Dictionary(tokenized_docs)

# Bag-of-words representation of every description
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# TF-IDF model fitted on the bag-of-words corpus
tfidf_model = TfidfModel(corpus)

In [40]:
import numpy as np

def get_tfidf_weighted_embeddings(tfidf_model, corpus, fasttext_model, dictionary):
    """Embed every document as the TF-IDF-weighted mean of its word vectors.

    Args:
        tfidf_model: indexable by a bag-of-words document; yields
            (word id, tf-idf score) pairs.
        corpus: iterable of bag-of-words documents.
        fasttext_model: exposes `vector_size` and a `wv` lookup supporting
            `word in wv` and `wv[word]`.
        dictionary: maps a word id to its word.

    Returns:
        A numpy array with one row per document. A document without any word
        known to `fasttext_model.wv` (or with zero total weight) is left as
        the zero vector.
    """

    def _embed_document(doc_bow):
        # Weighted sum of the word vectors, normalised by the total weight
        accumulator = np.zeros(fasttext_model.vector_size)
        weight_sum = 0
        for token_id, weight in dict(tfidf_model[doc_bow]).items():
            token = dictionary[token_id]
            if token not in fasttext_model.wv:
                continue
            accumulator += fasttext_model.wv[token] * weight
            weight_sum += weight
        if weight_sum > 0:
            accumulator /= weight_sum
        return accumulator

    return np.array([_embed_document(doc_bow) for doc_bow in corpus])

# One embedding row per description, in the order of `corpus`
weighted_embeddings = get_tfidf_weighted_embeddings(tfidf_model, corpus, embedding, dictionary)


In [42]:
# (number of documents, embedding.vector_size)
weighted_embeddings.shape

(692, 300)

# Write to disk

In [43]:
# Directory receiving every artifact written by the cells below.
# exist_ok=True creates it when missing and is a no-op when it already exists,
# without a separate existence check.
output_dir = "run_artifacts/preprocess"
os.makedirs(output_dir, exist_ok=True)

In [44]:
## edges

In [45]:
# Write the boolean adjacency matrix as CSV; with to_csv's defaults the file
# also contains a header row and the row index as first column
pd.DataFrame(edges).to_csv(f"{output_dir}/edges.csv")

In [46]:
## mappings

In [47]:
# Rebuild the two id mappings with plain Python ints on both sides
# (presumably so that json.dump accepts them — confirm).
# NOTE(review): enumerate() over a dict iterates over its keys, so each result
# maps position -> original id, i.e. the opposite direction of the other
# entries of MAPPINGS (category -> index). Confirm that the consumer of
# mappings.json expects this direction.
# Run this cell only once per kernel: a second run would enumerate the new
# keys and turn both mappings into identity maps.
MAPPINGS["app"] = {int(a): int(b) for a, b in enumerate(MAPPINGS["app"])}
MAPPINGS["user"] = {int(a): int(b) for a, b in enumerate(MAPPINGS["user"])}

In [48]:
import json

In [49]:
# Persist every mapping of this notebook as a single JSON document
with open(f"{output_dir}/mappings.json", "w") as mappings_file:
    json.dump(MAPPINGS, mappings_file)

In [50]:
## features + edges as labels

In [51]:
# Engineered scalar features: every column whose name starts with "_", as an int tensor
t1 = torch.Tensor(
    features.loc[
        :, [name for name in features.columns if name.startswith("_")]
    ].to_numpy()
).int()

In [52]:
# Column layout of X, left to right: the "_"-prefixed scalar features (t1),
# the padded genre ids followed by the genre count column, then the padded tag
# ids followed by the tag count column. genres_tensor and tags_tensor are
# (sequences, lengths) tuples, hence the unpacking.
X = torch.cat((t1, *genres_tensor, *tags_tensor), dim=1)

In [53]:
# Pair row k of the feature matrix X with row k of the boolean adjacency
# matrix, which serves as the label, and serialize the whole dataset object
dataset = torch.utils.data.TensorDataset(X, torch.Tensor(edges).bool())
torch.save(dataset, f"{output_dir}/dataset.t")

In [None]:
# Fixed the misspelled name, which raised a NameError.
# NOTE(review): no cell above writes the description embeddings to disk —
# confirm whether they are meant to be saved next to the other artifacts.
weighted_embeddings