In [9]:
import pandas as pd

headers = ['user_id', 'game', 'behavior', 'hours_played', 'other_columns']
# Load the dataset
data = pd.read_csv('steam-200k.csv', header=None, names=headers)

# Data cleaning
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

# Filter games and users with sufficient interactions
data = data[data['hours_played'] > 0]

# Split into training and testing sets
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [10]:
import networkx as nx

# Create a bipartite graph
G = nx.Graph()

# Add nodes and edges
for row in train_data.itertuples():
    G.add_node(row.user_id, bipartite=0)
    G.add_node(row.game, bipartite=1)
    G.add_edge(row.user_id, row.game, weight=row.hours_played)


In [3]:
from node2vec import Node2Vec

# Generate random walks and learn embeddings
node2vec = Node2Vec(G, dimensions=64, walk_length=60, num_walks=1000, workers=4)
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Get embeddings for all nodes
embeddings = model.wv

  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|██████████| 17041/17041 [01:31<00:00, 185.88it/s]
Generating walks (CPU: 2): 100%|██████████| 250/250 [3:03:55<00:00, 44.14s/it]  
Generating walks (CPU: 4): 100%|██████████| 250/250 [3:04:15<00:00, 44.22s/it]
Generating walks (CPU: 1): 100%|██████████| 250/250 [3:04:53<00:00, 44.37s/it]
Generating walks (CPU: 3): 100%|██████████| 250/250 [3:07:30<00:00, 45.00s/it]


In [4]:
# Save model
model.save('node2vec_model_walk_1000_length_60')

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(user_id, num_recommendations=5):
    user_id_str = str(user_id)
    if user_id_str not in embeddings:
        print(f"User ID {user_id} was not found in embeddings")
        return []
    user_embedding = embeddings[str(user_id)]

    # Filter item nodes (those with bipartite=1)
    item_nodes = [node for node, data in G.nodes(data=True) if data.get('bipartite') == 1]
    
    # Compute similarity between user embedding and all item embeddings
    item_embeddings = [embeddings[str(item)] for item in item_nodes if str(item) in embeddings]
    similarities = cosine_similarity([user_embedding], item_embeddings).flatten()
    
    # Get top N most similar items
    top_indices = similarities.argsort()[-num_recommendations:][::-1]
    recommended_items = [item_nodes[i] for i in top_indices]
    
    return recommended_items

sample_user_id = 23717586
# Example recommendation
recommendations = recommend(user_id=sample_user_id, num_recommendations=5)
print(recommendations)

['Nancy Drew Ghost Dogs of Moon Lake ', 'Amnesia The Dark Descent', 'All Zombies Must Die!', 'Battlepaths', 'Jurassic Park The Game']


In [13]:
import numpy as np
from sklearn.metrics import mean_squared_error

def evaluate(test_data):
    predictions, actuals = [], []
    for row in test_data.itertuples():
        user_id, game_id = str(row.user_id), str(row.game)
        if user_id in embeddings and game_id in embeddings:
            user_embedding = embeddings[user_id]
            game_embedding = embeddings[game_id]
            prediction = cosine_similarity([user_embedding], [game_embedding]).flatten()[0]
            predictions.append(prediction)
            actuals.append(row.hours_played)
        # else:
            # print(f"Skipping missing ID: User {user_id}, Game {game_id}")
    
    if not predictions:  # Handle case where no valid predictions are made
        return float('inf')
    
    mse = mean_squared_error(actuals, predictions)
    return mse

# Calculate the mean rating from the test set
mean_rating = test_data['hours_played'].mean()

# Predict all ratings as the mean rating
baseline_predictions = [mean_rating] * len(test_data)

# Calculate MSE for the baseline model
from sklearn.metrics import mean_squared_error

baseline_mse = mean_squared_error(test_data['hours_played'], baseline_predictions)
print(f'Baseline Mean Squared Error: {baseline_mse}')

# Generate random predictions within the range of actual ratings
min_rating = test_data['hours_played'].min()
max_rating = test_data['hours_played'].max()
random_predictions = np.random.uniform(min_rating, max_rating, size=len(test_data))

# Calculate MSE for random predictions
random_mse = mean_squared_error(test_data['hours_played'], random_predictions)
print(f'Random Mean Squared Error: {random_mse}')

mse = evaluate(test_data)
print(f'Mean Squared Error of Node2Vec Model: {mse}')

Baseline Mean Squared Error: 18665.57929391597
Random Mean Squared Error: 30771769.723452054
embeddings KeyedVectors<vector_size=64, 17041 keys>
Mean Squared Error of Node2Vec Model: 18819.268193589043
