In [5]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Switch the working directory so that relative paths used later in the
# notebook (e.g. "steam.csv") resolve against the course folder.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will fail here on any machine where the directory does not exist.
os.chdir("C:\\Users\\reind\\Documents\\INST414\\Module 3")

In [6]:
# Read the Steam dataset, keep only rows that have a value in every column
# used later, keep the first row for each game name, and rebuild a clean
# 0..n-1 index so row labels match row positions.
df = (
    pd.read_csv("steam.csv")
    .dropna(subset=[
        'name', 'steamspy_tags', 'genres',
        'positive_ratings', 'negative_ratings', 'average_playtime',
    ])
    .drop_duplicates(subset='name')
    .reset_index(drop=True)
)

In [7]:
# Merge the SteamSpy tags and the genres of each game into one
# ';'-separated string (missing values are treated as empty strings).
df['all_tags'] = df['steamspy_tags'].fillna('') + ';' + df['genres'].fillna('')

# Lower-case and de-duplicate the tags of each game.
# - Empty tokens (from a blank field or a stray ';') are dropped so they do
#   not turn into a meaningless '' feature column.
# - sorted() gives every list a reproducible order; the iteration order of a
#   set of strings can differ from one interpreter run to the next.
df['all_tags'] = df['all_tags'].apply(
    lambda x: sorted({tag for tag in x.lower().split(';') if tag})
)

# One-hot encode the tag lists: one row per game, one 0/1 column per tag.
mlb = MultiLabelBinarizer()
tag_features = mlb.fit_transform(df['all_tags'])

In [11]:
# Take a private copy of the three numeric columns and rescale each one with
# MinMaxScaler (default settings map every column onto the range [0, 1]).
numeric_features = df.loc[:, ['positive_ratings', 'negative_ratings', 'average_playtime']].copy()
scaler = MinMaxScaler()
numeric_scaled = scaler.fit_transform(numeric_features)

# Append the scaled numeric columns to the right of the tag columns,
# giving one feature row per game.
combined_features = np.concatenate((tag_features, numeric_scaled), axis=1)

In [12]:
# Wrap the binary tag matrix in a DataFrame, one column per tag
tag_df = pd.DataFrame(data=tag_features, columns=mlb.classes_)

# Do the same for the scaled numeric matrix, restoring the source column names
numeric_df = pd.DataFrame(
    data=numeric_scaled,
    columns=['positive_ratings', 'negative_ratings', 'average_playtime'],
)

# Join side by side: game name first, then the tag columns, then the numeric columns
combined_df = pd.concat(
    [df[['name']].reset_index(drop=True), tag_df, numeric_df],
    axis='columns',
)

# Preview the first five rows
combined_df.head(5)

Unnamed: 0,name,1980s,1990's,2.5d,2d,2d fighter,360 video,3d,3d platformer,3d vision,...,werewolves,western,word game,world war i,world war ii,wrestling,zombies,positive_ratings,negative_ratings,average_playtime
0,Counter-Strike,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.047093,0.006855,0.092391
1,Team Fortress Classic,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.001255,0.0013,0.001453
2,Day of Defeat,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0.001292,0.000817,0.000981
3,Deathmatch Classic,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.000481,0.000548,0.001353
4,Half-Life: Opposing Force,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.001985,0.000591,0.003273


In [9]:
# Pairwise cosine similarity between every pair of rows of combined_features:
# entry [i, j] is the similarity between game i and game j.
# NOTE(review): the result is a dense n x n array, so memory grows with the
# square of the number of games — confirm this fits for the full dataset.
similarity_matrix = cosine_similarity(combined_features)

def get_similar_games(game_name, top_n=10):
    """Return the games most similar to ``game_name``.

    Looks up the row of ``game_name`` in the module-level ``df`` and ranks
    every other row by its score in that row of the module-level
    ``similarity_matrix``.

    Args:
        game_name: Exact value from the ``name`` column of ``df``.
        top_n: Maximum number of similar games to return (default 10).
            Values below 1 give an empty result.

    Returns:
        A DataFrame with the columns "Game" and "Similarity Score" (rounded
        to 4 decimals), ordered from most to least similar, or a message
        string if ``game_name`` is not present in the dataset.
    """
    names = df['name'].tolist()
    if game_name not in names:
        return f"Game '{game_name}' not found in dataset."

    # Use the row *position* of the query: similarity_matrix is a plain array
    # and is addressed by position, not by df's index labels.
    idx = names.index(game_name)

    # Drop the query game by position instead of assuming it sorts first.
    # A game with identical features ties with the query's own score, and with
    # a stable sort the query could then land in second place and be returned
    # as "similar to itself" while the real match is discarded.
    sim_scores = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_indices = sim_scores[:max(top_n, 0)]

    results = [(names[i], round(score, 4)) for i, score in top_indices]
    return pd.DataFrame(results, columns=["Game", "Similarity Score"])

# Games to look up in the similarity matrix.
query_games = ["Counter-Strike", "Bloons TD 5", "Beat Saber"]

for game in query_games:
    similar_games = get_similar_games(game)
    # get_similar_games returns a plain message string when the game is not
    # in the dataset; print it instead of calling DataFrame methods on it.
    if isinstance(similar_games, str):
        print(similar_games)
        continue
    print(f"Top 10 games similar to '{game}':")
    print(similar_games.to_string(index=False))

Top 10 games similar to 'Counter-Strike':
                            Game  Similarity Score
          Counter-Strike: Source            0.9994
Call of Duty®: Modern Warfare® 2            0.9985
     Battlefield: Bad Company™ 2            0.9985
                       Homefront            0.9985
Call of Duty® 4: Modern Warfare®            0.9985
     Call of Duty®: Black Ops II            0.9984
                      Battleborn            0.9983
  Counter-Strike: Condition Zero            0.9983
       Unreal Tournament 3 Black            0.9983
         Half-Life 2: Deathmatch            0.9983
Top 10 games similar to 'Bloons TD 5':
                                 Game  Similarity Score
                     Orcs Must Die! 2            0.8944
                        Anomaly Korea            0.8660
Anomaly Warzone Earth Mobile Campaign            0.8660
                       Rush for Glory            0.8660
             Bloons Adventure Time TD            0.8660
                      