In [None]:
#project created

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re

# ==========================================
# 1. DATA LOADING & DYNAMIC COLUMN HANDLING
# ==========================================
print("Step 1: Loading and Cleaning Data...")
# Load dataset - Ensure 'games.csv' is in your project folder
try:
    games = pd.read_csv('games.csv')
except FileNotFoundError:
    print("Error: 'games.csv' not found. Please check your file path.")
    raise

# Limiting to 15,000 records for performance while meeting assignment size requirements
games = games.head(15000).copy() 

# Detect actual column names to prevent KeyError
cols = games.columns.tolist()
print(f"Detected columns in CSV: {cols}")

genre_col = next((c for c in ['genres', 'genre', 'categories'] if c in cols), None)
tag_col = next((c for c in ['tags', 'popular_tags'] if c in cols), None)
desc_col = next((c for c in ['description', 'short_description', 'about_the_game'] if c in cols), None)

# ==========================================
# 2. ADVANCED FEATURE ENGINEERING (Fixes ValueError)
# ==========================================
print("Step 2: Preparing features...")

# Combine text columns. If a column is missing, it's ignored.
games['features'] = ""
if genre_col:
    games['features'] += games[genre_col].fillna('') + " "
if tag_col:
    games['features'] += games[tag_col].fillna('') + " "
if desc_col:
    games['features'] += games[desc_col].fillna('')

# Preprocessing: Lowercase and strip whitespace
games['features'] = games['features'].str.lower().str.strip()

# CRITICAL FIX: Ensure no row is purely empty or just whitespace
# This prevents the 'Empty Vocabulary' error in TfidfVectorizer
games['features'] = games['features'].replace('', 'steam game indie casual')

# ==========================================
# 3. MODEL BUILDING (TF-IDF & COSINE SIMILARITY)
# ==========================================
print("Step 3: Vectorizing and Calculating Similarity...")

# Use a custom token_pattern to catch words of at least 1 character (Fixes ValueError)
tfidf = TfidfVectorizer(
    stop_words='english', 
    max_features=5000,
    token_pattern=r"(?u)\b\w+\b" 
)

try:
    tfidf_matrix = tfidf.fit_transform(games['features'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    print("Success! Similarity matrix created.")
except ValueError:
    # Final fallback if TF-IDF still struggles with stop words
    tfidf = TfidfVectorizer(stop_words=None, token_pattern=r"(?u)\b\w+\b")
    tfidf_matrix = tfidf.fit_transform(games['features'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    print("Success! Similarity matrix created using fallback settings.")

# ==========================================
# 4. RECOMMENDATION LOGIC & TESTING
# ==========================================
def get_recommendations(title, cosine_sim=cosine_sim):
    try:
        # Find index of the game (case-insensitive search)
        idx = games[games['title'].str.contains(re.escape(title), case=False, na=False)].index[0]
        
        # Calculate similarity scores
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get top 5 recommendations (skipping the first one as it's the input game)
        sim_scores = sim_scores[1:6]
        game_indices = [i[0] for i in sim_scores]
        
        return games.iloc[game_indices][['title', genre_col if genre_col else 'features']]
    except Exception as e:
        return f"Could not find recommendations for '{title}'. Try another game."

# Test with the first game in the list
test_game = games['title'].iloc[0]
print(f"\nTesting recommendations for: {test_game}")
print(get_recommendations(test_game))

# ==========================================
# 5. EXPORT FOR STREAMLIT WEB GUI
# ==========================================
print("\nStep 4: Saving pkl files for app.py...")
# We save the dataframe and the similarity matrix to load them into Streamlit
joblib.dump(cosine_sim, 'similarity_model.pkl')
joblib.dump(games, 'games_data.pkl')

print("All set! 'similarity_model.pkl' and 'games_data.pkl' are in your folder.")

Step 1: Loading and Cleaning Data...
Detected columns in CSV: ['app_id', 'title', 'date_release', 'win', 'mac', 'linux', 'rating', 'positive_ratio', 'user_reviews', 'price_final', 'price_original', 'discount', 'steam_deck']
Step 2: Preparing features...
Step 3: Vectorizing and Calculating Similarity...
Success! Similarity matrix created.

Testing recommendations for: Prince of Persia: Warrior Within™
                          title                 features
1       BRINK: Agents of Change  steam game indie casual
2  Monaco: What's Yours Is Mine  steam game indie casual
3            Escape Dead Island  steam game indie casual
4       Dungeon of the ENDLESS™  steam game indie casual
5                  METAL SLUG 3  steam game indie casual

Step 4: Saving pkl files for app.py...
All set! 'similarity_model.pkl' and 'games_data.pkl' are in your folder.
