In [9]:
# Game Recommendation System - Phase 2

# Import the tools we need
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
# Visualization libraries are not needed yet, so they stay commented out.
# import matplotlib.pyplot as plt
# import seaborn as sns

# Step 1: Prepare our data
def prepare_data(df):
    """Build the numeric feature table used for training and similarity search.

    Genres and categories are converted to bag-of-words counts, the price is
    bucketed into ranges and one-hot encoded, and the three platform flags are
    appended as 0/1 columns.

    Side effects: ``df`` is modified in place -- missing prices are replaced
    with 0 and a categorical ``price_range`` column is added.

    Args:
        df: Game table with the columns ``genres_y``, ``categories_y``,
            ``price``, ``windows``, ``mac`` and ``linux``.

    Returns:
        A tuple ``(features_df, genre_vectorizer, category_vectorizer,
        price_encoder)``. ``features_df`` has one row per row of ``df`` and
        carries the same index; the other items are the fitted transformers.
    """
    print("Preparing our data...")

    # NOTE: every intermediate frame is built with index=df.index. pd.concat
    # with axis=1 aligns rows by index label, so a frame with a fresh
    # RangeIndex would be misaligned (and padded with NaN) whenever df's
    # index is not 0..n-1, e.g. after rows were dropped.

    # Convert genres into word-count features
    genre_vectorizer = CountVectorizer(max_features=50)
    genre_counts = genre_vectorizer.fit_transform(df['genres_y'])
    genre_df = pd.DataFrame(
        genre_counts.toarray(),
        columns=[f'genre_{g}' for g in genre_vectorizer.get_feature_names_out()],
        index=df.index,
    )

    # Convert categories into word-count features
    category_vectorizer = CountVectorizer(max_features=50)
    category_counts = category_vectorizer.fit_transform(df['categories_y'])
    category_df = pd.DataFrame(
        category_counts.toarray(),
        columns=[f'cat_{c}' for c in category_vectorizer.get_feature_names_out()],
        index=df.index,
    )

    # Bucket the price (games with similar prices might be similar).
    # Missing prices are treated as 0.
    df['price'] = df['price'].fillna(0)
    # include_lowest=True makes the first bin [0, 5] instead of (0, 5], so
    # free games (price 0) are labelled 'free_to_cheap' rather than falling
    # outside every bin.
    df['price_range'] = pd.cut(
        df['price'],
        bins=[0, 5, 10, 20, 100],
        labels=['free_to_cheap', 'budget', 'standard', 'premium'],
        include_lowest=True,
    )

    # Prices outside the bins (negative or above 100) come back as NaN from
    # pd.cut; give them an explicit 'other' category.
    df['price_range'] = df['price_range'].cat.add_categories(['other'])
    df['price_range'] = df['price_range'].fillna('other')

    price_encoder = OneHotEncoder(sparse_output=False)
    price_matrix = price_encoder.fit_transform(df[['price_range']])
    price_df = pd.DataFrame(
        price_matrix,
        columns=[f"price_{cat}" for cat in price_encoder.categories_[0]],
        index=df.index,
    )

    # Platform flags as 0/1 so the combined table is purely numeric
    # (mixing bool and numeric columns yields object-dtype rows).
    platform_df = df[['windows', 'mac', 'linux']].astype(int)

    # Combine all features column-wise
    features_df = pd.concat([genre_df, category_df, price_df, platform_df], axis=1)

    print(f"Created {features_df.shape[1]} features for each game!")

    return features_df, genre_vectorizer, category_vectorizer, price_encoder

# Step 2: Train our models
def train_models(features_df, df):
    """Fit a decision tree and a neural network and return the more accurate one.

    The feature table is split 70/30 (split seeded with 42). Each classifier
    is fitted on the training part to predict ``AppID`` and scored by accuracy
    on the held-out part.

    Args:
        features_df: Feature table, one row per game.
        df: Game table providing the ``AppID`` target column.

    Returns:
        ``(model, name)`` -- the fitted winner and its display name. The
        decision tree is returned when the two accuracies tie.
    """
    print("\nTraining our models...")

    targets = df['AppID']  # the game ID is the label being predicted
    X_train, X_test, y_train, y_test = train_test_split(
        features_df, targets, test_size=0.3, random_state=42
    )
    print(f"Training with {X_train.shape[0]} games, testing with {X_test.shape[0]} games")

    tree_label, nn_label = "Decision Tree", "Neural Network"
    estimators = {
        # Depth is capped to limit overfitting
        tree_label: DecisionTreeClassifier(max_depth=10),
        # A single hidden layer keeps the network small
        nn_label: MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, early_stopping=True),
    }

    # Fit in a fixed order: tree first, then the network
    for label, estimator in estimators.items():
        print(f"Training {label}...")
        estimator.fit(X_train, y_train)

    # Score each fitted model on the held-out games
    scores = {}
    for label, estimator in estimators.items():
        scores[label] = accuracy_score(y_test, estimator.predict(X_test))

    for label, score in scores.items():
        print(f"{label} Accuracy: {score:.4f}")

    # Plain-text comparison in place of a chart
    print("\nModel Accuracy Comparison:")
    for label, score in scores.items():
        print(f"{label}: {score:.4f}")

    # The network only wins when it is strictly better
    winner = nn_label if scores[nn_label] > scores[tree_label] else tree_label
    print(f"The {winner} did better!")
    return estimators[winner], winner

# Step 3: Create a function to find similar games
def find_similar_games(game_id, features_df, df, top_n=3):
    """Return the games whose feature vectors are closest to ``game_id``.

    Closeness is cosine similarity (dot product divided by the product of the
    vector lengths). A pair involving an all-zero vector gets similarity 0.

    Rows of ``features_df`` and ``df`` are matched by position, so the result
    does not depend on what index labels either frame carries.

    Args:
        game_id: ``AppID`` of the reference game.
        features_df: Numeric feature table, one row per row of ``df``.
        df: Game table with ``AppID`` and ``name`` columns.
        top_n: How many neighbours to return.

    Returns:
        A list of ``(AppID, similarity)`` tuples, most similar first. Ties keep
        the row order of ``df``. Rows whose ``AppID`` equals ``game_id`` are
        never returned.

    Raises:
        IndexError: If no row of ``df`` has ``AppID == game_id``.
        ValueError: If ``features_df`` and ``df`` have different row counts.
    """
    app_ids = df['AppID'].to_numpy()
    matches = np.flatnonzero(app_ids == game_id)
    if matches.size == 0:
        raise IndexError(f"No game with AppID {game_id!r} in the dataset")
    target_pos = matches[0]

    print(f"\nFinding similar games to {df['name'].to_numpy()[target_pos]}...")

    # Work on a plain float matrix: this sidesteps index labels entirely and
    # converts bool/object columns to numbers once, up front.
    matrix = features_df.to_numpy(dtype=float)
    if matrix.shape[0] != app_ids.shape[0]:
        raise ValueError(
            f"features_df has {matrix.shape[0]} rows but df has {app_ids.shape[0]}"
        )

    # Cosine similarity of every row against the target row in one pass
    target_vector = matrix[target_pos]
    dot_products = matrix @ target_vector
    magnitudes = np.linalg.norm(matrix, axis=1)
    denominators = magnitudes * magnitudes[target_pos]
    # Leave similarity at 0 wherever a zero-length vector is involved
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators != 0,
    )

    # Drop the reference game itself (and any duplicate rows of it)
    others = app_ids != game_id
    other_ids = app_ids[others]
    other_scores = similarities[others]

    # Stable sort on the negated score = descending order, ties in row order
    ranking = np.argsort(-other_scores, kind='stable')[:top_n]

    return [(other_ids[i], float(other_scores[i])) for i in ranking]

# Step 4: Make our recommendation function
def recommend_games(model, model_name, features_df, df, image_url, vectorizers):
    """Look up a game by its header image URL and print similar games.

    The trained ``model`` is not consulted: the game is identified by an exact
    match on ``header_image``. When nothing matches, a random row of ``df`` is
    used instead so the demo can continue.

    Args:
        model: Trained classifier (unused here).
        model_name: Display name of ``model`` (unused here).
        features_df: Feature table passed on to ``find_similar_games``.
        df: Game table with ``header_image``, ``name``, ``AppID``, ``price``
            and ``genres_y`` columns.
        image_url: Header image URL of the game to start from.
        vectorizers: 3-tuple of the fitted genre vectorizer, category
            vectorizer and price encoder.

    Returns:
        List of the recommended games' ``AppID`` values, most similar first.
    """
    print("\nStarting the recommendation process...")
    # Unpacked only to check the tuple shape; the transformers are not used yet
    _genre_vec, _category_vec, _price_enc = vectorizers

    # A real system would download the image, extract features from it and
    # ask the model which game it is. Here the URL is simply matched.
    matched = df[df['header_image'] == image_url]

    if not matched.empty:
        print(f"I found your game! It's {matched['name'].values[0]}")
    else:
        print("Sorry, I couldn't find a game with that image URL.")
        # Fall back to a random game for testing purposes
        matched = df.sample(1)
        print(f"Let's pretend we found: {matched['name'].values[0]}")

    seed_id = matched['AppID'].values[0]
    neighbours = find_similar_games(seed_id, features_df, df)

    # Show each recommendation with a few details
    print("\nYou might also like these games:")
    for neighbour_id, similarity in neighbours:
        details = df[df['AppID'] == neighbour_id]
        title = details['name'].values[0]
        price = details['price'].values[0]
        genres = details['genres_y'].values[0]
        image = details['header_image'].values[0]
        print(f"- {title} (Similarity: {similarity:.2f})")
        print(f"  Price: ${price}")
        print(f"  Genres: {genres}")
        print(f"  Image: {image}")
        print()

    return [neighbour_id for neighbour_id, _ in neighbours]

# Put it all together
def main():
    """Run the demo end to end: load data, build features, train, recommend.

    Downloads the dataset, drops incomplete rows, builds the feature table,
    trains the candidate models and prints recommendations for one randomly
    chosen game. Everything is reported via ``print``; nothing is returned.
    """
    print("Welcome to the Game Recommendation System!")

    # Load the dataset from GitHub (the file is read as CSV despite its name)
    url = "https://raw.githubusercontent.com/AljawharahAlotaibi/swe485/main/Dataset/updated_cleaned_games.xls"
    df = pd.read_csv(url)

    # Drop rows with missing values, then renumber the rows 0..n-1.
    # Without the reset, dropped rows leave gaps in the index, and the later
    # steps that combine or look up rows by position would be misaligned.
    df = df.dropna().reset_index(drop=True)

    # Show basic info about the dataset
    print("Dataset information:")
    df.info()

    # Prepare our data
    features_df, genre_vectorizer, category_vectorizer, price_encoder = prepare_data(df)
    vectorizers = (genre_vectorizer, category_vectorizer, price_encoder)

    # Train our models
    best_model, model_name = train_models(features_df, df)

    # Test our recommendation system
    print("\nLet's test our recommendation system!")

    # Pick a random image URL from our dataset
    random_game = df.sample(1)
    image_url = random_game['header_image'].values[0]
    print(f"Testing with image: {image_url}")

    # Print recommendations; the returned ID list is not needed here
    recommend_games(best_model, model_name, features_df, df, image_url, vectorizers)

    print("All done!")

# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Welcome to the Game Recommendation System!
Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5005 entries, 0 to 5004
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   AppID                 5005 non-null   int64  
 1   name                  5005 non-null   object 
 2   required_age          5005 non-null   int64  
 3   price                 5005 non-null   float64
 4   detailed_description  5005 non-null   object 
 5   header_image          5005 non-null   object 
 6   windows               5005 non-null   bool   
 7   mac                   5005 non-null   bool   
 8   linux                 5005 non-null   bool   
 9   supported_languages   5005 non-null   object 
 10  categories_x          5005 non-null   int64  
 11  genres_x              5005 non-null   int64  
 12  publishers            5005 non-null   int64  
 13  tags                  5005 non-null   int64  
 14  genres_y

In [None]:
# Step 2: Train our models
def train_models(features_df, df):
    print("\nTraining our models...")
    
    # Prepare our data
    X = features_df  # All the features we created
    y = df['AppID']  # The game ID (what we want to predict)
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    print(f"Training with # Game Recommendation System - Phase 2
# This code helps recommend games based on a Steam game dataset

# Import the tools we need
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
# We'll skip visualization libraries
# import matplotlib.pyplot as plt
# import seaborn as sns

# Step 1: Prepare our data
def prepare_data(df):
    """Build the numeric feature table used for training and similarity search.

    Genres and categories are converted to bag-of-words counts, the price is
    bucketed into ranges and one-hot encoded, and the three platform flags are
    appended as 0/1 columns.

    Side effects: ``df`` is modified in place -- missing prices are replaced
    with 0 and a categorical ``price_range`` column is added.

    Args:
        df: Game table with the columns ``genres_y``, ``categories_y``,
            ``price``, ``windows``, ``mac`` and ``linux``.

    Returns:
        A tuple ``(features_df, genre_vectorizer, category_vectorizer,
        price_encoder)``. ``features_df`` has one row per row of ``df`` and
        carries the same index; the other items are the fitted transformers.
    """
    print("Preparing our data...")

    # NOTE: every intermediate frame is built with index=df.index. pd.concat
    # with axis=1 aligns rows by index label, so a frame with a fresh
    # RangeIndex would be misaligned (and padded with NaN) whenever df's
    # index is not 0..n-1, e.g. after rows were dropped.

    # Convert genres into word-count features
    genre_vectorizer = CountVectorizer(max_features=50)
    genre_counts = genre_vectorizer.fit_transform(df['genres_y'])
    genre_df = pd.DataFrame(
        genre_counts.toarray(),
        columns=[f'genre_{g}' for g in genre_vectorizer.get_feature_names_out()],
        index=df.index,
    )

    # Convert categories into word-count features
    category_vectorizer = CountVectorizer(max_features=50)
    category_counts = category_vectorizer.fit_transform(df['categories_y'])
    category_df = pd.DataFrame(
        category_counts.toarray(),
        columns=[f'cat_{c}' for c in category_vectorizer.get_feature_names_out()],
        index=df.index,
    )

    # Bucket the price (games with similar prices might be similar).
    # Missing prices are treated as 0.
    df['price'] = df['price'].fillna(0)
    # include_lowest=True makes the first bin [0, 5] instead of (0, 5], so
    # free games (price 0) are labelled 'free_to_cheap' rather than falling
    # outside every bin.
    df['price_range'] = pd.cut(
        df['price'],
        bins=[0, 5, 10, 20, 100],
        labels=['free_to_cheap', 'budget', 'standard', 'premium'],
        include_lowest=True,
    )

    # Prices outside the bins (negative or above 100) come back as NaN from
    # pd.cut; give them an explicit 'other' category.
    df['price_range'] = df['price_range'].cat.add_categories(['other'])
    df['price_range'] = df['price_range'].fillna('other')

    price_encoder = OneHotEncoder(sparse_output=False)
    price_matrix = price_encoder.fit_transform(df[['price_range']])
    price_df = pd.DataFrame(
        price_matrix,
        columns=[f"price_{cat}" for cat in price_encoder.categories_[0]],
        index=df.index,
    )

    # Platform flags as 0/1 so the combined table is purely numeric
    # (mixing bool and numeric columns yields object-dtype rows).
    platform_df = df[['windows', 'mac', 'linux']].astype(int)

    # Combine all features column-wise
    features_df = pd.concat([genre_df, category_df, price_df, platform_df], axis=1)

    print(f"Created {features_df.shape[1]} features for each game!")

    return features_df, genre_vectorizer, category_vectorizer, price_encoder

# Step 2: Train our models
def train_models(features_df, df):
    """Train four classifiers to predict ``AppID`` and return the most accurate.

    The feature table is split 70/30 (split seeded with 42). A decision tree
    is fitted on the raw features; a neural network, k-nearest-neighbours and
    a linear SVM are fitted on standardized features. Accuracy and weighted
    precision/recall/F1 on the held-out part are printed for all four.

    Args:
        features_df: Feature table, one row per game.
        df: Game table providing the ``AppID`` target column.

    Returns:
        ``(best_model, best_model_name, scaler)``. On a tie the first model in
        the order tree, network, KNN, SVM wins. ``scaler`` is the fitted
        ``StandardScaler``; inputs must be passed through it before calling
        ``best_model`` unless the winner is the decision tree, which was
        trained on unscaled features.
    """
    # Imported here because the import block that accompanies this version of
    # the script does not bring in the tree and neural-network estimators, so
    # running it on a fresh interpreter would otherwise raise NameError.
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.neural_network import MLPClassifier
    from sklearn.tree import DecisionTreeClassifier

    print("\nTraining our models...")

    # Prepare our data
    X = features_df  # All the features we created
    y = df['AppID']  # The game ID (what we want to predict)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    print(f"Training with {X_train.shape[0]} games, testing with {X_test.shape[0]} games")

    # Standardize features for the distance/gradient based models. The scaler
    # is fitted on the training part only, then applied to the test part.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train our first model - Decision Tree (raw features; depth is capped)
    print("Training Decision Tree...")
    tree_model = DecisionTreeClassifier(max_depth=10)
    tree_model.fit(X_train, y_train)

    # Train our second model - Neural Network (single hidden layer)
    print("Training Neural Network...")
    nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, early_stopping=True)
    nn_model.fit(X_train_scaled, y_train)

    # Train our third model - K-Nearest Neighbors
    print("Training K-Nearest Neighbors...")
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train_scaled, y_train)

    # Train our fourth model - Support Vector Machine (linear kernel)
    print("Training Support Vector Machine...")
    svm_model = SVC(kernel='linear', probability=True)
    svm_model.fit(X_train_scaled, y_train)

    # Insertion order below fixes both the print order and the tie-break
    models = {
        "Decision Tree": tree_model,
        "Neural Network": nn_model,
        "K-Nearest Neighbors": knn_model,
        "Support Vector Machine": svm_model,
    }

    # Each model is evaluated on the same representation it was trained on
    predictions = {
        "Decision Tree": tree_model.predict(X_test),
        "Neural Network": nn_model.predict(X_test_scaled),
        "K-Nearest Neighbors": knn_model.predict(X_test_scaled),
        "Support Vector Machine": svm_model.predict(X_test_scaled),
    }
    accuracies = {name: accuracy_score(y_test, pred) for name, pred in predictions.items()}

    # Detailed comparison of all models
    print("\n===== MODEL COMPARISON =====")
    for name, accuracy in accuracies.items():
        print(f"{name} Accuracy: {accuracy:.4f}")

    # Pick the winner once; everything below refers to this choice
    best_model_name = max(accuracies, key=accuracies.get)
    best_accuracy = accuracies[best_model_name]
    best_model = models[best_model_name]

    print("\nPerformance Comparison:")
    for name, accuracy in accuracies.items():
        if name != best_model_name:
            diff = best_accuracy - accuracy
            print(f"The {best_model_name} is better than {name} by {diff:.4f} points")

    # Weighted precision / recall / F1 per model (the support value is dropped)
    prf = {
        name: precision_recall_fscore_support(
            y_test, pred, average='weighted', zero_division=0
        )[:3]
        for name, pred in predictions.items()
    }

    table_rows = [
        ("Accuracy", [accuracies[name] for name in models]),
        ("Precision", [prf[name][0] for name in models]),
        ("Recall", [prf[name][1] for name in models]),
        ("F1 Score", [prf[name][2] for name in models]),
    ]

    print("\nDetailed Metrics:")
    print(f"{'Metric':<15} {'Decision Tree':<15} {'Neural Network':<15} {'KNN':<15} {'SVM':<15}")
    print("-" * 75)
    for label, values in table_rows:
        print(f"{label:<15} " + " ".join(f"{value:<15.4f}" for value in values))

    print("\nWhat These Metrics Mean:")
    print("- Accuracy: How often the model is correct overall")
    print("- Precision: How accurate the model's positive predictions are")
    print("- Recall: How good the model is at finding all positive cases")
    print("- F1 Score: Balance between precision and recall")

    print("\nAlgorithm Strengths:")
    print("- Decision Tree: Easy to understand, handles mixed data types well")
    print("- Neural Network: Great at finding complex patterns, handles large feature sets")
    print("- K-Nearest Neighbors: Simple but effective, works well for similar items")
    print("- Support Vector Machine: Excellent for classification, handles high-dimensional data")

    print(f"\nCONCLUSION: The {best_model_name} performs best overall for this task.")

    print(f"\nThe {best_model_name} did best with an accuracy of {best_accuracy:.4f}!")

    # The scaler is returned so callers can transform inputs for the best model
    return best_model, best_model_name, scaler

# Step 3: Create a function to find similar games
def find_similar_games(game_id, features_df, df, top_n=3):
    """Return the games whose feature vectors are closest to ``game_id``.

    Closeness is cosine similarity (dot product divided by the product of the
    vector lengths). A pair involving an all-zero vector gets similarity 0.

    Rows of ``features_df`` and ``df`` are matched by position, so the result
    does not depend on what index labels either frame carries.

    Args:
        game_id: ``AppID`` of the reference game.
        features_df: Numeric feature table, one row per row of ``df``.
        df: Game table with ``AppID`` and ``name`` columns.
        top_n: How many neighbours to return.

    Returns:
        A list of ``(AppID, similarity)`` tuples, most similar first. Ties keep
        the row order of ``df``. Rows whose ``AppID`` equals ``game_id`` are
        never returned.

    Raises:
        IndexError: If no row of ``df`` has ``AppID == game_id``.
        ValueError: If ``features_df`` and ``df`` have different row counts.
    """
    app_ids = df['AppID'].to_numpy()
    matches = np.flatnonzero(app_ids == game_id)
    if matches.size == 0:
        raise IndexError(f"No game with AppID {game_id!r} in the dataset")
    target_pos = matches[0]

    print(f"\nFinding similar games to {df['name'].to_numpy()[target_pos]}...")

    # Work on a plain float matrix: this sidesteps index labels entirely and
    # converts bool/object columns to numbers once, up front.
    matrix = features_df.to_numpy(dtype=float)
    if matrix.shape[0] != app_ids.shape[0]:
        raise ValueError(
            f"features_df has {matrix.shape[0]} rows but df has {app_ids.shape[0]}"
        )

    # Cosine similarity of every row against the target row in one pass
    target_vector = matrix[target_pos]
    dot_products = matrix @ target_vector
    magnitudes = np.linalg.norm(matrix, axis=1)
    denominators = magnitudes * magnitudes[target_pos]
    # Leave similarity at 0 wherever a zero-length vector is involved
    similarities = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators != 0,
    )

    # Drop the reference game itself (and any duplicate rows of it)
    others = app_ids != game_id
    other_ids = app_ids[others]
    other_scores = similarities[others]

    # Stable sort on the negated score = descending order, ties in row order
    ranking = np.argsort(-other_scores, kind='stable')[:top_n]

    return [(other_ids[i], float(other_scores[i])) for i in ranking]

# Step 4: Make our recommendation function
def recommend_games(model, model_name, features_df, df, image_url, vectorizers, scaler=None):
    """Look up a game by its header image URL and print similar games.

    The trained ``model`` and ``scaler`` are not consulted: the game is
    identified by an exact match on ``header_image``. When nothing matches, a
    random row of ``df`` is used instead so the demo can continue.

    Args:
        model: Trained classifier (unused here).
        model_name: Display name of ``model`` (unused here).
        features_df: Feature table passed on to ``find_similar_games``.
        df: Game table with ``header_image``, ``name``, ``AppID``, ``price``
            and ``genres_y`` columns.
        image_url: Header image URL of the game to start from.
        vectorizers: 3-tuple of the fitted genre vectorizer, category
            vectorizer and price encoder.
        scaler: Optional fitted feature scaler (unused here).

    Returns:
        List of the recommended games' ``AppID`` values, most similar first.
    """
    print("\nStarting the recommendation process...")
    # Unpacked only to check the tuple shape; the transformers are not used yet
    _genre_vec, _category_vec, _price_enc = vectorizers

    # A real system would download the image, extract features from it and
    # ask the model which game it is. Here the URL is simply matched.
    matched = df[df['header_image'] == image_url]

    if not matched.empty:
        print(f"I found your game! It's {matched['name'].values[0]}")
    else:
        print("Sorry, I couldn't find a game with that image URL.")
        # Fall back to a random game for testing purposes
        matched = df.sample(1)
        print(f"Let's pretend we found: {matched['name'].values[0]}")

    seed_id = matched['AppID'].values[0]
    neighbours = find_similar_games(seed_id, features_df, df)

    # Show each recommendation with a few details
    print("\nYou might also like these games:")
    for neighbour_id, similarity in neighbours:
        details = df[df['AppID'] == neighbour_id]
        title = details['name'].values[0]
        price = details['price'].values[0]
        genres = details['genres_y'].values[0]
        image = details['header_image'].values[0]
        print(f"- {title} (Similarity: {similarity:.2f})")
        print(f"  Price: ${price}")
        print(f"  Genres: {genres}")
        print(f"  Image: {image}")
        print()

    return [neighbour_id for neighbour_id, _ in neighbours]

# Put it all together
def main():
    """Run the demo end to end: load data, build features, train, recommend.

    Downloads the dataset, drops incomplete rows, builds the feature table,
    trains the candidate models and prints recommendations for one randomly
    chosen game. Everything is reported via ``print``; nothing is returned.
    """
    print("Welcome to the Game Recommendation System!")

    # Load the dataset from GitHub (the file is read as CSV despite its name)
    url = "https://raw.githubusercontent.com/AljawharahAlotaibi/swe485/main/Dataset/updated_cleaned_games.xls"
    df = pd.read_csv(url)

    # Drop rows with missing values, then renumber the rows 0..n-1.
    # Without the reset, dropped rows leave gaps in the index, and the later
    # steps that combine or look up rows by position would be misaligned.
    df = df.dropna().reset_index(drop=True)

    # Show basic info about the dataset
    print("Dataset information:")
    df.info()

    # Prepare our data
    features_df, genre_vectorizer, category_vectorizer, price_encoder = prepare_data(df)
    vectorizers = (genre_vectorizer, category_vectorizer, price_encoder)

    # Train our models; the scaler comes back alongside the best model
    best_model, model_name, scaler = train_models(features_df, df)

    # Test our recommendation system
    print("\nLet's test our recommendation system!")

    # Pick a random image URL from our dataset
    random_game = df.sample(1)
    image_url = random_game['header_image'].values[0]
    print(f"Testing with image: {image_url}")

    # Print recommendations; the returned ID list is not needed here
    recommend_games(best_model, model_name, features_df, df, image_url, vectorizers, scaler)

    print("All done! Our recommendation system is working!")

# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()