In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import random
import json

# Function to train the binary predictor
def train_predictor(file_path):
    """
    Load the interactions CSV, build the preprocessing + Logistic Regression
    pipeline, and fit it on the whole dataset.

    Args:
        file_path: Location of the CSV file; it must provide the 'Interested'
            target column and the feature columns referenced below.

    Returns:
        A 5-tuple ``(model_pipeline, preprocessor, feature_names, X, y)``.
        When the file does not exist an error is printed and all five
        entries are ``None``.
    """
    try:
        interactions = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"ERROR: File not found at {file_path}. Please make sure 'augmented_games_interactions_final.csv' is in your Colab session.")
        return (None,) * 5

    # Data cleaning: discard rows lacking any required field, then make sure
    # the free-text columns contain strings only.
    required_columns = ['metascore', 'description', 'rating', 'platforms']
    interactions.dropna(subset=required_columns, inplace=True)
    for text_column in ('description', 'title'):
        interactions[text_column] = interactions[text_column].fillna('')

    # Separate the target from the feature frame.
    y = interactions['Interested']
    X = interactions.drop(columns='Interested')

    # Preprocessor definition: one transformer per feature family; columns
    # not listed here are dropped.
    scaler_step = ('num', StandardScaler(), ['metascore'])
    onehot_step = (
        'cat',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        ['UserID', 'rating', 'genres', 'platforms'],
    )
    # TfidfVectorizer expects a single column name (a string, not a list).
    tfidf_step = ('text', TfidfVectorizer(max_features=200), 'description')
    preprocessor = ColumnTransformer(
        transformers=[scaler_step, onehot_step, tfidf_step],
        remainder='drop',
    )

    # Pipeline definition (preprocessor + model).
    classifier = LogisticRegression(random_state=42, solver='liblinear')
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])

    print("Training the predictor...")
    model_pipeline.fit(X, y)
    print("Training complete.")

    # Names of the transformed columns, as reported by the fitted preprocessor.
    feature_names = model_pipeline.named_steps['preprocessor'].get_feature_names_out()

    return model_pipeline, preprocessor, feature_names, X, y

# Function for prediction and counterfactual suggestion
def predict_and_suggest_counterfactual(user_id, game_id, trained_pipeline, preprocessor, feature_names, X_data):
    """
    Predict a user's interest in a game and, for a negative prediction, build a
    textual counterfactual suggestion from the classifier's coefficients.

    Args:
        user_id: Identifier written into the game's feature row before
            prediction; must be convertible with ``int()``.
        game_id: Value looked up in ``X_data['GameID']``; must be convertible
            with ``int()``.
        trained_pipeline: Fitted pipeline exposing ``predict``,
            ``predict_proba`` and a ``'classifier'`` step with ``coef_``.
        preprocessor: Not used by this function; kept so existing callers
            keep working.
        feature_names: Transformed feature names aligned with the classifier
            coefficients. Names prefixed ``cat__genres_`` and ``text__`` are
            inspected.
        X_data: DataFrame with at least the 'GameID', 'title', 'metascore'
            and 'genres' columns, plus whatever the pipeline consumes.

    Returns:
        dict: ``{'Error': ...}`` when ``game_id`` is absent from ``X_data``.
        Otherwise a dict with 'UserID', 'GameID', 'GameTitle', 'Prediction',
        'Probability_Interested' and 'Counterfactual_Suggestion' (a string for
        a negative prediction, ``None`` otherwise).
    """
    matching_games = X_data[X_data['GameID'] == game_id]
    if matching_games.empty:
        return {'Error': f"GameID {game_id} not found in the training data."}

    # Reuse the first stored row for this game, swapping in the requested user.
    game_row = matching_games.iloc[0].copy()
    game_row['UserID'] = user_id
    input_data = pd.DataFrame([game_row])

    prediction = trained_pipeline.predict(input_data)[0]
    # Column 1 is read as P(Interested) -- assumes the classes are ordered [0, 1].
    probability = trained_pipeline.predict_proba(input_data)[0][1]

    result = {
        'UserID': int(user_id),
        'GameID': int(game_id),
        'GameTitle': input_data['title'].iloc[0],
        'Prediction': int(prediction),
        'Probability_Interested': round(float(probability), 4),
        'Counterfactual_Suggestion': None
    }

    # Counterfactuals are generated only for a NEGATIVE (0) prediction.
    if prediction != 0:
        return result

    coef_df = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': trained_pipeline['classifier'].coef_[0]
    }).sort_values(by='Coefficient', ascending=False)

    # Suggestion 1: Numeric Change (Metascore/Quality)
    current_metascore = input_data['metascore'].iloc[0]
    target_metascore = np.percentile(X_data['metascore'].dropna(), 80)
    if current_metascore < target_metascore:
        quality_line = (
            f"1. **Improve Quality (Numeric):** The game's **Metascore** would need to increase significantly from its current value of **{int(current_metascore)}** to approximately **{int(target_metascore)}**.\n"
        )
    else:
        # Telling the user to "increase" a score that already meets the target
        # would be contradictory, so say so explicitly instead.
        quality_line = (
            f"1. **Improve Quality (Numeric):** The game's **Metascore** of **{int(current_metascore)}** is already at or above the 80th-percentile target of approximately **{int(target_metascore)}**, so no increase is suggested.\n"
        )

    # Suggestion 2: Categorical Change (Genre/Content)
    genre_prefix = 'cat__genres_'
    raw_genres = input_data['genres'].iloc[0]
    # A missing genre (NaN) is a float; treat it as "no genres" so the
    # substring test below cannot raise TypeError.
    current_genres = '' if pd.isna(raw_genres) else str(raw_genres)
    genre_coefs = coef_df[coef_df['Feature'].str.startswith(genre_prefix)]

    positive_genre_suggestions = []
    for feature, coefficient in zip(genre_coefs['Feature'], genre_coefs['Coefficient']):
        if coefficient <= 0:
            # Rows are sorted by descending coefficient: nothing positive remains.
            break
        genre_name = feature[len(genre_prefix):]
        if genre_name not in current_genres:
            positive_genre_suggestions.append(genre_name)
            if len(positive_genre_suggestions) >= 2:
                break

    genre_suggestion_text = ", ".join(positive_genre_suggestions) if positive_genre_suggestions else "No specific genre change needed."
    genre_line = (
        f"2. **Broaden Content (Categorical):** The game should **include or emphasize** content related to the highest positively correlated genres in the model, such as **{genre_suggestion_text}**.\n"
    )

    # Suggestion 3: Text Change (Description/Keywords)
    text_prefix = 'text__'
    text_coefs = coef_df[coef_df['Feature'].str.startswith(text_prefix)]
    # Only tokens with a positive weight push the prediction towards "Interested".
    positive_text = text_coefs[text_coefs['Coefficient'] > 0].head(3)
    if positive_text.empty:
        keyword_line = (
            "3. **Refine Marketing (Free Text):** The model found no positively weighted description keywords to recommend."
        )
    else:
        top_keywords = ", ".join(f"'{name[len(text_prefix):]}'" for name in positive_text['Feature'])
        keyword_line = (
            f"3. **Refine Marketing (Free Text):** The game's description should be revised to include high-impact, positive keywords like **{top_keywords}** to better match the user's inferred preferences."
        )

    # Construct the final suggestion
    result['Counterfactual_Suggestion'] = (
        f"The prediction for User {result['UserID']} and Game '{result['GameTitle']}' is **Negative (0)** "
        f"(P(Interested) = {result['Probability_Interested']}).\n"
        f"To achieve a positive prediction, the entity (game) would need to undergo the following *constrained counterfactual changes*:\n\n"
        + quality_line
        + genre_line
        + keyword_line
    )

    return result

# Example Usage

# Train once on the full CSV; every returned value is None if the file is missing.
FILE_PATH = 'augmented_games_interactions_final.csv'
pipeline, preprocessor, feature_names, X_train, y_train = train_predictor(FILE_PATH)

if pipeline is not None:
    # Draw one reproducible interaction labelled "not interested" (0) to demo on.
    sample_negative_interaction = X_train.loc[y_train == 0].sample(n=1, random_state=42)
    sample_user = sample_negative_interaction['UserID'].iat[0]
    sample_game = sample_negative_interaction['GameID'].iat[0]

    print(f"\n{'=' * 50}")
    print(f"Testing Prediction and Counterfactuals for UserID: {sample_user} and GameID: {sample_game}")
    print('=' * 50)

    prediction_result = predict_and_suggest_counterfactual(
        sample_user,
        sample_game,
        pipeline,
        preprocessor,
        feature_names,
        X_train,
    )

    # Pretty-print the result dictionary as JSON.
    print(json.dumps(prediction_result, indent=4))

Training the predictor...
Training complete.

Testing Prediction and Counterfactuals for UserID: 814 and GameID: 1300486997
{
    "UserID": 814,
    "GameID": 1300486997,
    "GameTitle": "Babylon's Fall",
    "Prediction": 0,
    "Probability_Interested": 0.1157,
    "Counterfactual_Suggestion": "The prediction for User 814 and Game 'Babylon's Fall' is **Negative (0)** (P(Interested) = 0.1157).\nTo achieve a positive prediction, the entity (game) would need to undergo the following *constrained counterfactual changes*:\n\n1. **Improve Quality (Numeric):** The game's **Metascore** would need to increase significantly from its current value of **41** to approximately **81**.\n2. **Broaden Content (Categorical):** The game should **include or emphasize** content related to the highest positively correlated genres in the model, such as **Sandbox, Trainer RPG**.\n3. **Refine Marketing (Free Text):** The game's description should be revised to include high-impact, positive keywords like **'