<a href="https://colab.research.google.com/github/ipez02/csci164/blob/main/anime_gaming_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Anime & Gaming Regression Project (scikit-learn)

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [3]:

anime = pd.read_csv('anime.csv')
games = pd.read_csv('vgsales.csv')
print("Anime dataset loaded:", anime.shape)
print("Gaming dataset loaded:", games.shape)


Anime dataset loaded: (12294, 7)
Gaming dataset loaded: (16598, 11)


In [4]:

def preprocess_anime(df):
    df = df.copy()
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df = df.dropna(subset=['rating'])
    df = df[df['rating'] > 0]
    df['genre'] = df['genre'].fillna('Unknown')
    df['type'] = df['type'].fillna('Unknown')
    df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
    df = df.dropna(subset=['episodes', 'members'])
    X = df[['genre', 'type', 'episodes', 'members']]
    y = df['rating']
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), ['episodes', 'members']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['genre', 'type'])
    ])
    return X, y, preprocessor


In [5]:

def preprocess_games(df):
    df = df.copy()
    df = df.dropna(subset=['Global_Sales'])
    df = df[df['Global_Sales'] > 0]

    # Check available columns
    available_features = ['Genre', 'Platform', 'Year_of_Release', 'Critic_Score']
    features = [f for f in available_features if f in df.columns]

    X = df[features]
    y = df['Global_Sales']

    cat_features = [f for f in features if df[f].dtype == 'object']
    num_features = [f for f in features if df[f].dtype in ['float64', 'int64']]

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

    return X, y, preprocessor



In [6]:

def train_models(X, y, preprocessor):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models = {
        'LinearRegression': LinearRegression(),
        'RandomForest': RandomForestRegressor(random_state=42),
        'MLPRegressor': MLPRegressor(max_iter=500, random_state=42)
    }
    results = {}
    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        results[name] = {
            'MAE': mean_absolute_error(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
            'R2': r2_score(y_test, y_pred)
        }
        print(f"Model: {name}")
        print(f"MAE: {results[name]['MAE']:.3f}, RMSE: {results[name]['RMSE']:.3f}, R2: {results[name]['R2']:.3f}\n")
    return results


In [7]:

X_anime, y_anime, preprocessor_anime = preprocess_anime(anime)
anime_results = train_models(X_anime, y_anime, preprocessor_anime)

X_games, y_games, preprocessor_games = preprocess_games(games)
gaming_results = train_models(X_games, y_games, preprocessor_games)

print("\nFinal Results Summary")
print("Anime Dataset:", anime_results)
print("Gaming Dataset:", gaming_results)


Model: LinearRegression
MAE: 0.629, RMSE: 0.828, R2: 0.333

Model: RandomForest
MAE: 0.499, RMSE: 0.683, R2: 0.546

Model: MLPRegressor
MAE: 0.594, RMSE: 0.788, R2: 0.395

Model: LinearRegression
MAE: 0.592, RMSE: 2.037, R2: 0.012

Model: RandomForest
MAE: 0.590, RMSE: 2.042, R2: 0.008

Model: MLPRegressor
MAE: 0.589, RMSE: 2.042, R2: 0.008


Final Results Summary
Anime Dataset: {'LinearRegression': {'MAE': 0.6294333665373097, 'MSE': 0.6849946644815802, 'RMSE': np.float64(0.827644044551509), 'R2': 0.33254441949942437}, 'RandomForest': {'MAE': 0.49897950931725876, 'MSE': 0.4658339293521868, 'RMSE': np.float64(0.682520277612458), 'R2': 0.5460936094035973}, 'MLPRegressor': {'MAE': 0.5937578882874939, 'MSE': 0.6210175815750284, 'RMSE': np.float64(0.7880466874335735), 'R2': 0.3948833882889752}}
Gaming Dataset: {'LinearRegression': {'MAE': 0.5921588710724857, 'MSE': 4.149908671146241, 'RMSE': np.float64(2.0371324628374663), 'R2': 0.012247285783894779}, 'RandomForest': {'MAE': 0.5899342197550