# Top 1500 Steam Sales (2024)

In this notebook, we're going to try to predict the publisherClass of every game using only the revenue, price, and average playtime.

First off, libraries as usual

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import check_random_state
import warnings

# for generating synthetic data for minitority classes
from imblearn.over_sampling import SMOTE

# ignore warnings
warnings.filterwarnings("ignore")



Let's set a seed first for reproducibility

In [2]:
def set_seed(seed: int):
    np.random.seed(seed)
    check_random_state(seed)

# Set seed
rng_seed = 0
set_seed(rng_seed)

Load dataset

In [3]:
steam_revenue = pd.read_csv('steam_revenue_cleaned.csv')

Confirm that the data loaded correctly:

In [4]:
steam_revenue.head()

Unnamed: 0,name,releaseDate,copiesSold,price,revenue,avgPlaytime,reviewScore,publisherClass,publishers,developers,steamId
0,WWE 2K24,07-03-2024,165301,100.0,8055097.0,42.4,71.0,AAA,2K,Visual Concepts,2315690
1,EARTH DEFENSE FORCE 6,25-07-2024,159806,60.0,7882151.0,29.7,57.0,Indie,D3PUBLISHER,SANDLOT,2291060
2,Sins of a Solar Empire II,15-08-2024,214192,50.0,7815247.0,12.5,88.0,Indie,Stardock Entertainment,"Ironclad Games Corporation,Stardock Entertainment",1575940
3,Legend of Mortal,14-06-2024,440998,20.0,7756399.0,24.8,76.0,Indie,"Paras Games,Obb Studio Inc.",Obb Studio Inc.,1859910
4,Shin Megami Tensei V: Vengeance,13-06-2024,141306,60.0,7629252.0,34.3,96.0,AA,SEGA,ATLUS,1875830


Prepare columns (input X and output y)

In [5]:
# Extracting features and the target variable
X = steam_revenue[['price', 'revenue', 'avgPlaytime']]
y = steam_revenue['publisherClass']

Do train / test split - Note that there's no attempt to account for minority classes here (where we have about 1300 Indie, 150 AA, and 50 AAA), which will **absolutely** mess up recall!

In [6]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng_seed)

Put all models into one giant function to call (with relevant train / test tuples)

In [7]:
## Make big function returning fitted models
def train_models(train_tuple, train_tuple_scaled, 
                 test_tuple, test_tuple_scaled):
    
    train_dat_x, train_dat_y = train_tuple
    train_dat_scaled_x, train_dat_scaled_y = train_tuple_scaled
    test_dat_x, test_dat_y = test_tuple
    test_dat_scaled_x, test_dat_scaled_y = test_tuple_scaled
    
    model_types = ['log_reg', 'naive_bayes', 'rf', 'svc', 'knn', 'ens']
    model = { elem : None for elem in model_types }
    model_test_preds = { elem : None for elem in model_types }
    model_test_acc = { elem : None for elem in model_types }
    model_classif_report = { elem : None for elem in model_types }
    
    # log reg and gaussian nb
    for key in ['log_reg','naive_bayes']:
        model[key] = LogisticRegression() if key == 'log_reg' else GaussianNB()
        model[key].fit(train_dat_x, train_dat_y)
        model_test_preds[key] = model[key].predict(test_dat_x)
        model_test_acc[key] = accuracy_score(test_dat_y, model_test_preds[key])
        model_classif_report[key] = classification_report(test_dat_y, model_test_preds[key])
        
    
    # rf
    model['rf'] = RandomForestClassifier(random_state=rng_seed)
    model['rf'].fit(train_dat_x, train_dat_y)
    model_test_preds['rf'] = model['rf'].predict(test_dat_x)
    model_test_acc['rf'] = accuracy_score(test_dat_y, model_test_preds['rf'])
    model_classif_report['rf'] = classification_report(test_dat_y, model_test_preds['rf'])
    
    # svc
    model['svc'] = SVC(kernel='linear', probability=True)
    model['svc'].fit(train_dat_scaled_x, train_dat_scaled_y)
    model_test_preds['svc'] = model['svc'].predict(test_dat_scaled_x)
    model_test_acc['svc'] = accuracy_score(test_dat_scaled_y, model_test_preds['svc'])
    model_classif_report['svc'] = classification_report(test_dat_scaled_y, model_test_preds['svc'])
    
    # knn
    model['knn'] = KNeighborsClassifier(n_neighbors=5)
    model['knn'].fit(train_dat_scaled_x, train_dat_scaled_y)
    model_test_preds['knn'] = model['knn'].predict(test_dat_scaled_x)
    model_test_acc['knn'] = accuracy_score(test_dat_scaled_y, model_test_preds['knn'])
    model_classif_report['knn'] = classification_report(test_dat_scaled_y, model_test_preds['knn'])
    
    # ens
    model['ens'] = VotingClassifier(estimators=[
        ('rf',  model['rf']), 
        ('svc', model['svc']), 
        ('knn', model['knn'])], 
        voting='soft')  # Soft voting allows averaging probabilities
    model['ens'].fit(train_dat_scaled_x, train_dat_scaled_y)
    model_test_preds['ens'] = model['ens'].predict(test_dat_scaled_x)
    model_test_acc['ens'] = accuracy_score(test_dat_scaled_y, model_test_preds['ens'])
    model_classif_report['ens'] = classification_report(test_dat_scaled_y, model_test_preds['ens'])
    
    return model, model_test_preds, model_test_acc, model_classif_report

Standardize original data

In [8]:
# Standardizing the features for models that benefit from normalization (SVC, KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Make a tuple of the train / scaled train / test / scaled test sets (scaled for some of the ML models)

In [9]:
train_tuple = (X_train, y_train)
train_tuple_scaled = (X_train_scaled, y_train)
test_tuple = (X_test, y_test)
test_tuple_scaled = (X_test_scaled, y_test)

Train models with original dataset (expecting bad recall for AAA, possibly AA)

In [14]:
models, model_test_preds, model_test_acc, model_classif_report = \
        train_models(train_tuple,
                     train_tuple_scaled,
                     test_tuple,
                     test_tuple_scaled)

Now make a similar set of tuples but for the SMOTE-created dataset:

In [10]:
#init smote
smote = SMOTE(random_state=rng_seed)
# generate additional samples of minotrity classes from SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print( X_train.shape,'--- SMOTE INCREASED -->',X_train_resampled.shape )

# standardize for SMOTE
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# make tuples
train_tuple_s = (X_train_resampled, y_train_resampled)
train_tuple_scaled_s = (X_train_scaled, y_train_resampled)
test_tuple_s = (X_test, y_test)
test_tuple_scaled_s = (X_test_scaled, y_test)

(1199, 3) --- SMOTE INCREASED --> (3126, 3)


Now train models again with SMOTE data

In [15]:
models_s, model_test_preds_s, model_test_acc_s, model_classif_report_s = \
        train_models(train_tuple_s,
                     train_tuple_scaled_s,
                     test_tuple_s,
                     test_tuple_scaled_s)

In [16]:
for key in models:
    
    print(f'For model {key}:')
    print(model_test_acc[key])
    print(model_classif_report[key])
    
    print('++++++++++++++++++++++++++++')
    
    print(f'For SMOTE model {key}:')
    print(model_test_acc_s[key])
    print(model_classif_report_s[key])
    
    print('============================')

For model log_reg:
0.87
              precision    recall  f1-score   support

          AA       0.57      0.15      0.24        27
         AAA       0.00      0.00      0.00        14
       Indie       0.88      0.99      0.93       259

    accuracy                           0.87       300
   macro avg       0.48      0.38      0.39       300
weighted avg       0.81      0.87      0.83       300

++++++++++++++++++++++++++++
For SMOTE model log_reg:
0.04666666666666667
              precision    recall  f1-score   support

          AA       0.00      0.00      0.00        27
         AAA       0.05      1.00      0.09        14
       Indie       0.00      0.00      0.00       259

    accuracy                           0.05       300
   macro avg       0.02      0.33      0.03       300
weighted avg       0.00      0.05      0.00       300

For model naive_bayes:
0.8566666666666667
              precision    recall  f1-score   support

          AA       0.33      0.15      0.21