In [2]:
import gzip
from sklearn import linear_model
import numpy as np
import pandas as pd
import math

paths = ["australian_users_items.json.gz", "australian_user_reviews.json.gz", "bundle_data.json.gz", "steam_games.json.gz", "steam_reviews.json.gz"]

dfs = {} 

for path in paths:
    data_list = []
    
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            try:
                record = eval(line)
                data_list.append(record)
            except SyntaxError:
                continue

    name = path.split(".")[0] 
    dfs[name] = pd.DataFrame(data_list)

australian_users_items = dfs['australian_users_items']
australian_user_reviews = dfs['australian_user_reviews']
bundle_data = dfs['bundle_data']
steam_games = dfs['steam_games']
steam_reviews = dfs['steam_reviews']

In [3]:
steam_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",1.49,http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS""",,
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",4.24,http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada,,
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",1.39,http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich,,
32133,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",,http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns",1 user reviews,


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

item_matrix = steam_games.copy()
item_matrix['tags'] = item_matrix['tags'].apply(lambda x: x if isinstance(x, list) else [])
item_matrix['specs'] = item_matrix['specs'].apply(lambda x: x if isinstance(x, list) else [])
item_matrix['title'] = item_matrix.apply(lambda x: x['title'] if isinstance(x['title'], str) else x['app_name'], axis=1)

def clean_price(x):
    if isinstance(x, float):
        return x
    if isinstance(x, str):
        x_lower = x.strip().lower()
        if 'free' in x_lower:
            return 0.0
        try:
            return float(x)
        except ValueError:
            return np.nan
    return np.nan

item_matrix['price'] = item_matrix['price'].apply(clean_price)
item_matrix['discount_price'] = item_matrix['discount_price'].apply(clean_price)
item_matrix['discount_price'] = item_matrix['discount_price'].fillna(item_matrix['price'])

mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(item_matrix['tags'])
tag_columns = [f"tag_{c}" for c in mlb.classes_]
df_tags = pd.DataFrame(binary_matrix, columns=tag_columns, index=item_matrix.index)

mlb_specs = MultiLabelBinarizer()
binary_matrix_specs = mlb_specs.fit_transform(item_matrix['specs'])
spec_columns = [f"spec_{c}" for c in mlb_specs.classes_]
df_specs = pd.DataFrame(binary_matrix_specs, columns=spec_columns, index=item_matrix.index)

keep_cols = ['id', 'title', 'price', 'discount_price', 'release_date', 'developer', 'sentiment']
item_matrix = pd.concat([item_matrix[keep_cols], df_tags, df_specs], axis=1)

top_30 = item_matrix['developer'].value_counts()[:30].index
pattern = '|'.join([str(x) for x in top_30 if len(str(x)) > 0])
item_matrix['top_30_dev'] = item_matrix['developer'].str.contains(pattern, na=False, case=False).astype(int)

item_matrix = item_matrix.drop(columns=['title', 'developer'])
item_matrix['release_date'] = pd.to_datetime(item_matrix['release_date'], errors='coerce')

print(f"Shape: {item_matrix.shape}")
item_matrix.head()

Shape: (32135, 385)


  item_matrix['top_30_dev'] = item_matrix['developer'].str.contains(pattern, na=False, case=False).astype(int)


Unnamed: 0,id,price,discount_price,release_date,sentiment,tag_1980s,tag_1990's,tag_2.5D,tag_2D,tag_2D Fighter,...,spec_Steam Cloud,spec_Steam Leaderboards,spec_Steam Trading Cards,spec_Steam Turn Notifications,spec_Steam Workshop,spec_SteamVR Collectibles,spec_Tracked Motion Controllers,spec_Valve Anti-Cheat enabled,spec_Windows Mixed Reality,top_30_dev
0,761140,4.99,4.49,2018-01-04,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,643980,0.0,0.0,2018-01-04,Mostly Positive,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,670290,0.0,0.0,2017-07-24,Mostly Positive,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,767400,0.99,0.83,2017-12-07,,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,773570,2.99,1.79,NaT,,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [5]:
item_matrix.to_csv('item_matrix.csv')

In [21]:
item_matrix['release_date'].isna().sum()

np.int64(2067)

In [77]:
australian_users_items

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [27]:
user_matrix = australian_users_items.explode('items').reset_index(drop=True)

items_normalized = pd.json_normalize(user_matrix['items'])

user_matrix = pd.concat([user_matrix.drop('items', axis=1), items_normalized], axis=1)
print(user_matrix['playtime_forever'].sort_values().dropna()[-1000:].mean())
user_matrix = user_matrix[['steam_id','item_id','playtime_forever']]
user_matrix['y'] = pd.Series(user_matrix['playtime_forever'] >= 120.0).apply(int)
user_matrix = user_matrix[['steam_id','item_id','y']]

190173.782


In [28]:
(user_matrix['y'] == 1).sum() / user_matrix['y'].shape[0]

np.float64(0.38017471902886163)

In [29]:
user_matrix.to_csv("user_matrix.csv")

In [61]:
user_matrix[user_matrix['item_id'] == 10].shape[0]

0

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import pandas as pd


train_df, test_df = train_test_split(user_matrix, test_size=0.2, random_state=42)

train_positives = train_df[train_df['y'] == 1]
item_counts = train_positives['item_id'].value_counts()

THRESHOLD = 1500
popular_items_set = set(item_counts[item_counts > THRESHOLD].index)


test_item_ids = test_df['item_id'].values
test_labels = test_df['y'].values

y_pred_baseline = [1 if item in popular_items_set else 0 for item in test_item_ids]

acc = accuracy_score(test_labels, y_pred_baseline)
auc = roc_auc_score(test_labels, y_pred_baseline)

print("\n--- Baseline Model Performance ---")
print(f"Baseline Accuracy: {acc:.4f}")
print(f"Baseline ROC AUC:  {auc:.4f}")
print("\nClassification Report:")
print(classification_report(test_labels, y_pred_baseline))


--- Baseline Model Performance ---
Baseline Accuracy: 0.7086
Baseline ROC AUC:  0.6917

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.76    640686
           1       0.62      0.62      0.62    393317

    accuracy                           0.71   1034003
   macro avg       0.69      0.69      0.69   1034003
weighted avg       0.71      0.71      0.71   1034003

