In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import gzip
# import ast

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.svm import LinearSVC

import time

In [6]:
# Reference: https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data
steam_game_metadata = "data/steam_games.json.gz" 
steam_reviews = "data/steam_reviews.json.gz" # https://cseweb.ucsd.edu/~wckang/steam_reviews.json.gz
steam_bundles = "data/bundle_data.json.gz"

# Helper function to read json in gz
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

# Load into pandas
def load_to_pandas(filepath):
    start_time = time.time()
    data = []
    for d in readGz(filepath):
        data.append(d)
    end_time = time.time()
    print(f"Loaded {len(data)} records from {filepath}")
    print(f"Elapsed time: {end_time - start_time:.2f} seconds")
    print(f"Sample record:\n{data[0]}")
    return pd.DataFrame(data)

In [7]:
metadata_df = load_to_pandas(steam_game_metadata)

Loaded 32135 records from data/steam_games.json.gz
Elapsed time: 1.70 seconds
Sample record:
{'publisher': 'Kotoshiro', 'genres': ['Action', 'Casual', 'Indie', 'Simulation', 'Strategy'], 'app_name': 'Lost Summoner Kitty', 'title': 'Lost Summoner Kitty', 'url': 'http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/', 'release_date': '2018-01-04', 'tags': ['Strategy', 'Action', 'Indie', 'Casual', 'Simulation'], 'discount_price': 4.49, 'reviews_url': 'http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1', 'specs': ['Single-player'], 'price': 4.99, 'early_access': False, 'id': '761140', 'developer': 'Kotoshiro'}


In [23]:
metadata_df = metadata_df.rename(columns={'id': 'product_id'})

metadata_df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,product_id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,


In [None]:
reviews_df = load_to_pandas(steam_reviews)

Loaded 7793069 records from data/steam_reviews.json.gz
Elapsed time: 236.83 seconds
Sample record:
{'username': 'Chaos Syren', 'hours': 0.1, 'products': 41, 'product_id': '725280', 'page_order': 0, 'date': '2017-12-17', 'text': 'This would not be acceptable as an entertainment even back in the day when these graphics were all there was to be had. No effort has been made to bring the player into any story or even entertain.', 'early_access': False, 'page': 1}


In [20]:
reviews_df.head()

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id
0,Chaos Syren,0.1,41.0,725280,0,2017-12-17,This would not be acceptable as an entertainme...,False,1,,,
1,₮ʜᴇ Wᴀʀᴛᴏɴ,51.1,769.0,328100,0,2017-12-27,looks like a facebook game,False,1,,,
2,hello?<,14.6,2.0,328100,1,2017-10-16,Better than Minecraft,False,1,2.0,Product received for free,
3,Cyderine916,5.0,64.0,35140,0,2018-01-04,I love and idolized Batman and this game is Ma...,False,1,,,
4,DarklyThinking,16.6,577.0,35140,1,2018-01-04,Still worth playing in 2018.\nProbably my favo...,False,1,,,7.656119800748307e+16


In [28]:
df = reviews_df.merge(metadata_df[["product_id", "genres"]], on="product_id", how="inner")
df

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id,genres
0,Chaos Syren,0.1,41.0,725280,0,2017-12-17,This would not be acceptable as an entertainme...,False,1,,,,"[Action, Adventure, Indie, Simulation]"
1,₮ʜᴇ Wᴀʀᴛᴏɴ,51.1,769.0,328100,0,2017-12-27,looks like a facebook game,False,1,,,,"[Adventure, Indie, RPG]"
2,hello?<,14.6,2.0,328100,1,2017-10-16,Better than Minecraft,False,1,2.0,Product received for free,,"[Adventure, Indie, RPG]"
3,Cyderine916,5.0,64.0,35140,0,2018-01-04,I love and idolized Batman and this game is Ma...,False,1,,,,"[Action, Adventure]"
4,DarklyThinking,16.6,577.0,35140,1,2018-01-04,Still worth playing in 2018.\nProbably my favo...,False,1,,,76561198007483075,"[Action, Adventure]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799533,Wildman_,174.3,208.0,252490,5,2013-12-11,A really fun game. There's always something to...,True,10221,,,,"[Action, Adventure, Indie, Massively Multiplay..."
7799534,Stony,1215.2,73.0,252490,6,2013-12-11,really fun and addictive game to play,True,10221,,,76561198089897928,"[Action, Adventure, Indie, Massively Multiplay..."
7799535,Deez Knees,50.5,288.0,252490,7,2013-12-11,gr8 game 10/10 wud buy agen,True,10221,,,76561198048207033,"[Action, Adventure, Indie, Massively Multiplay..."
7799536,Vidaar,783.5,353.0,252490,8,2013-12-11,Summary: Rust is a survival game created by Fa...,True,10221,,,,"[Action, Adventure, Indie, Massively Multiplay..."


In [30]:
df_cleaned = df.dropna(subset=['genres'])

In [49]:
# https://www.kdnuggets.com/2023/01/encoding-categorical-features-multilabelbinarizer.html
# MultiLabelBinarizer for multi-label genre classification
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_cleaned["genres"])

genre_names = mlb.classes_
print(f"Number of unique genres: {len(mlb.classes_)}")
print(f"\nGenres: {genre_names}")
print(f"\nY shape: {Y.shape}")
print(f"\nSample binary matrix (first 5 rows):")
print(Y[:5])

Number of unique genres: 22

Genres: ['Accounting' 'Action' 'Adventure' 'Animation &amp; Modeling'
 'Audio Production' 'Casual' 'Design &amp; Illustration' 'Early Access'
 'Education' 'Free to Play' 'Indie' 'Massively Multiplayer'
 'Photo Editing' 'RPG' 'Racing' 'Simulation' 'Software Training' 'Sports'
 'Strategy' 'Utilities' 'Video Production' 'Web Publishing']

Y shape: (7759867, 22)

Sample binary matrix (first 5 rows):
[[0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [50]:
X = df_cleaned["text"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f"Training set size:  {X_train.shape[0]} samples")
print(f"Test set size:      {X_test.shape[0]} samples")

Training set size:  6207893 samples
Test set size:      1551974 samples


In [45]:
df_sample = df_cleaned.sample(n=100000, random_state=42)
Y_sample = mlb.transform(df_sample["genres"])

X_train, X_test, Y_train, Y_test = train_test_split(df_sample["text"], Y_sample, test_size=0.2, random_state=42)

In [52]:
# Given k classes, we will train k binary classifiers (One-vs-Rest) --> Week 7 255R
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=50000,
        ngram_range=(1,1),
        min_df=5,
        stop_words="english"
    )),
    ("clf", OneVsRestClassifier(
        LogisticRegression(solver='saga', max_iter=200, n_jobs=-1), 
        n_jobs=-1
    ))
])

In [53]:
# Train the model
pipeline.fit(X_train, Y_train)
Y_pred = pipeline.predict(X_test)

In [56]:
#classification_report=classification_report(Y_test, Y_pred, target_names=genre_names)
print("Classification Report:")
#print(classification_report)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.4f}")

Classification Report:
Accuracy: 0.2950


In [37]:
genre_freq = {}

for genres in metadata_df["genres"]:
    if isinstance(genres, list):
        for g in genres:
            if g not in genre_freq:
                genre_freq[g] = 0
            genre_freq[g] += 1
# Sort by frequency (highest first)
most_common = sorted(genre_freq.items(), key=lambda x: x[1], reverse=True)
most_common_genre = most_common[0][0]

print("Most common genre:", most_common_genre)
print(f"Frequency: {(genre_freq[most_common_genre]/len(metadata_df)*100):.1f}%")

Most common genre: Indie
Frequency: 49.3%


In [None]:
# Used in assignment 1

# for l in readGz("test_Category.json.gz"):
#   cat = catDict['fantasy_paranormal'] # If there's no evidence, just choose the most common category in the dataset
#   words = l['review_text'].lower()
#   if 'children' in words:
#     cat = catDict['children']
#   if 'comic' in words:
#     cat = catDict['comics_graphic']
#   if 'fantasy' in words:
#     cat = catDict['fantasy_paranormal']
#   if 'mystery' in words:
#     cat = catDict['mystery_thriller_crime']
#   if 'love' in words:
#     cat = catDict['young_adult']
#   predictions.write(l['user_id'] + ',' + l['review_id'] + "," + str(cat) + "\n")

