In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import gzip
# import ast

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, make_scorer, jaccard_score
from sklearn.svm import LinearSVC

import time

In [2]:
# Reference: https://cseweb.ucsd.edu/~jmcauley/datasets.html#steam_data
steam_game_metadata = "data/steam_games.json.gz" 
steam_reviews = "data/steam_reviews.json.gz" # https://cseweb.ucsd.edu/~wckang/steam_reviews.json.gz
steam_bundles = "data/bundle_data.json.gz"

# Helper function to read json in gz
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

# Load into pandas
def load_to_pandas(filepath):
    start_time = time.time()
    data = []
    for d in readGz(filepath):
        data.append(d)
    end_time = time.time()
    print(f"Loaded {len(data)} records from {filepath}")
    print(f"Elapsed time: {end_time - start_time:.2f} seconds")
    print(f"Sample record:\n{data[0]}")
    return pd.DataFrame(data)

In [3]:
metadata_df = load_to_pandas(steam_game_metadata)

Loaded 32135 records from data/steam_games.json.gz
Elapsed time: 1.46 seconds
Sample record:
{'publisher': 'Kotoshiro', 'genres': ['Action', 'Casual', 'Indie', 'Simulation', 'Strategy'], 'app_name': 'Lost Summoner Kitty', 'title': 'Lost Summoner Kitty', 'url': 'http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/', 'release_date': '2018-01-04', 'tags': ['Strategy', 'Action', 'Indie', 'Casual', 'Simulation'], 'discount_price': 4.49, 'reviews_url': 'http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1', 'specs': ['Single-player'], 'price': 4.99, 'early_access': False, 'id': '761140', 'developer': 'Kotoshiro'}


In [4]:
metadata_df = metadata_df.rename(columns={'id': 'product_id'})

metadata_df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,product_id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,,,


In [5]:
print(f"Columns: {metadata_df.columns.tolist()}")
print(f"\nShape: {metadata_df.shape}")
print(f"\nMissing values:\n{metadata_df.isna().sum()}")
print(f"\nNumber of Duplicates: {metadata_df['product_id'].duplicated().sum()}")
#metadata_df.info()

Columns: ['publisher', 'genres', 'app_name', 'title', 'url', 'release_date', 'tags', 'discount_price', 'reviews_url', 'specs', 'price', 'early_access', 'product_id', 'developer', 'sentiment', 'metascore']

Shape: (32135, 16)

Missing values:
publisher          8052
genres             3283
app_name              2
title              2050
url                   0
release_date       2067
tags                163
discount_price    31910
reviews_url           2
specs               670
price              1377
early_access          0
product_id            2
developer          3299
sentiment          7182
metascore         29458
dtype: int64

Number of Duplicates: 2


In [6]:
# Mark all duplicates
duplicates = metadata_df[metadata_df['product_id'].duplicated(keep=False)]
print(f"Total duplicate rows: {len(duplicates)}")
duplicates

Total duplicate rows: 4


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,product_id,developer,sentiment,metascore
74,,,,,http://store.steampowered.com/,,,14.99,,,19.99,False,,,,
13894,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",,http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games,Mostly Positive,86.0
14573,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",,http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games,Mostly Positive,86.0
30961,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,,"[Single-player, Steam Achievements, Steam Trad...",19.99,False,,"Rocksteady Studios,Feral Interactive (Mac)",Overwhelmingly Positive,91.0


In [7]:
# Explore unique genres in the dataset
unique_genres = set()

for row in metadata_df["genres"]:
    if isinstance(row, list):
        unique_genres.update(row)

unique_genres = sorted(unique_genres)
print(f"Total unique genres: {len(unique_genres)}")
print(unique_genres)

Total unique genres: 22
['Accounting', 'Action', 'Adventure', 'Animation &amp; Modeling', 'Audio Production', 'Casual', 'Design &amp; Illustration', 'Early Access', 'Education', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing']


In [24]:
genre_map = {
    # Keep gameplay genres
    "Action": "Action",
    "Adventure": "Adventure",
    "Casual": "Casual",
    "Indie": "Indie",
    "Massively Multiplayer": "Massively Multiplayer",
    "RPG": "RPG",
    "Racing": "Racing",
    "Simulation": "Simulation",
    "Sports": "Sports",
    "Strategy": "Strategy",

    # Meta-tags
    "Early Access": "Early Access",
    "Free to Play": "Free to Play",

    # Collapse other categories
    "Accounting": "Productivity/Software",
    "Animation &amp; Modeling": "Productivity/Software",
    "Audio Production": "Productivity/Software",
    "Design &amp; Illustration": "Productivity/Software",
    "Education": "Productivity/Software",
    "Photo Editing": "Productivity/Software",
    "Software Training": "Productivity/Software",
    "Utilities": "Productivity/Software",
    "Video Production": "Productivity/Software",
    "Web Publishing": "Productivity/Software",
}


In [25]:
def map_genres(genres):
    if not isinstance(genres, list):
        return []
    mapped = [genre_map[g] for g in genres if g in genre_map]
    return sorted(set(mapped))  # removes duplicates

metadata_df["genres_mapped"] = metadata_df["genres"].apply(map_genres)

In [26]:
# Confirm changes
unique_genres = set()

for row in metadata_df["genres_mapped"]:
    if isinstance(row, list):
        unique_genres.update(row)

unique_genres = sorted(unique_genres)
print(f"Total unique genres: {len(unique_genres)}")
print(unique_genres)

Total unique genres: 13
['Action', 'Adventure', 'Casual', 'Early Access', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Productivity/Software', 'RPG', 'Racing', 'Simulation', 'Sports', 'Strategy']


In [10]:
reviews_df = load_to_pandas(steam_reviews)

Loaded 7793069 records from data/steam_reviews.json.gz
Elapsed time: 227.02 seconds
Sample record:
{'username': 'Chaos Syren', 'hours': 0.1, 'products': 41, 'product_id': '725280', 'page_order': 0, 'date': '2017-12-17', 'text': 'This would not be acceptable as an entertainment even back in the day when these graphics were all there was to be had. No effort has been made to bring the player into any story or even entertain.', 'early_access': False, 'page': 1}


In [11]:
print(f"Columns: {reviews_df.columns.tolist()}")
print(f"\nShape: {reviews_df.shape}")
print(f"\nMissing values:\n{reviews_df.isna().sum()}")

Columns: ['username', 'hours', 'products', 'product_id', 'page_order', 'date', 'text', 'early_access', 'page', 'found_funny', 'compensation', 'user_id']

Shape: (7793069, 12)

Missing values:
username              0
hours             26537
products          14961
product_id            0
page_order            0
date                  0
text                  0
early_access          0
page                  0
found_funny     6592313
compensation    7647446
user_id         4616846
dtype: int64


In [12]:
reviews_df.head()

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id
0,Chaos Syren,0.1,41.0,725280,0,2017-12-17,This would not be acceptable as an entertainme...,False,1,,,
1,₮ʜᴇ Wᴀʀᴛᴏɴ,51.1,769.0,328100,0,2017-12-27,looks like a facebook game,False,1,,,
2,hello?<,14.6,2.0,328100,1,2017-10-16,Better than Minecraft,False,1,2.0,Product received for free,
3,Cyderine916,5.0,64.0,35140,0,2018-01-04,I love and idolized Batman and this game is Ma...,False,1,,,
4,DarklyThinking,16.6,577.0,35140,1,2018-01-04,Still worth playing in 2018.\nProbably my favo...,False,1,,,7.656119800748307e+16


In [30]:
df = reviews_df.merge(metadata_df[["product_id", "genres_mapped"]], on="product_id", how="inner")
df

Unnamed: 0,username,hours,products,product_id,page_order,date,text,early_access,page,found_funny,compensation,user_id,genres_mapped
0,Chaos Syren,0.1,41.0,725280,0,2017-12-17,This would not be acceptable as an entertainme...,False,1,,,,"[Action, Adventure, Indie, Simulation]"
1,₮ʜᴇ Wᴀʀᴛᴏɴ,51.1,769.0,328100,0,2017-12-27,looks like a facebook game,False,1,,,,"[Adventure, Indie, RPG]"
2,hello?<,14.6,2.0,328100,1,2017-10-16,Better than Minecraft,False,1,2.0,Product received for free,,"[Adventure, Indie, RPG]"
3,Cyderine916,5.0,64.0,35140,0,2018-01-04,I love and idolized Batman and this game is Ma...,False,1,,,,"[Action, Adventure]"
4,DarklyThinking,16.6,577.0,35140,1,2018-01-04,Still worth playing in 2018.\nProbably my favo...,False,1,,,76561198007483075,"[Action, Adventure]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7799533,Wildman_,174.3,208.0,252490,5,2013-12-11,A really fun game. There's always something to...,True,10221,,,,"[Action, Adventure, Early Access, Indie, Massi..."
7799534,Stony,1215.2,73.0,252490,6,2013-12-11,really fun and addictive game to play,True,10221,,,76561198089897928,"[Action, Adventure, Early Access, Indie, Massi..."
7799535,Deez Knees,50.5,288.0,252490,7,2013-12-11,gr8 game 10/10 wud buy agen,True,10221,,,76561198048207033,"[Action, Adventure, Early Access, Indie, Massi..."
7799536,Vidaar,783.5,353.0,252490,8,2013-12-11,Summary: Rust is a survival game created by Fa...,True,10221,,,,"[Action, Adventure, Early Access, Indie, Massi..."


In [31]:
df_cleaned = df.dropna(subset=['genres_mapped'])

## Creation of train, validation, and test sets

In [52]:
def train_val_test_split(X, y, train_ratio=70, val_ratio=15, test_ratio=15, random_state=42):
    # Convert percentages to fractions
    total       = train_ratio + val_ratio + test_ratio
    train_frac  = train_ratio / total
    val_frac    = val_ratio / total
    test_frac   = test_ratio / total

    # First split: train vs remainder
    X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=train_frac, random_state=random_state)

    # Normalize val/test split inside the remainder
    rem_frac            = val_frac + test_frac
    val_frac_adjusted   = val_frac / rem_frac  # fraction of the remainder that should go to val

    # Second split: validation vs test
    X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, train_size=val_frac_adjusted, random_state=random_state)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [57]:
# https://www.kdnuggets.com/2023/01/encoding-categorical-features-multilabelbinarizer.html
# MultiLabelBinarizer for multi-label genre classification
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df_cleaned['genres_mapped'])

genre_names = mlb.classes_
print(f"Number of unique genres: {len(mlb.classes_)}")
print(f"\nGenres: {genre_names}")
print(f"\nY shape: {Y.shape}")
print(f"\nSample matrix (first 5 rows):")
print(Y[:5])

Number of unique genres: 13

Genres: ['Action' 'Adventure' 'Casual' 'Early Access' 'Free to Play' 'Indie'
 'Massively Multiplayer' 'Productivity/Software' 'RPG' 'Racing'
 'Simulation' 'Sports' 'Strategy']

Y shape: (7799538, 13)

Sample matrix (first 5 rows):
[[1 1 0 0 0 1 0 0 0 0 1 0 0]
 [0 1 0 0 0 1 0 0 1 0 0 0 0]
 [0 1 0 0 0 1 0 0 1 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0]]


In [58]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_val_test_split(df_cleaned['text'], Y)

print(f"Training set size:      {X_train.shape[0]} samples")
print(f"Validation set size:    {X_val.shape[0]} samples")
print(f"Test set size:          {X_test.shape[0]} samples")

Training set size:      5459676 samples
Validation set size:    1169931 samples
Test set size:          1169931 samples


### Baseline Model: Usage of keywords to guess

In [61]:
# Similar to assignment1 catdict, used LLM for initial brainstorming of keywords

### Category prediction baseline: Just consider some of the most common words from each category
# catDict = {
#   "children": 0,
#   "comics_graphic": 1,
#   "fantasy_paranormal": 2,
#   "mystery_thriller_crime": 3,
#   "young_adult": 4
# }

# Unique genres following processing: 
# ['Action', 'Adventure', 'Casual', 'Early Access', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Productivity/Software', 'RPG', 'Racing', 'Simulation', 'Sports', 'Strategy']

action_keywords = ["combat", "fight", "shoot", "gun", "guns", "weapon", "fps", "tps", "intense", "action", "explosion", "enemy", "fast paced", "reflex", "dodging", "boss fight"]

adventure_keywords = ["story", "explore", "exploration", "journey","quests", "narrative", "adventure", "puzzles", "discovery", "atmosphere", "open world"]

casual_keywords = ["relaxing", "simple", "easy", "puzzle", "cute", "casual", "family friendly", "short game", "idle", "clicker", "cozy"]

indie_keywords = ["indie", "small dev", "pixel art", "unique", "creative", "experimental", "low budget", "solo developer"]

rpg_keywords = ["rpg", "role playing", "quest", "skills", "leveling", "loot", "xp", "inventory", "classes", "abilities", "stats", "character build"]

simulation_keywords = ["simulate", "simulation", "management", "simulator", "tycoon", "building", "construction", "farming", "driving", "realistic", "train", "aircraft", "physics"]

strategy_keywords = ["tactics", "strategy", "turn based", "rtx", "planning", "resources", "base building", "micro", "macro", "decision", "pvp strategy", "chess-like"]

racing_keywords = ["race", "racing", "cars", "drift", "track", "lap", "vehicle", "speed", "driver"]

sports_keywords = ["sports", "football", "soccer", "basketball", "tournament", "athletic"]

mmo_keywords = ["online multiplayer", "mmo", "mmorpg", "guild", "raid", "co-op", "pvp", "servers", "matchmaking", "persistent world", "clan"]

software_keywords = ["render", "edit", "animation", "modeling", "audio", "mixing", "tutorial", "training", "utility", "publish", "design", "photo", "video editing"]

f2p_keywords = ["free to play", "f2p", "microtransactions", "paywall"]

ea_keywords = ["early access", "still in development", "beta", "incomplete"]


In [None]:
def naive_genre_predict(text):
    text = text.lower()
    preds = set()

    for kw in action_keywords:
        if kw in text:
            preds.add("Action")

    for kw in adventure_keywords:
        if kw in text:
            preds.add("Adventure")

    for kw in casual_keywords:
        if kw in text:
            preds.add("Casual")
    
    for kw in indie_keywords:
        if kw in text:
            preds.add("Indie")

    for kw in rpg_keywords:
        if kw in text:
            preds.add("RPG")

    for kw in simulation_keywords:
        if kw in text:
            preds.add("Simulation")

    for kw in strategy_keywords:
        if kw in text:
            preds.add("Strategy")

    for kw in racing_keywords:
        if kw in text:
            preds.add("Racing")

    for kw in sports_keywords:
        if kw in text:
            preds.add("Sports")

    for kw in mmo_keywords:
        if kw in text:
            preds.add("Massively Multiplayer")

    for kw in software_keywords:
        if kw in text:
            preds.add("Productivity/Software")

    for kw in f2p_keywords:
        if kw in text:
            preds.add("Free to Play")

    for kw in ea_keywords:
        if kw in text:
            preds.add("Early Access")

    # fallback (most common genre)
    if not preds:
        preds.add("Indie")  

    return list(preds)

In [67]:
Y_pred_baseline = mlb.transform(X_test.apply(naive_genre_predict))
print("Baseline Classification Report:")
print(classification_report(Y_test, Y_pred_baseline, target_names=genre_names))

print(f"Jaccard Score: {jaccard_score(Y_test, Y_pred_baseline, average='samples'):.4f}")

Baseline Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                       precision    recall  f1-score   support

               Action       0.74      0.20      0.32    722102
            Adventure       0.53      0.25      0.34    392735
               Casual       0.20      0.18      0.19    131897
         Early Access       0.38      0.09      0.15     97293
         Free to Play       0.78      0.05      0.10    204992
                Indie       0.47      0.58      0.52    556728
Massively Multiplayer       0.27      0.13      0.18    118309
Productivity/Software       0.01      0.15      0.02      6186
                  RPG       0.39      0.24      0.30    304590
               Racing       0.11      0.34      0.16     22238
           Simulation       0.39      0.14      0.21    197657
               Sports       0.36      0.05      0.09     27904
             Strategy       0.53      0.11      0.18    222354

            micro avg       0.44      0.25      0.32   3004985
            macro avg       0.40      0.19      0.21 

### Key Model: TFIDF + LogReg

In [None]:
# Initial testing on a smaller sample due to resource constraints

# df_sample = df_cleaned.sample(n=100000, random_state=42)
# Y_sample = mlb.transform(df_sample["genres"])

# X_train, X_test, Y_train, Y_test = train_test_split(df_sample["text"], Y_sample, test_size=0.2, random_state=42)

In [68]:
# Given k classes, we will train k binary classifiers (One-vs-Rest) --> Week 7 255R
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=5000,
        ngram_range=(1,1),
        min_df=5,
        stop_words="english"
    )),
    ("clf", OneVsRestClassifier(
        LinearSVC(C=1, max_iter=1000), 
        n_jobs=-1
    ))
])

In [None]:
# Gridsearch to find best value of hyperparameters
begin_time = time.time()
print("Performing grid search...")

parameters = {
    'tfidf__ngram_range': [(1,1)],          
    'tfidf__max_features': [5000, 50000],   

    'clf__estimator__C': [0.25, 1, 5, 10],
    # 'clf__estimator__max_iter': [1000],     
}

# Define Jaccard scorer, since default is accuracy for multi-label is not suitable
jaccard_scorer = make_scorer(jaccard_score, average="samples") 

grid_clf = GridSearchCV(
    pipeline,
    parameters,
    scoring=jaccard_scorer,
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_clf.fit(X_train, Y_train)

best_params = grid_clf.best_params_
print("\nBest parameters:", best_params)

# Evaluate on validation set

val_preds = grid_clf.predict(X_val)
val_acc = jaccard_score(Y_val, val_preds)
print("\nValidation Jaccard Score:", val_acc)

print("\nFitting best model and evaluating on test set...")
best_model = grid_clf.best_estimator_

test_preds = best_model.predict(X_test)
test_jaccard = jaccard_score(Y_test, test_preds)
print("Test Jaccard Score:", test_jaccard)
end_time = time.time()
print(f"Time taken: {end_time - begin_time} seconds")

Performing grid search...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
print("Classification Report:")
print(classification_report(Y_test, test_preds, target_names=genre_names))

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                       precision    recall  f1-score   support

               Action       0.77      0.90      0.83    962793
            Adventure       0.75      0.45      0.57    522664
               Casual       0.75      0.20      0.31    175641
         Early Access       0.76      0.26      0.39    129571
         Free to Play       0.79      0.34      0.48    273468
                Indie       0.75      0.68      0.71    741325
Massively Multiplayer       0.79      0.29      0.43    157494
Productivity/Software       0.91      0.49      0.64      8274
                  RPG       0.81      0.39      0.53    406613
               Racing       0.88      0.33      0.48     29402
           Simulation       0.78      0.31      0.45    263935
               Sports       0.83      0.28      0.41     37384
             Strategy       0.84      0.40      0.54    296277

            micro avg       0.77      0.55      0.64   4004841
            macro avg       0.80      0.41      0.52 

In [None]:
# genre_freq = {}

# for genres in metadata_df["genres"]:
#     if isinstance(genres, list):
#         for g in genres:
#             if g not in genre_freq:
#                 genre_freq[g] = 0
#             genre_freq[g] += 1
# # Sort by frequency (highest first)
# most_common = sorted(genre_freq.items(), key=lambda x: x[1], reverse=True)
# most_common_genre = most_common[0][0]

# print("Most common genre:", most_common_genre)
# print(f"Frequency: {(genre_freq[most_common_genre]/len(metadata_df)*100):.1f}%")

Most common genre: Indie
Frequency: 49.3%
