# Steam — Business EDA (Plotly)
This notebook analyzes `data/processed/steam_cleaned.csv` and focuses on sales-related signals (estimated_downloads), price, tags and platform info for dashboard-ready visuals.

In [None]:
import os
import pandas as pd
import plotly.express as px
import numpy as np
pd.set_option('display.max_columns', 120)

In [None]:
# Location of the cleaned Steam dataset. The original absolute path stays the
# default so existing runs behave the same; set the STEAM_CLEANED_CSV
# environment variable to point the notebook at a copy somewhere else.
PATH = os.environ.get(
    'STEAM_CLEANED_CSV',
    '/home/jubaer/Downloads/GOTY 2025 analysis/data/processed/steam_cleaned.csv',
)
# Explicit raise instead of `assert`: assertions are removed when Python runs
# with -O, which would let a missing file surface later as a confusing error.
if not os.path.exists(PATH):
    raise FileNotFoundError(f'{PATH} not found. Run src/clean_steam.py first')

# Read every column as text first so nothing is silently re-typed on load;
# the columns that must be numeric are converted explicitly below.
df = pd.read_csv(PATH, dtype=str)
# parse numeric columns (values that cannot be parsed become NaN)
for c in ['price','estimated_downloads','reviews_like_rate','all_reviews_number']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')
# release date: unparseable values become NaT; if the column is absent the
# '' fallback also coerces to NaT, so `release_date` always exists afterwards
df['release_date'] = pd.to_datetime(df.get('release_date', ''), errors='coerce')
display(df.head(5))

## Top sellers and price buckets
Use `estimated_downloads` as a proxy for sales: show the top 20 titles and the relationship between price and downloads.

In [None]:
# Top 20 titles ranked by estimated downloads (the sales proxy).
# The bars are labelled by title, so the chart needs both columns; guard
# `game_name` the same way the other optional columns are guarded.
if 'estimated_downloads' in df.columns and 'game_name' in df.columns:
    # Drop rows with a missing/unparseable download estimate before ranking so
    # they can never pad the top 20 with empty bars on small datasets.
    top_sales = (
        df.dropna(subset=['estimated_downloads'])
        .sort_values('estimated_downloads', ascending=False)
        .head(20)
    )
    fig = px.bar(top_sales, x='game_name', y='estimated_downloads', title='Top titles by estimated downloads', template='plotly_white')
    fig.update_layout(xaxis_tickangle=-45, height=500)
    fig.show()

# price vs downloads scatter
if 'price' in df.columns and 'estimated_downloads' in df.columns:
    # Only rows with both values can be placed on the chart
    sub = df.dropna(subset=['price','estimated_downloads'])
    # Show the title on hover when available; plotly raises if asked for a
    # hover column that is not in the frame, so fall back to its default (None).
    hover_cols = ['game_name'] if 'game_name' in sub.columns else None
    fig2 = px.scatter(sub, x='price', y='estimated_downloads', hover_data=hover_cols, title='Price vs Estimated downloads', template='plotly_white')
    fig2.update_traces(marker=dict(opacity=0.7))
    fig2.show()

## Tag analysis and segmenting genres
Explode `user_defined_tags` to get tag-level insights for marketing and discovery optimization.

In [None]:
if 'user_defined_tags' in df.columns:
    # Split each pipe-delimited tag string into a list of trimmed, non-empty
    # tags; missing values become an empty list.
    df['tags_list'] = (
        df['user_defined_tags']
        .fillna('')
        .astype(str)
        .apply(lambda raw: [tag for tag in map(str.strip, raw.split('|')) if tag])
    )

    # One row per (game, tag) pair
    df_tag = df.explode('tags_list')

    # Frequency of each tag across all games, most common first
    tag_counts = (
        df_tag['tags_list']
        .loc[lambda tags: tags != '']
        .value_counts()
        .reset_index(name='count')
    )
    tag_counts.columns = ['tag', 'count']

    fig = px.bar(
        tag_counts.iloc[:30],
        x='tag',
        y='count',
        title='Top Steam tags (top 30)',
        template='plotly_white',
    )
    fig.update_layout(xaxis_tickangle=-45, height=550)
    fig.show()

In [None]:
# Save a compact table for Tableau (title, price, downloads, top tag)
def top_tag_for_row(tags):
    """Return the first tag of a row's tag list, or '' when there is none.

    Parameters
    ----------
    tags : object
        Normally the list of tag strings stored in ``tags_list``. Any value
        that is not a non-empty list (None, NaN, pd.NA, a plain string, an
        array, an empty list, ...) is treated as "no tags".

    Returns
    -------
    The first element of ``tags`` when it is a non-empty list, otherwise ''.
    """
    # Test the type before the truthiness: `not tags` raises for values whose
    # truth value is ambiguous (pd.NA, multi-element numpy arrays), whereas a
    # list's truthiness is always well defined.
    if isinstance(tags, list) and tags:
        return tags[0]
    return ''
# Derive the primary tag only when `tags_list` is present on the frame.
if 'tags_list' in df:
    df['primary_tag'] = df['tags_list'].map(top_tag_for_row)

# Export only the dashboard columns that exist in this dataset, in this order.
out_cols = [
    col
    for col in ('game_name', 'release_date', 'price', 'estimated_downloads', 'primary_tag')
    if col in df
]
# The output directory is relative to the current working directory.
os.makedirs('data/processed', exist_ok=True)
df.to_csv('data/processed/steam_dashboard_table.csv', columns=out_cols, index=False)
print('Wrote steam_dashboard_table.csv')