In [1]:
import ast
import gzip
import math

import numpy as np
import pandas as pd
from sklearn import linear_model

# Gzipped, line-oriented dataset files; each one is loaded into its own DataFrame.
paths = [
    "australian_users_items.json.gz",
    "australian_user_reviews.json.gz",
    "bundle_data.json.gz",
    "steam_games.json.gz",
    "steam_reviews.json.gz",
]

# Maps the dataset name (file name up to its first ".") to the loaded DataFrame.
dfs = {}

for path in paths:
    data_list = []
    skipped = 0

    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            try:
                # SECURITY: this used to be eval(line), which executes arbitrary
                # code found in the data file. ast.literal_eval only accepts
                # Python literals (dicts, lists, strings, numbers, booleans, None).
                # NOTE(review): assumes every record is a plain Python-literal
                # dict (the files are not parsed as strict JSON) — confirm.
                record = ast.literal_eval(line)
            except (SyntaxError, ValueError):
                # Malformed line: skip it, but keep count so the loss is visible.
                skipped += 1
                continue
            data_list.append(record)

    if skipped:
        print(f"{path}: skipped {skipped} unparseable line(s)")

    name = path.split(".")[0]
    dfs[name] = pd.DataFrame(data_list)

# Convenience aliases for the individual datasets.
australian_users_items = dfs['australian_users_items']
australian_user_reviews = dfs['australian_user_reviews']
bundle_data = dfs['bundle_data']
steam_games = dfs['steam_games']
steam_reviews = dfs['steam_reviews']

In [None]:
# Work on a copy so the raw steam_games frame stays untouched.
item_matrix = steam_games.copy()

# Any tags entry that is not a list is replaced by an empty list.
item_matrix['tags'] = item_matrix['tags'].map(
    lambda tag_list: tag_list if isinstance(tag_list, list) else []
)

# Keep the title when it is a string; otherwise use the row's app_name.
item_matrix['title'] = [
    title if isinstance(title, str) else app_name
    for title, app_name in zip(item_matrix['title'], item_matrix['app_name'])
]
def clean_price(x):
    """Convert a raw price value into a float.

    Parameters
    ----------
    x : object
        Raw price: a number, a string, or anything else (e.g. ``None``).

    Returns
    -------
    float
        * numbers (Python or NumPy ints/floats) are returned as ``float``;
          NaN stays NaN;
        * strings containing "free" (case-insensitive) give ``0.0``;
        * other strings are parsed with ``float()``; unparseable ones give NaN;
        * every other value (``None``, booleans, lists, ...) gives NaN.
    """
    # bool is a subclass of int but is never a meaningful price.
    if isinstance(x, bool):
        return np.nan
    # Integers used to fall through to the final NaN; treat all numerics alike.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    if isinstance(x, str):
        x_lower = x.strip().lower()
        if 'free' in x_lower:
            return 0.0
        try:
            return float(x)
        except ValueError:
            return np.nan

    return np.nan
# Normalise the raw price column to floats (non-numeric, non-"free" values become NaN).
item_matrix['price'] = item_matrix['price'].apply(clean_price)

# Normalise discount_price the same way and fall back to the regular price only
# where no usable discount exists. The previous row-wise check compared against
# np.nan with `!=` (always True) and only kept *string* discounts, so every
# numeric discount was overwritten with the full price.
item_matrix['discount_price'] = (
    item_matrix['discount_price']
    .apply(clean_price)
    .fillna(item_matrix['price'])
)

from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the tag lists: one 0/1 column per distinct tag.
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(item_matrix['tags'])
df_encoded = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=item_matrix.index)

# Final item matrix: selected metadata columns followed by the tag indicator columns.
item_matrix = pd.concat(
    [
        item_matrix[['id', 'title', 'price', 'discount_price', 'release_date', 'developer', 'sentiment']],
        df_encoded,
    ],
    axis=1,
)



In [24]:
# Display the item matrix: metadata columns followed by one 0/1 column per tag.
item_matrix

Unnamed: 0,id,title,price,discount_price,release_date,developer,sentiment,1980s,1990's,2.5D,...,Warhammer 40K,Web Publishing,Werewolves,Western,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,761140,Lost Summoner Kitty,4.99,4.99,2018-01-04,Kotoshiro,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,643980,Ironbound,0.00,0.00,2018-01-04,Secret Level SRL,Mostly Positive,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,670290,Real Pool 3D - Poolians,0.00,0.00,2017-07-24,Poolians.com,Mostly Positive,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,767400,弹炸人2222,0.99,0.99,2017-12-07,彼岸领域,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,773570,Log Challenge,2.99,2.99,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32130,773640,Colony On Mars,1.99,1.99,2018-01-04,"Nikita ""Ghost_RUS""",,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32131,733530,LOGistICAL: South Africa,4.99,4.99,2018-01-04,Sacada,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32132,610660,Russian Roads,1.99,1.99,2018-01-04,Laush Dmitriy Sergeevich,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32133,658870,EXIT 2 - Directions,4.99,4.99,2017-09-02,"xropi,stev3ns",1 user reviews,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Minimum `playtime_forever` for an owned item to be labelled as played (y = 1).
# NOTE(review): the unit is presumably minutes — confirm against the dataset docs.
MIN_PLAYTIME = 2.0

# One row per (user, owned item): explode every user's `items` list.
# NOTE(review): users whose `items` list is empty come out of explode() as a
# single NaN row and so appear to end up with a missing item_id and y == 0 —
# confirm whether those rows should be dropped.
owned_items = australian_users_items.explode('items').reset_index(drop=True)

# Flatten the per-item records into columns of their own.
items_normalized = pd.json_normalize(owned_items['items'])

# Build the label in one chain via assign(), so no column is written onto a
# column-subset frame (the pattern that raises SettingWithCopyWarning), and
# convert the boolean mask with astype() rather than a per-element apply(int).
user_matrix = (
    pd.concat([owned_items.drop(columns='items'), items_normalized], axis=1)
    .assign(y=lambda df: (df['playtime_forever'] >= MIN_PLAYTIME).astype(int))
    .loc[:, ['steam_id', 'item_id', 'y']]
)

In [11]:
# Display the user-item table: steam_id, item_id and the binary label y.
user_matrix

Unnamed: 0,steam_id,item_id,y
0,76561197970982479,10,1
1,76561197970982479,20,0
2,76561197970982479,30,1
3,76561197970982479,40,0
4,76561197970982479,50,0
...,...,...,...
5170010,76561198329548331,373330,0
5170011,76561198329548331,388490,1
5170012,76561198329548331,521570,1
5170013,76561198329548331,519140,1
