In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import time
import graphlab as gl
import cPickle

# Building a user-item-rating sframe for the recommender model

Dataframe of steam IDs and lists of dictionaries containing all the games they have played and owned

In [40]:
with open("/home/ubuntu/games_steamids.p", "rb") as df:
    games_df = cPickle.load(df)

Function that will return the games with above average playtime, reducing the number from 8,100 to 561

In [43]:
def most_played_games(df):
    """
    Return games with above-average playtime
    Takes in a df with steamids and game_info, 
    or lists of dictionaries for each steamid
    """
    count_games = Counter({})
    for user in df.game_info:
        game_dict = {}
        for game in user:
            game_dict[str(game["appid"])] = game["playtime_forever"]
        count_games.update(game_dict)

    avg_playtime = sum(count_games.values()) / float(len(count_games))
    top_games = {}
    for k, v in count_games.iteritems():
        if v >= avg_playtime:
            top_games[int(k)] = v
    
    return top_games

After scraping the store with the returned appids, some fields for smaller games (or other non-game apps) were not  
recognized, reducing my reduced list from 561 to 526. Below is the list of 526 appids

In [45]:
with open("top_appids.csv", "rb") as f:
    top_games = f.read().split(",")
    top_games = set(map(lambda x: int(x), top_games))

In [49]:
len(top_games)

526

Practicing getting DFs into a form usable for a Graphlab model  
Unpacks the lists of dictionaries into a dataframe for each user  
Takes quite a while for 150k users...

In [47]:
def many_games(df, game_id_set):
    """
    Prepping for a graphlab model
    """
    full_df = pd.DataFrame()
    for i in xrange(len(df)):
        user_df = pd.DataFrame(df.game_info[i]) 
        user_df["steamid"] = ((str(df.steamid[i]) + " ") * len(df.game_info[i])).split()
        user_df = user_df[user_df["appid"].isin(game_id_set)]
        full_df = full_df.append(user_df)
    return full_df

In [48]:
t0 = time.time()
practice_test = many_games(games_df[:1000], top_games)
print "{} seconds".format(time.time() - t0)

6.36846208572 seconds


6 seconds for 1000 users, and it does ***not*** scale linearly...

Below is another method, which seems to take longer. Outputs graphlab SFrame

In [34]:
def games_users_sf(df, game_id_set):
    """
    Assumes a df with public non-null values in the form:
    |   |steamid          |game_info        |
    +---+-----------------+-----------------+
    |0  |76561197967398882|[{u'playt...},{}]|
    
    Returns a large sframe with columns of 
    appid, 
    """

    one_sframe = gl.SFrame()
    df.reset_index(drop=True, inplace=True)
    for i in xrange(df.game_info.count()):
        game_dictionary = {}
        for game in df.game_info[i]:
            if game["appid"] in game_id_set:
                game["steamid"] = df.steamid[i]
                
                if "name" in game.keys():
                    del game["name"]
                
                if "playtime_2weeks" not in game.keys():
                    game["playtime_2weeks"] = 0
                
                for k, v in game.items():
                 
                    try:
                        game_dictionary[k].append(v)
                    except KeyError:
                        game_dictionary[k]=[v]
                  
        one_sframe = one_sframe.append(gl.SFrame(game_dictionary))

    return one_sframe

To save time, I did it in chunks of 10k users, rather than all 150k at once, taking ~15min

In [35]:
t0 = time.time()
games_sf1 = games_users_sf(games_df[:10000], top_games)
print "{} seconds".format(time.time() - t0)

61.5657908916 seconds


In [None]:
t0 = time.time()
games_sf2 = games_users_sf(games_df[10000:20000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf3 = games_users_sf(games_df[20000:30000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf4 = games_users_sf(games_df[30000:40000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf5 = games_users_sf(games_df[40000:50000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf6 = games_users_sf(games_df[50000:60000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf7 = games_users_sf(games_df[60000:70000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf8 = games_users_sf(games_df[70000:80000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf9 = games_users_sf(games_df[80000:90000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf10 = games_users_sf(games_df[90000:100000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf11 = games_users_sf(games_df[100000:110000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf12 = games_users_sf(games_df[110000:120000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf13 = games_users_sf(games_df[120000:130000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf14 = games_users_sf(games_df[130000:140000], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
t0 = time.time()
games_sf15 = games_users_sf(games_df[140000:], game_ids)
print "{} seconds".format(time.time() - t0)

In [None]:
total_sf = gl.SFrame()
total_sf = total_sf.append(games_sf1)
total_sf = total_sf.append(games_sf2)
total_sf = total_sf.append(games_sf3)

In [None]:
total_sf = total_sf.append(games_sf4)
total_sf = total_sf.append(games_sf5)
total_sf = total_sf.append(games_sf6)

In [None]:
total_sf = total_sf.append(games_sf7)
total_sf = total_sf.append(games_sf8)
total_sf = total_sf.append(games_sf9)

In [None]:
total_sf = total_sf.append(games_sf10)
total_sf = total_sf.append(games_sf11)
total_sf = total_sf.append(games_sf12)

In [None]:
total_sf = total_sf.append(games_sf13)
total_sf = total_sf.append(games_sf14)
total_sf = total_sf.append(games_sf15)

In [None]:
total_sf.save("final_users_item_rating.csv")