In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the pickled playtime table and normalise missing markers (pd.NA -> np.nan)
# in one chained expression.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('testingdf2.pkl').replace(pd.NA, np.nan)
df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,71.0,271784.0,370587.0,30.0,127.0,,68001.0,28686.0,,...,62.0,,15850.0,,11081.0,,,17.0,11848.0,
1172470,,1740.0,,,422.0,2519.0,,,,,...,,,,0.0,,,,,,
730,52631.0,156473.0,,218577.0,56384.0,17022.0,,2826.0,,0.0,...,10009.0,102617.0,16021.0,9316.0,134.0,,,,,
578080,7881.0,1317.0,,0.0,0.0,,,105578.0,,0.0,...,8200.0,0.0,187.0,129.0,,0.0,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Treat zero entries as missing: NaN is first filled with 0, then every 0 is
# turned into NaN, so zeros and missing values end up indistinguishable.

df = df.replace(np.nan, 0).replace(0, np.nan)
df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,71.0,271784.0,370587.0,30.0,127.0,,68001.0,28686.0,,...,62.0,,15850.0,,11081.0,,,17.0,11848.0,
1172470,,1740.0,,,422.0,2519.0,,,,,...,,,,,,,,,,
730,52631.0,156473.0,,218577.0,56384.0,17022.0,,2826.0,,,...,10009.0,102617.0,16021.0,9316.0,134.0,,,,,
578080,7881.0,1317.0,,,,,,105578.0,,,...,8200.0,,187.0,129.0,,,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [4]:
class MFRecommender():
    """
    Matrix factorization recommender model object.

    The data matrix ``df`` (one row per game, one column per user) is
    approximated by the product ``game_weights @ user_weights``. Only the
    non-null entries of ``df`` take part in training and evaluation.

    Attributes:
        game_weights (np.array): matrix of weights corresponding to user
            profile weights for each game, shape (df.shape[0], k).
        user_weights (np.array): matrix of weights corresponding to how to
            represent each user as a linear combination of user profiles,
            shape (k, df.shape[1]).
        filled_entries (List[tuple(int)]): list of index pairs (i,j) of the
            non-null entries of df, in row-major order.
        fe_train (List[tuple(int)]): entries of filled_entries used to fit.
        fe_valid (List[tuple(int)]): held-out entries of filled_entries used
            for validation.
        lr (float): learning rate
        l2 (float): size of the l2 penalty when fitting and predicting.
        df (pd.DataFrame): pandas dataframe representing the sparse matrix
            of data
    """

    def __init__(self, df, k, lr=0.0002, l2=0.00001, val_split=0.15,
                 random_state=None):
        """
        Initializes the model.

        Args:
            df (pd.DataFrame): sparse dataframe of user playtime info.
            k (int): Number of user profiles for the model.
            lr (float): learning rate for SGD.
            l2 (float): l2 penalty for training the weights.
            val_split (float): share of the filled entries held out for
                validation (passed to train_test_split as test_size).
            random_state (int or None): seed for the weight initialisation
                and the train/validation split. None (default) keeps the
                previous behaviour: numpy's global random state is used and
                the split is not seeded.
        """
        self.df = df
        # With no seed, draw from the module-level generator so that an
        # earlier np.random.seed(...) call is still honoured.
        if random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(random_state)
        self.game_weights = rng.rand(df.shape[0], k)
        self.user_weights = rng.rand(k, df.shape[1])
        self.lr = lr
        self.l2 = l2
        # np.nonzero walks the mask in row-major order, i.e. the same order
        # as a nested "for i ... for j ..." scan over the cells.
        rows, cols = np.nonzero(df.notna().to_numpy())
        self.filled_entries = list(zip(rows.tolist(), cols.tolist()))
        self.fe_train, self.fe_valid = train_test_split(
            self.filled_entries, test_size=val_split,
            random_state=random_state)

    def _observed_values(self):
        """Returns the values of self.df as a float array (missing -> NaN)."""
        return self.df.to_numpy(dtype=float, na_value=np.nan)

    @staticmethod
    def _entry_arrays(entries):
        """Splits a list of (row, col) pairs into two integer index arrays."""
        pairs = np.asarray(entries, dtype=np.intp).reshape(-1, 2)
        return pairs[:, 0], pairs[:, 1]

    def _predict_entries(self, rows, cols):
        """Returns the model prediction at the positions (rows[n], cols[n])."""
        return np.einsum('ik,ki->i',
                         self.game_weights[rows], self.user_weights[:, cols])

    def _squared_error(self, entries):
        """Returns the summed squared error of the model over `entries`."""
        rows, cols = self._entry_arrays(entries)
        if rows.size == 0:
            return 0.0
        residual = (self._observed_values()[rows, cols]
                    - self._predict_entries(rows, cols))
        return float(np.dot(residual, residual))

    def _mean_squared_error(self, entries):
        """Returns the MSE over `entries`, or NaN when `entries` is empty."""
        if len(entries) == 0:
            return float('nan')
        return self._squared_error(entries)/len(entries)

    def fit(self, epochs):
        """
        Fits the model a certain number of epochs based on the df provided.

        Every epoch performs one full-batch gradient step: all residuals are
        taken from the weights at the start of the epoch, the weights are
        shrunk by the l2 factor (1 - lr*l2), and the gradient contributions
        of all training entries are added. The train and validation MSE are
        printed after each epoch.

        Args:
            epochs (int): number of epochs to train the model
        """
        rows, cols = self._entry_arrays(self.fe_train)
        targets = self._observed_values()[rows, cols]
        decay = 1 - self.lr*self.l2
        # Dense residual matrix: zero everywhere except at training entries.
        residual = np.zeros((self.game_weights.shape[0],
                             self.user_weights.shape[1]))
        for epoch in range(epochs):
            print('Fitting epoch {}...'.format(epoch+1))
            game_w, user_w = self.game_weights, self.user_weights
            residual.fill(0)
            # add.at accumulates, so a repeated entry counts once per repeat.
            np.add.at(residual, (rows, cols),
                      targets - self._predict_entries(rows, cols))
            # Both updates use the weights from the start of the epoch.
            self.game_weights = (game_w*decay
                                 + self.lr*np.matmul(residual, user_w.T))
            self.user_weights = (user_w*decay
                                 + self.lr*np.matmul(game_w.T, residual))
            print('Train MSE = {:.5f}     '.format(self.train_MSE())
                 + 'Validation MSE = {:.5f}'.format(self.valid_MSE()))

    def train_loss(self):
        """Returns the summed squared error over the training entries."""
        return self._squared_error(self.fe_train)

    def valid_loss(self):
        """Returns the summed squared error over the validation entries."""
        return self._squared_error(self.fe_valid)

    def train_MSE(self):
        """Returns the mean squared error over the training entries (NaN if
        there are none)."""
        return self._mean_squared_error(self.fe_train)

    def valid_MSE(self):
        """Returns the mean squared error over the validation entries (NaN if
        there are none)."""
        return self._mean_squared_error(self.fe_valid)

    def predict(self, user_data, epochs=100):
        """
        Given the data for a user, returns the predicted playtime series.

        A fresh, randomly initialised profile vector is fitted to the user's
        non-null entries against the (fixed) game weights with the same
        full-batch update as fit, then used to predict every game. The MSE
        over the user's non-null entries is printed (nan if there are none).

        Args:
            user_data (pd.series): sparse pandas series of user playtime,
                positionally aligned with the rows of the training df.
            epochs (int): number of gradient steps used to fit the profile.

        Returns (pd.series): filled pandas series of predicted playtime, with
            the index and name of user_data.

        Raises:
            ValueError: if user_data does not have one entry per game.
        """
        n_games, n_profiles = self.game_weights.shape
        if user_data.shape[0] != n_games:
            raise ValueError(
                'user_data has {} entries, expected {}'.format(
                    user_data.shape[0], n_games))

        # 1-D profile: keeps every residual a scalar instead of a 1-element
        # array, so the printed MSE is a plain number.
        user_profile = np.random.rand(n_profiles)

        filled_indices = np.flatnonzero(user_data.notna().to_numpy())
        observed = user_data.to_numpy(dtype=float, na_value=np.nan)
        targets = observed[filled_indices]
        game_rows = self.game_weights[filled_indices]
        decay = 1 - self.lr*self.l2

        for _ in range(epochs):
            # Residuals come from the profile at the start of the step.
            diff = targets - np.matmul(game_rows, user_profile)
            user_profile = (user_profile*decay
                            + self.lr*np.matmul(game_rows.T, diff))

        # return final prediction, converted to pandas
        pred = np.matmul(self.game_weights, user_profile)
        to_return = pd.Series(pred, index=user_data.index,
                              name=user_data.name)

        if filled_indices.size:
            mse = float(np.mean((targets - pred[filled_indices])**2))
        else:
            mse = float('nan')
        print('MSE: {}'.format(mse))
        return to_return
                
                
                
        

In [5]:
# Note that the outliers in the data will prevent convergence unless we log-normalize first.
# log10(x + 1) is applied to the whole frame in one vectorised call (no
# per-element applymap, which is deprecated in recent pandas); NaN stays NaN.

log_df = np.log10(df.astype(float) + 1)
log_df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,1.857332,5.434225,5.568891,1.491362,2.107210,,4.832522,4.457685,,...,1.799341,,4.200057,,4.044618,,,1.255273,4.073682,
1172470,,3.240799,,,2.626340,3.401401,,,,,...,,,,,,,,,,
730,4.721250,5.194442,,5.339606,4.751164,4.231036,,3.451326,,,...,4.000434,5.011224,4.204717,3.969276,2.130334,,,,,
578080,3.896636,3.119915,,,,,,5.023578,,,...,3.913867,,2.274158,2.113943,,,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Train one model per number of user profiles k and keep every model in `models`.
# l2=0 makes the decay factor (1 - lr*l2) equal to 1, i.e. no l2 penalty.
# NOTE(review): the loop variable `k` stays defined after this cell and is
# read by the next cell.
models = []
for k in [7, 4]:
    print('Training model with k={}.'.format(k))
    clf = MFRecommender(log_df, k, lr=0.0001, l2=0)
    clf.fit(200)
    models.append(clf)

Training model with k=7.
Fitting epoch 1...
Train MSE = 1.56165     Validation MSE = 1.56265
Fitting epoch 2...
Train MSE = 1.39552     Validation MSE = 1.39997
Fitting epoch 3...
Train MSE = 1.28845     Validation MSE = 1.29524
Fitting epoch 4...


KeyboardInterrupt: 

In [None]:
# Same steps as the body of the training loop, with a smaller learning rate.
# NOTE(review): `k` and `models` are not assigned in this cell -- it uses
# whatever values are left in the kernel (presumably from the training loop
# cell), so it cannot run on its own in a fresh kernel.
print('Training model with k={}.'.format(k))
clf = MFRecommender(log_df, k, lr=0.00005, l2=0)
clf.fit(200)
models.append(clf)

In [7]:
# Fresh model with 5 user profiles; lr keeps its default and l2=0 disables the penalty.
# This rebinds `clf`, so the cells below work with this model.
clf = MFRecommender(log_df, 5, l2=0)
clf

<__main__.MFRecommender at 0x2542244ace0>

In [20]:
# Train for up to 200 epochs; fit prints the train/validation MSE after each one.
# The recorded run below was interrupted manually before all epochs finished.
clf.fit(200)

Fitting epoch 1...
Train MSE = 0.81567     Validation MSE = 0.83700
Fitting epoch 2...
Train MSE = 0.79769     Validation MSE = 0.81934
Fitting epoch 3...
Train MSE = 0.78201     Validation MSE = 0.80398
Fitting epoch 4...
Train MSE = 0.76823     Validation MSE = 0.79051
Fitting epoch 5...
Train MSE = 0.75606     Validation MSE = 0.77866
Fitting epoch 6...
Train MSE = 0.74526     Validation MSE = 0.76816
Fitting epoch 7...
Train MSE = 0.73561     Validation MSE = 0.75883
Fitting epoch 8...
Train MSE = 0.72697     Validation MSE = 0.75049
Fitting epoch 9...
Train MSE = 0.71920     Validation MSE = 0.74301
Fitting epoch 10...
Train MSE = 0.71217     Validation MSE = 0.73628
Fitting epoch 11...
Train MSE = 0.70580     Validation MSE = 0.73020
Fitting epoch 12...
Train MSE = 0.70001     Validation MSE = 0.72469
Fitting epoch 13...
Train MSE = 0.69472     Validation MSE = 0.71968
Fitting epoch 14...
Train MSE = 0.68988     Validation MSE = 0.71512
Fitting epoch 15...
Train MSE = 0.68544    

Train MSE = 0.60753     Validation MSE = 0.64689
Fitting epoch 121...
Train MSE = 0.60740     Validation MSE = 0.64684
Fitting epoch 122...
Train MSE = 0.60727     Validation MSE = 0.64680
Fitting epoch 123...
Train MSE = 0.60715     Validation MSE = 0.64675
Fitting epoch 124...
Train MSE = 0.60703     Validation MSE = 0.64671
Fitting epoch 125...
Train MSE = 0.60691     Validation MSE = 0.64667
Fitting epoch 126...
Train MSE = 0.60679     Validation MSE = 0.64663
Fitting epoch 127...
Train MSE = 0.60667     Validation MSE = 0.64659
Fitting epoch 128...
Train MSE = 0.60656     Validation MSE = 0.64655
Fitting epoch 129...
Train MSE = 0.60644     Validation MSE = 0.64651
Fitting epoch 130...
Train MSE = 0.60633     Validation MSE = 0.64648
Fitting epoch 131...
Train MSE = 0.60622     Validation MSE = 0.64644
Fitting epoch 132...
Train MSE = 0.60611     Validation MSE = 0.64641
Fitting epoch 133...
Train MSE = 0.60600     Validation MSE = 0.64638
Fitting epoch 134...
Train MSE = 0.60589 

KeyboardInterrupt: 

In [21]:
# Mean squared error on the held-out validation entries (on the log10 scale,
# since clf was built from log_df).
clf.valid_MSE()

0.6460781163531655

In [10]:
# Log-scaled data of the first user (first column); NaN marks games without a value.
log_df[log_df.columns[0]]

570             NaN
1172470         NaN
730        4.721250
578080     3.896636
1063730         NaN
             ...   
335330          NaN
21090           NaN
1259970         NaN
57900           NaN
337950          NaN
Name: 76561198153524465, Length: 986, dtype: float64

In [11]:
# Fit a profile for that user against the trained game weights and predict a
# value for every game; predict also prints the MSE on the user's known entries.
clf.predict(log_df[log_df.columns[0]])

MSE: [0.40906991]


570        4.265553
1172470    3.589336
730        5.031911
578080     4.108318
1063730    4.020324
             ...   
335330     1.290971
21090      2.236327
1259970    1.512059
57900      2.050812
337950     1.564285
Name: 76561198153524465, Length: 986, dtype: float64

In [15]:
# Let pandas display up to 986 rows without truncation (986 is the length of
# the per-user series shown above).
pd.options.display.max_rows = 986

In [24]:
#try testing with a new user
from steam import Steam
from decouple import config
# The API key lives in a local file so it never appears in the notebook.
# strip() removes surrounding whitespace such as the trailing newline most
# editors append, which would otherwise become part of the key.
with open("myapikey.txt", "r") as file:
    myapikey=file.read().strip()

import os
# Publish the key as an environment variable and read it back through config().
os.environ["STEAM_API_KEY"] = myapikey
KEY = config("STEAM_API_KEY")
steam = Steam(KEY)

In [62]:
# Steam ID of the user to test with, kept as a string (converted with int()
# where it is compared against df.columns).
testid='76561198039393086'

In [54]:
# Is this user already one of the columns of df?
int(testid) in df.columns

False

In [63]:
# Fetch the user's owned games; the result is read below through its
# 'game_count' and 'games' keys.
testsgamelist=steam.users.get_owned_games(testid)

In [78]:
# Build the test user's playtime column on df's row index: start from all-NaN
# and fill in each owned game that is a row of df and has positive playtime.
testcol=pd.Series(np.nan,index=df.index)
for i in range(testsgamelist['game_count']):
    game=testsgamelist['games'][i]
    appid=game['appid']
    if appid in df.index and game['playtime_forever']>0:
        testcol[appid]=game['playtime_forever']

testcol

570            NaN
1172470        NaN
730            NaN
578080         NaN
1063730        NaN
440         1061.0
271590         NaN
1599340      395.0
550            NaN
304930         NaN
252490         NaN
230410     10912.0
105600        22.0
4000           NaN
1245620        NaN
236390         NaN
291550         NaN
359550         NaN
340            NaN
431960         NaN
1085660     1808.0
945360       994.0
238960       110.0
218620         NaN
892970       318.0
1091500     1450.0
1097150     9748.0
242760         NaN
322330       106.0
444090       379.0
291480         NaN
346110         NaN
413150     14081.0
49520       3309.0
10             NaN
272060         NaN
292030         NaN
438100        98.0
381210       909.0
227940         NaN
1240440        NaN
227300         NaN
252950       304.0
1468810        NaN
620            NaN
552990         NaN
739630       236.0
320            NaN
990080         NaN
1938090        NaN
386360      3345.0
755790         NaN
550650      

In [79]:
# Predict on the log10(x + 1) scale, the same transform used to build log_df.
testfit=clf.predict(np.log10(testcol+1))

MSE: [0.55943247]


In [67]:
# Side-by-side view of the user's log-scaled playtime and the model prediction.
# gamenames is loaded here so that this cell does not depend on a cell further
# down the notebook having been run first.
gamenames=pd.read_pickle('gamenames.pkl')
pd.DataFrame({'game_name':gamenames,'user_data':np.log10(testcol+1),'prediction':testfit})

Unnamed: 0,game_name,user_data,prediction
570,Dota 2,0.0,0.281695
1172470,Apex Legends,0.0,0.342354
730,Counter-Strike: Global Offensive,0.0,0.338369
578080,PUBG: BATTLEGROUNDS,0.0,0.307935
1063730,New World,0.0,0.357133
440,Team Fortress 2,3.026125,0.236428
271590,Grand Theft Auto V,0.0,0.335099
1599340,Lost Ark,2.597695,0.273329
550,Left 4 Dead 2,0.0,0.263643
304930,Unturned,0.0,0.255201


In [50]:
# List of game titles; it is used positionally as the game_name column of the
# result tables, so presumably it has one title per row of df in the same order.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
gamenames=pd.read_pickle('gamenames.pkl')
gamenames

['Dota 2',
 'Apex Legends',
 'Counter-Strike: Global Offensive',
 'PUBG: BATTLEGROUNDS',
 'New World',
 'Team Fortress 2',
 'Grand Theft Auto V',
 'Lost Ark',
 'Left 4 Dead 2',
 'Unturned',
 'Rust',
 'Warframe',
 'Terraria',
 "Garry's Mod",
 'ELDEN RING',
 'War Thunder',
 'Brawlhalla',
 "Tom Clancy's Rainbow Six Siege",
 'Half-Life 2: Lost Coast',
 'Wallpaper Engine',
 'Destiny 2',
 'Among Us',
 'Path of Exile',
 'PAYDAY 2',
 'Valheim',
 'Cyberpunk 2077',
 'Fall Guys: Ultimate Knockout',
 'The Forest',
 "Don't Starve Together",
 'Paladins',
 'Warface',
 'ARK: Survival Evolved',
 'Stardew Valley',
 'Borderlands 2',
 'Counter-Strike',
 'Serena',
 'The Witcher 3: Wild Hunt',
 'VRChat',
 'Dead by Daylight',
 'Heroes & Generals',
 'Halo Infinite',
 'Euro Truck Simulator 2',
 'Rocket League',
 '鬼谷八荒 Tale of Immortal',
 'Portal 2',
 'World of Warships',
 'Phasmophobia',
 'Half-Life 2: Deathmatch',
 'Hogwarts Legacy',
 'Call of Duty: Modern Warfare II',
 'SMITE',
 'Ring of Elysium',
 'Black Sq

In [69]:
# Spot check that a specific title is present in the list.
'Tales of Arise' in gamenames

True

In [80]:
# Put the prediction back on the raw scale (inverse of log10(x + 1)) next to
# the user's actual playtime, then list the games by actual playtime.
results=pd.DataFrame({
    'game_name':gamenames,
    'user_data':testcol,
    'prediction':(10**testfit)-1,
})
results.sort_values("user_data")

Unnamed: 0,game_name,user_data,prediction
698780,Doki Doki Literature Club!,1.0,111.320431
464920,Surviving Mars,10.0,211.772405
504370,Battlerite,15.0,313.659414
105600,Terraria,22.0,2130.889605
363970,Clicker Heroes,22.0,926.061579
346900,AdVenture Capitalist,53.0,405.14704
582500,We Were Here,60.0,95.561465
238010,Deus Ex: Human Revolution - Director's Cut,63.0,480.620049
630,Alien Swarm,66.0,150.906025
270880,American Truck Simulator,72.0,1024.670125


In [60]:
# Actual vs. predicted values for the user in column 10 of log_df, converted
# back from the log10(x + 1) scale to the raw scale.
test1=10**pd.DataFrame({
    'user_data':log_df[log_df.columns[10]],
    'prediction':clf.predict(log_df[log_df.columns[10]]),
})-1
test1

MSE: [0.76746564]


Unnamed: 0,user_data,prediction
570,3817.0,3525.817623
1172470,,646.387241
730,145103.0,9030.427107
578080,15081.0,2456.545895
1063730,,2341.282185
440,,781.03544
271590,6375.0,3861.373992
1599340,,791.321628
550,464.0,527.173797
304930,,267.701998


In [23]:
# Same table ordered by the user's actual value, smallest first.
test1.sort_values("user_data")

Unnamed: 0,user_data,prediction
400,1.0,252.871992
438040,8.0,816.894572
285900,47.0,188.557454
334230,84.0,1027.537437
470220,85.0,453.329474
204340,93.0,144.468393
333420,95.0,1337.141508
518790,101.0,838.42265
620,118.0,1011.321886
1818750,131.0,376.304203


In [70]:
# Show the raw table (zeros already replaced by NaN) again for reference.
df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,71.0,271784.0,370587.0,30.0,127.0,,68001.0,28686.0,,...,62.0,,15850.0,,11081.0,,,17.0,11848.0,
1172470,,1740.0,,,422.0,2519.0,,,,,...,,,,,,,,,,
730,52631.0,156473.0,,218577.0,56384.0,17022.0,,2826.0,,,...,10009.0,102617.0,16021.0,9316.0,134.0,,,,,
578080,7881.0,1317.0,,,,,,105578.0,,,...,8200.0,,187.0,129.0,,,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
440,841.0,4915.0,,,6110.0,81984.0,,17.0,,,...,,,374.0,1198.0,,,,,,312.0
271590,,,,,,7506.0,,,,,...,,,,,,,,,,
1599340,,304.0,,,,,,,,,...,,,,,,,,490.0,1855.0,
550,4054.0,,,,,3824.0,,,,,...,50.0,21018.0,,2572.0,,,,,,
304930,932.0,9288.0,,,225.0,,,,,,...,,846.0,88.0,56.0,,,,,,


In [None]:
# Data seems to be normally distributed by log. 

# Collect the log-scaled value of every filled entry. The comprehension keeps
# its loop variables local, so notebook-level names such as `k` are not
# overwritten by this cell.
tlist = [log_df.iat[row, col] for row, col in clf.filled_entries]
import matplotlib.pyplot as plt
plt.hist(tlist, bins=100)
plt.xlabel('log10(value + 1)')
plt.ylabel('Number of entries')
plt.title('Distribution of the filled entries of log_df')
#plt.yscale('log')
plt.show()