In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
# Load the pickled dataframe and normalise pandas' NA marker to np.nan in one
# chained expression.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('testingdf.pkl').replace(pd.NA, np.nan)
df

Unnamed: 0,76561197996823194,76561198014242604,76561198005523871,76561198085946962,76561198054109103,76561198093620101,76561198096369389,76561197993050013,76561198089088546,76561198068751421,...,76561198098674820,76561198087694481,76561198061063257,76561198071401138,76561198094672501,76561198080752551,76561198097082417,76561198095866324,76561198071682435,76561198090997249
300,0.0,0.0,,,,,,,,,...,,,,,,,,,,
280,33.0,,,,,,,,,,...,,,,,,,,,,
360,0.0,,,,0.0,,,,0.0,,...,,,,,,,,,0.0,
20,0.0,,,,,,,,,,...,,,,,,,,,,
50,0.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034140,,,,,,,,,,,...,,,,,,,,,72.0,
1103100,,,,,,,,,,,...,,,,,,,,,0.0,
1700870,,,,,,,,,,,...,,,,,,,,,2054.0,
1659040,,,,,,,,,,,...,,,,,,,,,1278.0,


In [3]:
# Uncomment this cell to treat zero entries as missing: every 0 in df is replaced with NaN.

# df = df.replace(np.nan, 0)
# df = df.replace(0, np.nan)
# df

In [4]:
class MFRecommender():
    """
    Matrix factorization recommender model object.

    The sparse matrix ``df`` is approximated by the dense product
    ``game_weights @ user_weights``; only the non-null entries of ``df``
    are used for training and evaluation.

    Attributes:
        game_weights (np.array): matrix of weights corresponding to user
            profile weights for each game, shape (df.shape[0], k).
        user_weights (np.array): matrix of weights corresponding to how to
            represent each user as a linear combination of user profiles,
            shape (k, df.shape[1]).
        filled_entries (List[tuple(int)]): list of index pairs (i,j) of the
            non-null entries of df.
        fe_train (List[tuple(int)]): the part of filled_entries used by fit.
        fe_valid (List[tuple(int)]): the held-out part of filled_entries,
            used only for the validation metrics.
        lr (float): learning rate
        l2 (float): size of the l2 penalty when fitting and predicting.
        df (pd.DataFrame): pandas dataframe representing the sparse matrix
            of data
    """

    def __init__(self, df, k, lr=0.0002, l2=0.00001, val_split=0.15):
        """
        Initializes the model.

        Args:
            df (pd.DataFrame): sparse dataframe of user playtime info.
            k (int): Number of user profiles for the model.
            lr (float): learning rate for SGD.
            l2 (float): l2 penalty for training the weights.
            val_split (float): passed to train_test_split as test_size; the
                share of filled entries held out for validation.
        """
        self.df = df
        self.game_weights = np.random.rand(df.shape[0], k)
        self.user_weights = np.random.rand(k, df.shape[1])
        self.lr = lr
        self.l2 = l2
        # np.argwhere walks the mask in row-major order, i.e. the same
        # (i, j) order a nested "for i / for j" scan would produce.
        filled = np.argwhere(df.notna().to_numpy())
        self.filled_entries = [(int(i), int(j)) for i, j in filled]
        self.fe_train, self.fe_valid = train_test_split(self.filled_entries,
                                                       test_size=val_split)

    @staticmethod
    def _entry_indices(entries):
        """Splits a sequence of (i, j) pairs into row and column index arrays."""
        pairs = np.asarray(entries, dtype=int).reshape(-1, 2)
        return pairs[:, 0], pairs[:, 1]

    def _targets(self):
        """Returns the values of df as a float array (missing entries as NaN)."""
        return self.df.to_numpy(dtype=float, na_value=np.nan)

    def _squared_errors(self, entries):
        """Returns the squared prediction error at each (i, j) in entries."""
        rows, cols = self._entry_indices(entries)
        # Only the requested entries of game_weights @ user_weights are needed.
        pred = (self.game_weights[rows] * self.user_weights[:, cols].T).sum(axis=1)
        return (self._targets()[rows, cols] - pred) ** 2

    def _mean_squared_error(self, entries):
        """Returns the mean squared error over entries (NaN if entries is empty)."""
        errors = self._squared_errors(entries)
        if errors.size == 0:
            return float('nan')
        return float(errors.mean())

    def fit(self, epochs):
        """
        Fits the model a certain number of epochs based on the df provided.

        Every update within an epoch is computed from the weights (and hence
        the predictions) as they were at the start of that epoch, so one
        epoch amounts to a single gradient step over all training entries.

        Args:
            epochs (int): number of epochs to train the model
        """
        rows, cols = self._entry_indices(self.fe_train)
        observed = self._targets()[rows, cols]
        for epoch in range(epochs):
            print('Fitting epoch {}...'.format(epoch+1))
            start_game = self.game_weights
            start_user = self.user_weights

            # Residual matrix: (target - prediction) on training entries,
            # zero everywhere else. np.add.at also accumulates correctly if
            # an entry is listed more than once.
            pred = (start_game[rows] * start_user[:, cols].T).sum(axis=1)
            residual = np.zeros((start_game.shape[0], start_user.shape[1]))
            np.add.at(residual, (rows, cols), observed - pred)

            # l2 penalty (multiplicative shrink) plus the gradient step.
            shrink = 1 - self.lr*self.l2
            self.game_weights = start_game*shrink + self.lr*(residual @ start_user.T)
            self.user_weights = start_user*shrink + self.lr*(start_game.T @ residual)

            print('Train MSE = {:.5f}     '.format(self.train_MSE())
                 + 'Validation MSE = {:.5f}'.format(self.valid_MSE()))

    def train_loss(self):
        """Returns the sum of squared errors over the training entries."""
        return float(self._squared_errors(self.fe_train).sum())

    def valid_loss(self):
        """Returns the sum of squared errors over the validation entries."""
        return float(self._squared_errors(self.fe_valid).sum())

    def train_MSE(self):
        """Returns the mean squared error over the training entries."""
        return self._mean_squared_error(self.fe_train)

    def valid_MSE(self):
        """Returns the mean squared error over the validation entries."""
        return self._mean_squared_error(self.fe_valid)

    def predict(self, user_data, epochs=200):
        """
        Given the data for a user, returns the predicted playtime series.

        A fresh profile vector is fitted for this user against the fixed
        game_weights, using only the non-null entries of user_data. The
        model's own game_weights and user_weights are left untouched.

        Args:
            user_data (pd.series): sparse pandas series of user playtime,
                with one position per row of game_weights.
            epochs (int): number of gradient steps used to fit the profile.

        Returns (pd.series): filled pandas series of predicted playtime,
            carrying the index and name of user_data.
        """
        observed = user_data.to_numpy(dtype=float, na_value=np.nan)
        filled = np.flatnonzero(~np.isnan(observed))
        known_games = self.game_weights[filled]
        known_values = observed[filled]

        user_profile = np.random.rand(self.user_weights.shape[0])
        shrink = 1 - self.lr*self.l2
        for _ in range(epochs):
            residual = known_values - known_games @ user_profile
            # l2 shrink and gradient step on the local profile only.
            user_profile = user_profile*shrink + self.lr*(known_games.T @ residual)

        pred = self.game_weights @ user_profile

        if filled.size:
            mse = float(np.mean((known_values - pred[filled])**2))
        else:
            mse = float('nan')
        print('MSE: {}'.format(mse))

        # return final prediction, converted to pandas
        return pd.Series(pred, index=user_data.index, name=user_data.name)
                
                
                
        

In [5]:
# Note that the outliers in the data will prevent convergence unless we log-normalize first.

# Vectorised log10(x + 1): zero entries stay at 0, and NaN propagates through
# the ufunc so missing entries stay missing (no per-element lambda needed).
log_df = np.log10(df.astype(float) + 1)
log_df

Unnamed: 0,76561197996823194,76561198014242604,76561198005523871,76561198085946962,76561198054109103,76561198093620101,76561198096369389,76561197993050013,76561198089088546,76561198068751421,...,76561198098674820,76561198087694481,76561198061063257,76561198071401138,76561198094672501,76561198080752551,76561198097082417,76561198095866324,76561198071682435,76561198090997249
300,0.000000,0.0,,,,,,,,,...,,,,,,,,,,
280,1.531479,,,,,,,,,,...,,,,,,,,,,
360,0.000000,,,,0.0,,,,0.0,,...,,,,,,,,,0.000000,
20,0.000000,,,,,,,,,,...,,,,,,,,,,
50,0.000000,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034140,,,,,,,,,,,...,,,,,,,,,1.863323,
1103100,,,,,,,,,,,...,,,,,,,,,0.000000,
1700870,,,,,,,,,,,...,,,,,,,,,3.312812,
1659040,,,,,,,,,,,...,,,,,,,,,3.106871,


In [6]:
# Seed numpy's global RNG so the random weight initialisation and the random
# train/validation split inside the constructor are reproducible across runs.
np.random.seed(42)

# 5 user profiles, no l2 penalty.
clf = MFRecommender(log_df, 5, l2=0)
clf

<__main__.MFRecommender at 0x7f9f9c27d160>

In [10]:
# Train for up to 1000 epochs; train/validation MSE is printed after each one.
clf.fit(epochs=1000)

Fitting epoch 1...
Train MSE = 0.82172     Validation MSE = 1.11033
Fitting epoch 2...
Train MSE = 0.82125     Validation MSE = 1.11035
Fitting epoch 3...
Train MSE = 0.82079     Validation MSE = 1.11037
Fitting epoch 4...
Train MSE = 0.82032     Validation MSE = 1.11038
Fitting epoch 5...
Train MSE = 0.81986     Validation MSE = 1.11039
Fitting epoch 6...
Train MSE = 0.81939     Validation MSE = 1.11040
Fitting epoch 7...
Train MSE = 0.81893     Validation MSE = 1.11041
Fitting epoch 8...
Train MSE = 0.81847     Validation MSE = 1.11043
Fitting epoch 9...
Train MSE = 0.81800     Validation MSE = 1.11044
Fitting epoch 10...
Train MSE = 0.81754     Validation MSE = 1.11045
Fitting epoch 11...
Train MSE = 0.81708     Validation MSE = 1.11047
Fitting epoch 12...
Train MSE = 0.81662     Validation MSE = 1.11048
Fitting epoch 13...
Train MSE = 0.81615     Validation MSE = 1.11050
Fitting epoch 14...
Train MSE = 0.81569     Validation MSE = 1.11052
Fitting epoch 15...
Train MSE = 0.81523    

KeyboardInterrupt: 

In [8]:
# Mean squared error of the current weights on the held-out validation entries.
clf.valid_MSE()

1.110292527008686

In [9]:
# Log-playtime series of the first user (first column, selected by position).
log_df.iloc[:, 0]

300        0.000000
280        1.531479
360        0.000000
20         0.000000
50         0.000000
             ...   
1034140         NaN
1103100         NaN
1700870         NaN
1659040         NaN
2119490         NaN
Name: 76561197996823194, Length: 5785, dtype: float64

In [None]:
# Predict the full playtime series for the first user from their known entries.
clf.predict(user_data=log_df.iloc[:, 0])

In [None]:
# Histogram of every observed (non-null) entry of log_df. The author's reading
# is that the data looks roughly normally distributed after the log transform.

tlist = [log_df.iat[row, col] for row, col in clf.filled_entries]

fig, ax = plt.subplots()
ax.hist(tlist, bins=100)
ax.set_yscale('log')
ax.set(title='Distribution of observed log-playtime values',
       xlabel='log10(playtime + 1)',
       ylabel='Number of entries (log scale)')
plt.show()