In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split



In [2]:
# Load the pickled playtime table and normalise pandas' pd.NA missing marker
# to np.nan so that later NaN checks behave uniformly.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('testingdf2.pkl').replace(pd.NA, np.nan)
df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,71.0,271784.0,370587.0,30.0,127.0,,68001.0,28686.0,,...,62.0,,15850.0,,11081.0,,,17.0,11848.0,
1172470,,1740.0,,,422.0,2519.0,,,,,...,,,,0.0,,,,,,
730,52631.0,156473.0,,218577.0,56384.0,17022.0,,2826.0,,0.0,...,10009.0,102617.0,16021.0,9316.0,134.0,,,,,
578080,7881.0,1317.0,,0.0,0.0,,,105578.0,,0.0,...,8200.0,0.0,187.0,129.0,,0.0,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Treat a recorded value of 0 as missing: NaN -> 0 -> NaN leaves every zero
# (and every cell that was already missing) as NaN.
# Comment this cell out to keep the zeroes in the dataframe.

df = df.replace(np.nan, 0).replace(0, np.nan)
df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,71.0,271784.0,370587.0,30.0,127.0,,68001.0,28686.0,,...,62.0,,15850.0,,11081.0,,,17.0,11848.0,
1172470,,1740.0,,,422.0,2519.0,,,,,...,,,,,,,,,,
730,52631.0,156473.0,,218577.0,56384.0,17022.0,,2826.0,,,...,10009.0,102617.0,16021.0,9316.0,134.0,,,,,
578080,7881.0,1317.0,,,,,,105578.0,,,...,8200.0,,187.0,129.0,,,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [4]:
class MFRecommender:
    """
    Matrix factorization recommender model object.

    The sparse dataframe ``df`` is approximated by the matrix product
    ``game_weights @ user_weights``. Only the non-null entries of ``df``
    contribute to the loss and to the weight updates.

    Attributes:
        game_weights (np.array): (df.shape[0], k) matrix of weights
            corresponding to user profile weights for each game.
        user_weights (np.array): (k, df.shape[1]) matrix of weights
            corresponding to how to represent each user as a linear
            combination of user profiles.
        filled_entries (List[tuple(int)]): list of index pairs (i,j) of the
            non-null entries of df, in row-major order.
        fe_train (List[tuple(int)]): the part of filled_entries used for the
            weight updates in fit().
        fe_valid (List[tuple(int)]): the held-out part of filled_entries,
            used only for the validation loss.
        lr (float): learning rate
        l2 (float): size of the l2 penalty when fitting and predicting.
        df (pd.DataFrame): pandas dataframe representing the sparse matrix
            of data
    """

    def __init__(self, df, k, lr=0.0002, l2=0.00001, val_split=0.15):
        """
        Initializes the model.

        Args:
            df (pd.DataFrame): sparse dataframe of user playtime info.
            k (int): Number of user profiles for the model.
            lr (float): learning rate for the gradient updates.
            l2 (float): l2 penalty for training the weights.
            val_split (float): passed to train_test_split as ``test_size``;
                controls how many filled entries are held out for validation.
        """
        self.df = df
        # Weights start as uniform random values in [0, 1).
        self.game_weights = np.random.rand(df.shape[0], k)
        self.user_weights = np.random.rand(k, df.shape[1])
        self.lr = lr
        self.l2 = l2

        # Positions of all non-null cells, found in one vectorized pass
        # instead of probing every cell from Python.
        observed = df.notna().to_numpy()
        self.filled_entries = [(int(i), int(j))
                               for i, j in np.argwhere(observed)]
        self.fe_train, self.fe_valid = train_test_split(self.filled_entries,
                                                        test_size=val_split)

    def _targets(self):
        """Returns df as a float array, with missing cells as NaN."""
        return self.df.to_numpy(dtype=float, na_value=np.nan)

    @staticmethod
    def _rows_cols(entries):
        """Splits a list of (i, j) pairs into row and column index arrays."""
        # reshape keeps this valid for an empty list as well.
        index_pairs = np.asarray(entries, dtype=np.intp).reshape(-1, 2)
        return index_pairs[:, 0], index_pairs[:, 1]

    def _sum_squared_error(self, entries):
        """Returns the summed squared prediction error over ``entries``."""
        rows, cols = self._rows_cols(entries)
        pred = np.matmul(self.game_weights, self.user_weights)
        residual = self._targets()[rows, cols] - pred[rows, cols]
        return float(np.dot(residual, residual))

    def _mean_squared_error(self, entries):
        """Returns the mean squared error over ``entries`` (NaN if empty)."""
        if len(entries) == 0:
            return float('nan')
        return self._sum_squared_error(entries) / len(entries)

    def fit(self, epochs):
        """
        Fits the model a certain number of epochs based on the df provided.

        Each epoch is one full-batch gradient step: predictions are computed
        once from the weights at the start of the epoch, and every training
        entry contributes to the update using those same starting weights.
        The train and validation MSE are printed after every epoch.

        Args:
            epochs (int): number of epochs to train the model
        """
        targets = self._targets()
        rows, cols = self._rows_cols(self.fe_train)
        decay = 1 - self.lr*self.l2

        for epoch in range(epochs):
            print('Fitting epoch {}...'.format(epoch+1))
            pred = np.matmul(self.game_weights, self.user_weights)
            prev_game_weights = np.copy(self.game_weights)
            prev_user_weights = np.copy(self.user_weights)

            # Residuals on the training entries; every other cell stays 0
            # and therefore contributes nothing to the update.
            residual = np.zeros_like(pred)
            residual[rows, cols] = targets[rows, cols] - pred[rows, cols]

            # l2 shrinkage plus the accumulated gradient of all entries.
            self.game_weights = (prev_game_weights*decay
                                 + self.lr*(residual @ prev_user_weights.T))
            self.user_weights = (prev_user_weights*decay
                                 + self.lr*(prev_game_weights.T @ residual))
            print('Train MSE = {:.5f}     '.format(self.train_MSE())
                 + 'Validation MSE = {:.5f}'.format(self.valid_MSE()))

    def train_loss(self):
        """Returns the summed squared error over the training entries."""
        return self._sum_squared_error(self.fe_train)

    def valid_loss(self):
        """Returns the summed squared error over the validation entries."""
        return self._sum_squared_error(self.fe_valid)

    def train_MSE(self):
        """Returns the mean squared error over the training entries.

        Returns NaN when there are no training entries.
        """
        return self._mean_squared_error(self.fe_train)

    def valid_MSE(self):
        """Returns the mean squared error over the validation entries.

        Returns NaN when there are no validation entries.
        """
        return self._mean_squared_error(self.fe_valid)

    def predict(self, user_data, epochs=200):
        """
        Given the data for a user, returns the predicted playtime series.

        A fresh random user profile is fitted against the (fixed) game
        weights using the non-null entries of ``user_data``, with the model's
        learning rate and l2 penalty. The MSE on those entries is printed;
        it is NaN when ``user_data`` has no non-null entries.

        Args:
            user_data (pd.series): sparse pandas series of user playtime,
                with one entry per row of game_weights.
            epochs (int): number of gradient steps used to fit the profile.

        Returns (pd.series): filled pandas series of predicted playtime,
            carrying the index and name of ``user_data``.

        Raises:
            ValueError: if user_data does not have one entry per game.
        """
        n_games = self.game_weights.shape[0]
        if user_data.shape[0] != n_games:
            raise ValueError(
                'user_data has {} entries but the model has {} games'
                .format(user_data.shape[0], n_games))

        observed = user_data.notna().to_numpy()
        targets = user_data.to_numpy(dtype=float, na_value=np.nan)[observed]
        seen_game_weights = self.game_weights[observed]

        # 1-D profile so that predictions, residuals and the MSE are plain
        # floats rather than one-element arrays.
        user_profile = np.random.rand(self.user_weights.shape[0])
        decay = 1 - self.lr*self.l2

        for _ in range(epochs):
            residual = targets - seen_game_weights @ user_profile
            # l2 shrinkage plus the gradient summed over the observed games.
            user_profile = (user_profile*decay
                            + self.lr*(seen_game_weights.T @ residual))

        # return final prediction, converted to pandas
        pred = self.game_weights @ user_profile
        if targets.size:
            mse = float(np.mean((targets - pred[observed])**2))
        else:
            mse = float('nan')
        print('MSE: {}'.format(mse))
        return pd.Series(pred, index=user_data.index, name=user_data.name)
                
                
                
        

In [5]:
# Note that the outliers in the data will prevent convergence unless we log-normalize first.
# log10(x + 1) is applied to the whole frame in one vectorized call; NaN cells
# propagate through the arithmetic unchanged, so missing entries stay missing.

log_df = np.log10(df.astype(float) + 1)
log_df

Unnamed: 0,76561198153524465,76561198041864324,76561198110981529,76561198078042697,76561198072956091,76561198044215451,76561198117910816,76561198067369254,76561198095881349,76561198160606109,...,76561198064051588,76561198065637682,76561198093942856,76561198084283944,76561198068872077,76561198090802478,76561198068470798,76561198098128800,76561198098303003,76561198060478569
570,,1.857332,5.434225,5.568891,1.491362,2.107210,,4.832522,4.457685,,...,1.799341,,4.200057,,4.044618,,,1.255273,4.073682,
1172470,,3.240799,,,2.626340,3.401401,,,,,...,,,,,,,,,,
730,4.721250,5.194442,,5.339606,4.751164,4.231036,,3.451326,,,...,4.000434,5.011224,4.204717,3.969276,2.130334,,,,,
578080,3.896636,3.119915,,,,,,5.023578,,,...,3.913867,,2.274158,2.113943,,,,,,
1063730,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335330,,,,,,,,,,,...,,,,,,,,,,
21090,,,,,,,,,,,...,,,,,,,,,,
1259970,,,,,,,,,,,...,,,,,,,,,,
57900,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Build a recommender with 5 user profiles on the log-scaled data, with the
# l2 penalty switched off.
clf = MFRecommender(log_df, k=5, l2=0)
clf

<__main__.MFRecommender at 0x7f8852dd6a90>

In [7]:
# Train for up to 50 epochs; train/validation MSE is printed after each one.
clf.fit(epochs=50)

Fitting epoch 1...
Train MSE = 2.07400     Validation MSE = 2.08458
Fitting epoch 2...
Train MSE = 1.67639     Validation MSE = 1.68547
Fitting epoch 3...
Train MSE = 1.44380     Validation MSE = 1.45752
Fitting epoch 4...
Train MSE = 1.28836     Validation MSE = 1.30304
Fitting epoch 5...
Train MSE = 1.17676     Validation MSE = 1.19325
Fitting epoch 6...
Train MSE = 1.09281     Validation MSE = 1.11007
Fitting epoch 7...
Train MSE = 1.02748     Validation MSE = 1.04571
Fitting epoch 8...
Train MSE = 0.97532     Validation MSE = 0.99422
Fitting epoch 9...
Train MSE = 0.93283     Validation MSE = 0.95244
Fitting epoch 10...
Train MSE = 0.89765     Validation MSE = 0.91786
Fitting epoch 11...
Train MSE = 0.86812     Validation MSE = 0.88893
Fitting epoch 12...
Train MSE = 0.84306     Validation MSE = 0.86441
Fitting epoch 13...
Train MSE = 0.82157     Validation MSE = 0.84345
Fitting epoch 14...
Train MSE = 0.80298     Validation MSE = 0.82538
Fitting epoch 15...
Train MSE = 0.78678    

KeyboardInterrupt: 

In [8]:
# Mean squared error on the held-out entries for the model's current weights.
clf.valid_MSE()

0.8037271758159168

In [9]:
# Log-scaled values recorded for the first user column (NaN where missing).
log_df[log_df.columns[0]]

570             NaN
1172470         NaN
730        4.721250
578080     3.896636
1063730         NaN
             ...   
335330          NaN
21090           NaN
1259970         NaN
57900           NaN
337950          NaN
Name: 76561198153524465, Length: 986, dtype: float64

In [10]:
# Fit a fresh profile for the first user column against the trained game
# weights and show the predicted log-scaled value for every row.
clf.predict(log_df[log_df.columns[0]])

MSE: [0.3640368]


570        4.316950
1172470    3.351964
730        4.897130
578080     4.226548
1063730    4.146942
             ...   
335330     1.432923
21090      1.893578
1259970    2.127962
57900      1.836917
337950     1.761078
Name: 76561198153524465, Length: 986, dtype: float64

In [None]:
# Data seems to be normally distributed by log.
# Histogram of every non-null log10(x + 1) value in log_df.

tlist = [log_df.iat[row, col] for row, col in clf.filled_entries]

fig, ax = plt.subplots()
ax.hist(tlist, bins=100)
ax.set(
    title='Distribution of observed log-scaled values',
    xlabel='log10(value + 1)',
    ylabel='Number of filled entries',
)
# To inspect the tails, switch to a log count axis: ax.set_yscale('log')
plt.show()