# Coldstart Predictions
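This notebook takes the new $V'$ item-factor matrix generated by the cold-start regression model, multiplies it with the user-factor matrix $U$, and writes the top-500 item recommendations for each user in the validation-user list (restricted to users that have latent factors in $U$).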

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import os
from glob import glob

In [2]:
# Input locations, all relative to the notebook's working directory.
# Edit the V / U folder names below to score a different pair of factor matrices.
data_dir = '../../data/'
data_dir_v = Path(data_dir, 'itemFactors_r200_kNNupdated_v2')  # item factors (V) to read
data_dir_u = Path(data_dir, 'userFactors_r200')  # user factors (U) to read
data_dir_val = Path(data_dir, 'valUsers')  # list of validation users

In [3]:
# Load the item-factor matrix V: read every parquet part in the V directory
# and concatenate the parts into one frame (each part keeps its own index labels).
latentItems = pd.concat(map(pd.read_parquet, data_dir_v.glob('*.parquet')))

# Sanity check: number of rows and a peek at the id / features columns.
print("V shape:", latentItems.shape)
latentItems.head()

V shape: (320739, 2)


Unnamed: 0,id,features
96219,-534224064,"[-0.009564603678882122, 0.05479266121983528, 0..."
96220,-534219288,"[0.009427841752767563, -0.012406221590936184, ..."
96221,-534207492,"[-0.03337208554148674, 0.0655897706747055, -0...."
96222,-534201694,"[0.003339017741382122, 0.0071647330187261105, ..."
96223,-534182781,"[-0.001448197872377932, -0.05940604954957962, ..."


In [4]:
# Load the user-factor matrix U: read every parquet part in the U directory
# and concatenate the parts into one frame.
latentUsers = pd.concat(map(pd.read_parquet, data_dir_u.glob('*.parquet')))

# Sanity check: number of rows and a peek at the id / features columns.
print("U shape:", latentUsers.shape)
latentUsers.head()

U shape: (1112050, 2)


Unnamed: 0,id,features
0,-2143737963,"[0.04503257, -0.1381214, 0.004369732, 0.024514..."
1,-2141308963,"[-0.041152455, -0.07105676, -0.06608401, -0.01..."
2,-2140438363,"[0.10122766, 0.014004348, -0.06525432, -0.0303..."
3,-2140033363,"[-0.09476472, -0.037693784, -0.17497699, -0.01..."
4,-2139593563,"[0.11070782, -0.08256894, -0.19954483, 0.04065..."


In [5]:
# Load the list of validation users: read every parquet part in the
# validation directory and concatenate the parts into one frame.
valUsers = pd.concat(map(pd.read_parquet, data_dir_val.glob('*.parquet')))

# Sanity check: number of rows and a peek at the user_hashId column.
print("Val Users shape:", valUsers.shape)
valUsers.head()

Val Users shape: (10000, 1)


Unnamed: 0,user_hashId
0,814602782
1,1736832637
2,-1895008463
3,-671404564
4,-1083396133


In [6]:
# Filter U (the user factors) down to the validation users.
# The validation list names its key `user_hashId`; rename it to `id` so it
# lines up with the key column of `latentUsers`, then keep only matching rows.
valUsers = valUsers.rename({"user_hashId": "id"}, axis="columns")
latentUsers = latentUsers.merge(valUsers, how='inner', on='id')

In [None]:
# Turn item matrix into dask array
# NOTE(review): this cell has no execution count and uses `dd` / `da`, which
# are not imported in the visible notebook (presumably dask.dataframe and
# dask.array -- confirm). Run as-is on a fresh kernel it raises NameError.
# None of the other visible cells read `latentItems_dd` or `latentItems_da`,
# so this looks like a leftover experiment; import the modules or remove
# the cell before relying on "Run All".
latentItems_dd = dd.from_pandas(latentItems, chunksize=100)
latentItems_da = da.stack(latentItems_dd["features"])
latentItems_da = latentItems_da.rechunk((1000, 200))

In [7]:
# Stack the per-item factor vectors from the `features` column into a single
# 2-D array, one row per item, in the same row order as `latentItems`.
latentItems_feats = np.stack(list(latentItems["features"]))

In [8]:
# Build top-500 recommendations for every validation user.
#
# For each user row in `latentUsers`:
#   1. multiply the item-factor matrix by the user's latent vector to get one
#      predicted score per item,
#   2. take the positions of the 500 highest scores,
#   3. map those positions back to item ids.
# The (user, recommendations) pairs are then written to a parquet file.

# `inds` below are *positions* into `latentItems_feats`, which was stacked in
# the row order of `latentItems`. `latentItems` comes from pd.concat without
# ignore_index, so its index labels are not 0..n-1 (the displayed head starts
# at label 96219); `latentItems['id'][inds]` would therefore do a label lookup
# and return ids of the wrong rows. Resolve ids positionally via a plain array.
item_ids = latentItems['id'].to_numpy()

userList = []
recList = []

# The inner merge with the validation list decides how many users remain, so
# take the count from the frame instead of assuming exactly 10000 rows.
for i in range(len(latentUsers)):
    latentUsers_single = latentUsers.iloc[i]
    # One score per item: (n_items, rank) @ (rank,) -> (n_items,)
    preds = np.matmul(latentItems_feats, latentUsers_single["features"].transpose())
    # Negate so that ascending argsort yields the highest scores first.
    inds = np.argsort(-preds)[:500]
    userList.append(latentUsers_single['id'])
    recList.append(item_ids[inds])

    if i % 500 == 0:
        print(f'Finished {i}')

df_recs = pd.DataFrame({'user_hashId': userList, 'recommendations': recList},
                       columns=['user_hashId', 'recommendations'])
new_path = '../../data/coldStartRecs_noDask_knn_v2.parquet'
df_recs.to_parquet(new_path)

Finished 0
Finished 500
Finished 1000
Finished 1500
Finished 2000
Finished 2500
Finished 3000
Finished 3500
Finished 4000
Finished 4500
Finished 5000
Finished 5500
Finished 6000
Finished 6500
Finished 7000
Finished 7500
Finished 8000
Finished 8500
Finished 9000
Finished 9500
