In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize

import psutil
import humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()
# Colab exposes at most one GPU and does not guarantee one, so fall back to
# None instead of raising IndexError when the list is empty.
gpu = GPUs[0] if GPUs else None


def printm():
    """Print host RAM, this process's resident size and, when a GPU was found, its memory stats.

    Reads the notebook-level ``gpu`` object; when it is None only the host
    memory line is printed together with a short "no GPU" notice.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))
    if gpu is None:
        print("GPU RAM Free: n/a (no GPU detected)")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))


printm()

Gen RAM Free: 12.5 GB  |     Proc size: 271.4 MB
GPU RAM Free: 16278MB | Used: 2MB | Util   0% | Total     16280MB


In [None]:
if torch.cuda.is_available():
  print("We Have CUDA")
else:
  print("No GPU")

We Have CUDA


In [None]:
# Load the ratings table (columns include userID, itemID, rating) from Google Drive.
# NOTE(review): the path lives under /content/gdrive, but the only visible
# drive.mount('/content/gdrive') call is in a later cell, so on a fresh kernel
# this read depends on that mount having been run first.
small_data = pd.read_csv("/content/gdrive/MyDrive/6242 Final Project/Datasets for RecSys/yelp_filtnosamp_gt5.csv")  # local alternative: pd.read_csv("yelp_filtnosamp_gt5.csv")
# Row count before and after dropping rows that contain any missing value.
print(len(small_data))
small_data = small_data.dropna()
print(len(small_data))

5766970
5766970


In [None]:
small_data.loc[small_data['userID'] == "ak0TdVmGKo4pwqdJSTLwWw"]["itemID"]

0    buF9druCkbuXLX526sGELQ
1    bNZ3-0rse12NKdSVqQ30xw
2    BVsIaKL-8QXVjt0Z9WoFWw
3    4MClvr12OXBNvGu8h1yGpA
4    2vH58mhkEl8GdcDug1OwWg
Name: itemID, dtype: object

In [None]:
# Load Data
name_data = pd.read_csv("/content/gdrive/MyDrive/6242 Final Project/Datasets for RecSys/list_of_names.csv")#pd.read_csv("yelp_filtnosamp_gt5.csv")
print(len(name_data))
name_data = name_data.dropna()
print(len(name_data))

365665
365665


In [None]:
list(name_data.loc[name_data['name'] == "Melanie Brown"]["userID"])[0]

'ghQz_IoBekFp0oKQD01Rog'

In [None]:
names = ["Rickey Miller", "Melanie Brown", "Jack Brown"]
out = [list(name_data.loc[name_data['name'] == name]["userID"])[0] for name in names]
out

['ak0TdVmGKo4pwqdJSTLwWw', 'ghQz_IoBekFp0oKQD01Rog', '7VdbP64ORzZs-rPOARktSA']

In [None]:
small_data.head()

Unnamed: 0.1,Unnamed: 0,userID,itemID,rating
0,0,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0
1,1,ak0TdVmGKo4pwqdJSTLwWw,bNZ3-0rse12NKdSVqQ30xw,4.0
2,2,ak0TdVmGKo4pwqdJSTLwWw,BVsIaKL-8QXVjt0Z9WoFWw,4.0
3,3,ak0TdVmGKo4pwqdJSTLwWw,4MClvr12OXBNvGu8h1yGpA,5.0
4,4,ak0TdVmGKo4pwqdJSTLwWw,2vH58mhkEl8GdcDug1OwWg,5.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
small_data["userID"].astype("str")

0          ak0TdVmGKo4pwqdJSTLwWw
1          ak0TdVmGKo4pwqdJSTLwWw
2          ak0TdVmGKo4pwqdJSTLwWw
3          ak0TdVmGKo4pwqdJSTLwWw
4          ak0TdVmGKo4pwqdJSTLwWw
                    ...          
5766965    zQx8eofz4u0J3cmc5C5P8g
5766966    zQx8eofz4u0J3cmc5C5P8g
5766967    zQx8eofz4u0J3cmc5C5P8g
5766968    zQx8eofz4u0J3cmc5C5P8g
5766969    zQx8eofz4u0J3cmc5C5P8g
Name: userID, Length: 5766970, dtype: object

In [None]:
# Encode categorical data (userID and itemID) into indices, to be fed into torch.nn.Embedding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Users: learn the ID -> integer index mapping and apply it in one step.
le = LabelEncoder()
small_data["userID_ndx"] = le.fit_transform(small_data["userID"])

# Items: same procedure, with the IDs cast to str before encoding.
le_item = LabelEncoder()
small_data["itemID_ndx"] = le_item.fit_transform(small_data["itemID"].astype("str"))

In [None]:
len(le.classes_)

365665

In [None]:
len(le_item.classes_)

159108

In [None]:
len(small_data)

5766970

In [None]:
from joblib import dump, load
dump(le, 'userlabelenc_allData_correct.joblib') 
dump(le_item, 'itemlabelenc_allData_correct.joblib')

['itemlabelenc_allData_correct.joblib']

In [None]:
np.min(small_data["userID_ndx"])

0

In [None]:
# Keep only the fields we want
small_data = small_data[["userID", "itemID", "rating", "userID_ndx", "itemID_ndx"]]
small_data.head()

Unnamed: 0,userID,itemID,rating,userID_ndx,itemID_ndx
0,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,221111,99388
1,ak0TdVmGKo4pwqdJSTLwWw,bNZ3-0rse12NKdSVqQ30xw,4.0,221111,98066
2,ak0TdVmGKo4pwqdJSTLwWw,BVsIaKL-8QXVjt0Z9WoFWw,4.0,221111,31199
3,ak0TdVmGKo4pwqdJSTLwWw,4MClvr12OXBNvGu8h1yGpA,5.0,221111,13334
4,ak0TdVmGKo4pwqdJSTLwWw,2vH58mhkEl8GdcDug1OwWg,5.0,221111,9854


In [None]:
# Test of PyTorch Embedding Layer
# emb = torch.nn.Embedding(len(le.classes_), 64, max_norm=True)
# emb_i = torch.nn.Embedding(len(le_item.classes_), 64, max_norm=True)
# one_e = emb(torch.tensor(small_data["userID_ndx"]))
# two_e = emb(torch.tensor(small_data["itemID_ndx"]))

# cat = torch.cat((one_e, two_e), dim=1)

In [None]:
# MODEL PARAMS
EMBEDDING_SIZE = 16
HIDDEN_SIZE = 64

In [None]:
# Define the Neural Collaborative Filtering Model, based off this paper https://arxiv.org/pdf/1708.05031.pdf

# We are learning the embeddings during training, could look into other methods of computing embeddings and then feeding in
class NCF(nn.Module):
  """Neural Collaborative Filtering MLP (see https://arxiv.org/pdf/1708.05031.pdf).

  User and item indices are looked up in two learned embedding tables, the two
  embeddings are concatenated and passed through two ReLU hidden layers and a
  final one-unit layer. The output also goes through a ReLU, so predictions are
  never negative.

  Args:
    n_users: number of rows in the user embedding table. Defaults to
      ``len(le.classes_)`` (the notebook-level user LabelEncoder).
    n_items: number of rows in the item embedding table. Defaults to
      ``len(le_item.classes_)``.
    embedding_size: width of each embedding. Defaults to ``EMBEDDING_SIZE``.
    hidden_size: width of both hidden layers. Defaults to ``HIDDEN_SIZE``.

  Passing the sizes explicitly lets a saved state_dict be reloaded without
  first recreating the encoders and constants the model was trained with.
  """

  def __init__(self, n_users=None, n_items=None, embedding_size=None, hidden_size=None):
    # Seed before any layer is built so that weight initialisation is reproducible.
    torch.manual_seed(0)
    np.random.seed(0)
    super(NCF, self).__init__()

    # Fall back to the notebook globals only for sizes the caller did not supply.
    if n_users is None:
      n_users = len(le.classes_)
    if n_items is None:
      n_items = len(le_item.classes_)
    if embedding_size is None:
      embedding_size = EMBEDDING_SIZE
    if hidden_size is None:
      hidden_size = HIDDEN_SIZE

    self.user_emb = nn.Embedding(n_users, embedding_size)
    self.item_emb = nn.Embedding(n_items, embedding_size)
    self.fc1 = nn.Linear(embedding_size * 2, hidden_size)
    self.fc2 = nn.Linear(hidden_size, hidden_size)
    self.output = nn.Linear(hidden_size, 1)
    self.relu = nn.ReLU()

  def forward(self, data_tuple):
    """Predict one rating per (user, item) pair.

    Args:
      data_tuple: ``(userIDs, itemIDs)``, two index tensors of equal length.

    Returns:
      Tensor with one column and one row per input pair.
    """
    userIDs, itemIDs = data_tuple
    user_embeddings = self.user_emb(userIDs)
    item_embeddings = self.item_emb(itemIDs)
    # Concatenate along the feature axis: (N, E) + (N, E) -> (N, 2E).
    cat = torch.cat((user_embeddings, item_embeddings), dim=1)
    h1 = self.relu(self.fc1(cat))
    h2 = self.relu(self.fc2(h1))
    output = self.relu(self.output(h2))
    return output

def getGroupEmbed(trained_model, groupUserIDs, itemID=None):
    """Predict a rating for one item from the mean embedding of a group of users.

    Args:
        trained_model: model exposing ``user_emb``, ``item_emb``, ``fc1``,
            ``fc2``, ``output`` and ``relu`` (e.g. an ``NCF`` instance).
        groupUserIDs: tensor of user indices (``userID_ndx`` values) forming the group.
        itemID: tensor holding a single item index. When omitted, the
            notebook-level ``anitem`` tensor is used, as before.

    Returns:
        numpy array with one row and one column holding the predicted rating.
    """
    if itemID is None:
        # Backward-compatible fallback to the global this function used to read.
        itemID = anitem

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        rawgroup = trained_model.user_emb(groupUserIDs)
        # Average over the users; -1 keeps this correct for any embedding width
        # instead of assuming 16.
        groupembed = torch.mean(rawgroup, 0).reshape((1, -1))

        item_embeddings = trained_model.item_emb(itemID)
        cat = torch.cat((groupembed, item_embeddings), dim=1)

        h1 = trained_model.relu(trained_model.fc1(cat))
        h2 = trained_model.relu(trained_model.fc2(h1))
        output = trained_model.relu(trained_model.output(h2))
    return output.detach().cpu().numpy()



  

In [None]:
NCF_model = NCF()
print(NCF_model)

NCF(
  (user_emb): Embedding(365665, 16)
  (item_emb): Embedding(159108, 16)
  (fc1): Linear(in_features=32, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)


In [None]:
# NOTE(review): `fewusers` (and the `anitem` tensor that getGroupEmbed reads) are only
# assigned in later cells, so on a fresh kernel this call fails with NameError, as the
# recorded output shows. NCF_model is also still untrained at this point.
getGroupEmbed(NCF_model, fewusers)

NameError: ignored

In [None]:
# Main training loop, takes in X_train, y_train, model, and number of epochs
def training_loop(data_df, y_train, model, epochs=200):
  """Train ``model`` with full-batch Adam on an MSE loss and return it.

  Args:
    data_df: DataFrame with integer columns ``userID_ndx`` and ``itemID_ndx``.
    y_train: target ratings aligned with ``data_df`` (must expose ``.values``).
    model: module called as ``model((userIDs, itemIDs))``, returning one
      prediction per row.
    epochs: number of full-batch optimisation steps.

  Returns:
    The same ``model`` object, trained in place.
  """
  torch.manual_seed(0)
  np.random.seed(0)

  # Put the data on the device the model actually lives on. Checking only
  # torch.cuda.is_available() breaks when a CPU model is trained on a GPU host.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  userIDs = torch.tensor(data_df["userID_ndx"].values, dtype=torch.long, device=device)
  itemIDs = torch.tensor(data_df["itemID_ndx"].values, dtype=torch.long, device=device)
  # Targets stay on the model's device so predictions are not copied to the CPU each epoch.
  gt = torch.tensor(y_train.values, dtype=torch.float32, device=device)

  # Define Loss and Optimizer
  criterion = nn.MSELoss()
  optimizer = torch.optim.Adam(model.parameters())

  model.train()
  for epoch in range(epochs):
    optimizer.zero_grad()
    out = model((userIDs, itemIDs)).flatten()  # (N, 1) -> (N,)
    loss = criterion(out, gt)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
      # RMSE is the square root of the mean squared error already computed above.
      train_RMSE = float(torch.sqrt(loss.detach()))
      print("Train RMSE at Epoch %d: %.4f" % (epoch, train_RMSE))
  return model

In [None]:
# Test Loop, very similar to training, except with X_test, y_test. And takes in trained model
# Computes predictions on Test Data and outputs RMSE
def test_loop(data_df, y_test, trained_model):
  torch.manual_seed(0)
  np.random.seed(0)
  if torch.cuda.is_available():
    userIDs = torch.LongTensor(data_df["userID_ndx"].values).cuda()
    itemIDs = torch.LongTensor(data_df["itemID_ndx"].values).cuda()
  else:
    userIDs = torch.LongTensor(data_df["userID_ndx"].values)
    itemIDs = torch.LongTensor(data_df["itemID_ndx"].values)

  gt = torch.FloatTensor(y_test.values)

  out = trained_model((userIDs, itemIDs))
  if torch.cuda.is_available():
    out = out.cpu()
  out = out.flatten()
  test_RMSE = np.sqrt(np.sum(np.square((out - gt).detach().numpy())) / len(data_df))
  print("Test RMSE: %.4f" % (test_RMSE))
  return test_RMSE

In [None]:
# Preparing our dataset, 20% of data is test
from sklearn.model_selection import train_test_split
X = small_data[["userID_ndx", "itemID_ndx"]]
y = small_data["rating"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
torch.LongTensor(X_train["userID_ndx"].values)

tensor([ 96315, 293873, 163428,  ..., 180272, 260210,  15307])

In [None]:
# Initialize model and use GPU if available and running training loop
# 2 non linear with 64 dimension hidden
NCF_model = NCF()
if torch.cuda.is_available():
  NCF_model = NCF_model.cuda()
NCF_model = training_loop(X_train, y_train, NCF_model, 300)

Train RMSE at Epoch 0: 3.9270
Train RMSE at Epoch 10: 3.5234
Train RMSE at Epoch 20: 3.0607
Train RMSE at Epoch 30: 2.4364
Train RMSE at Epoch 40: 1.7553
Train RMSE at Epoch 50: 1.4818
Train RMSE at Epoch 60: 1.4642
Train RMSE at Epoch 70: 1.4007
Train RMSE at Epoch 80: 1.3902
Train RMSE at Epoch 90: 1.3766
Train RMSE at Epoch 100: 1.3642
Train RMSE at Epoch 110: 1.3540
Train RMSE at Epoch 120: 1.3441
Train RMSE at Epoch 130: 1.3349
Train RMSE at Epoch 140: 1.3261
Train RMSE at Epoch 150: 1.3175
Train RMSE at Epoch 160: 1.3093
Train RMSE at Epoch 170: 1.3013
Train RMSE at Epoch 180: 1.2936
Train RMSE at Epoch 190: 1.2860
Train RMSE at Epoch 200: 1.2787
Train RMSE at Epoch 210: 1.2715
Train RMSE at Epoch 220: 1.2645
Train RMSE at Epoch 230: 1.2577
Train RMSE at Epoch 240: 1.2510
Train RMSE at Epoch 250: 1.2444
Train RMSE at Epoch 260: 1.2380
Train RMSE at Epoch 270: 1.2316
Train RMSE at Epoch 280: 1.2254
Train RMSE at Epoch 290: 1.2193


In [None]:
# fewusers = torch.LongTensor(X_train["userID_ndx"].values[:5]).cuda()
fewusers = torch.LongTensor([20546, 63104, 41068, 15604, 15970]).cuda()
fewusers

tensor([20546, 63104, 41068, 15604, 15970], device='cuda:0')

In [None]:
fewusers

tensor([20546, 63104, 41068, 15604, 15970], device='cuda:0')

In [None]:
le.inverse_transform([20546, 63104, 41068, 15604, 15970]).tolist()

['BvcPaFl6N8aQWcdak2v_Sg',
 'b_-AmmH9I3lvhU7PANjFrw',
 'OhOgtmlIWSmikT25wcWBpA',
 '8q7-9Lv6NTlOLqnm5Yk0hg',
 '94u9RZbO2AKAGV-sXLjX4w']

In [None]:
",".join(le.inverse_transform([20546, 63104, 41068, 15604, 15970]).tolist())

'BvcPaFl6N8aQWcdak2v_Sg,b_-AmmH9I3lvhU7PANjFrw,OhOgtmlIWSmikT25wcWBpA,8q7-9Lv6NTlOLqnm5Yk0hg,94u9RZbO2AKAGV-sXLjX4w'

In [None]:
",".join(le_item.inverse_transform([41882, 4058, 32564]).tolist())

'GthqEQQNDl2z50-uC3obDg,0ja9ouEv_w8FWe1F5KMS4g,Cx8BotgsDzKFpH7zSmykkQ'

In [None]:
le_item.inverse_transform([41882]).item()

'GthqEQQNDl2z50-uC3obDg'

In [None]:
rawgroup = NCF_model.user_emb(fewusers)
print(rawgroup.shape)
print(rawgroup)
groupembed = torch.mean(rawgroup, 0)
groupembed = groupembed.reshape((1,16))

torch.Size([5, 16])
tensor([[-0.1359, -1.7267,  0.0988, -0.5761,  0.8728, -0.2803, -0.6304, -0.9812,
         -0.7600,  2.1858,  0.0536,  0.1042,  0.7313, -0.7874,  1.1681, -0.0246],
        [ 0.7069, -0.1732, -0.3700, -0.9830, -0.5454,  0.5245,  0.3611, -0.0510,
          1.4733, -0.3590,  0.0470,  1.0824,  1.5085, -0.3355, -0.2621, -0.4841],
        [ 0.4657,  0.0175, -0.6338,  1.0036, -0.6558,  0.4748,  1.0911, -0.5950,
          0.4349, -1.1988, -0.8353, -0.8726, -2.0432,  0.2561, -0.6501, -1.0592],
        [-0.0365, -0.5758,  0.5444, -2.7797,  0.8415,  0.3826,  0.5561,  0.6474,
          0.5197, -0.4897,  0.1441,  2.3153,  1.7026, -1.4898,  0.0815, -1.4824],
        [-0.1208, -0.0535, -1.4829, -0.3676, -0.0812,  1.5820, -1.1084, -0.6464,
          0.0588, -0.1174,  2.0579, -0.3083, -1.2872,  0.6968,  1.1739,  1.3332]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)


In [None]:
groupembed.shape

torch.Size([1, 16])

In [None]:
# anitem = torch.LongTensor(X_train["itemID_ndx"].values[:1]).cuda()
anitem = torch.LongTensor([41882]).cuda()
anitem

tensor([41882], device='cuda:0')

In [None]:
# CODE TO COMPUTE GROUP EMBEDDING FROM TRAINED MODEL AND THEN GET PREDICTION
rawgroup = NCF_model.user_emb(fewusers)
print(rawgroup.shape)
print(rawgroup)
groupembed = torch.mean(rawgroup, 0)
groupembed = groupembed.reshape((1,16))

item_embeddings = NCF_model.item_emb(anitem)
# print(item_embeddings.shape)
# print(groupembed.reshape((1,16)).shape)
cat = torch.cat((groupembed, item_embeddings), dim=1)
# print(cat.shape)
h1 = NCF_model.relu(NCF_model.fc1(cat))
h2 = NCF_model.relu(NCF_model.fc2(h1))
output = NCF_model.relu(NCF_model.output(h2))

In [None]:
output

tensor([[2.7579]], device='cuda:0', grad_fn=<ReluBackward0>)

In [None]:
newData = pd.DataFrame([("Bob", "xGXzsc-hzam-VArK6eTvtw", 2.0), ("Bob", "2ILe62hVJfiZEOcKxFYKDA", 5.0), ("Bob", "LQ8rprZFKPuhlj8ndEPb0w", 1.0), ("Bob", "TLka3vszLhYp2OZtRD6_8g", 4.0), ("Bob", "Y1-cXsWZ1v2rRj5MMtFsqA", 3.0)], columns=["userID", "itemID", "rating"])
newX = newData[["userID", "itemID"]]
newY = newData["rating"]
print(newX)
print(newY)

  userID                  itemID
0    Bob  xGXzsc-hzam-VArK6eTvtw
1    Bob  2ILe62hVJfiZEOcKxFYKDA
2    Bob  LQ8rprZFKPuhlj8ndEPb0w
3    Bob  TLka3vszLhYp2OZtRD6_8g
4    Bob  Y1-cXsWZ1v2rRj5MMtFsqA
0    2.0
1    5.0
2    1.0
3    4.0
4    3.0
Name: rating, dtype: float64


In [None]:
# Series.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat gives
# the same result and, like append, keeps each input's own index labels.
newusers = pd.concat([small_data["userID"], newData["userID"]])
newusers

0    RNm_RWkcd02Li2mKPRe7Eg
1    RNm_RWkcd02Li2mKPRe7Eg
2    RNm_RWkcd02Li2mKPRe7Eg
3    RNm_RWkcd02Li2mKPRe7Eg
4    RNm_RWkcd02Li2mKPRe7Eg
              ...          
0                       Bob
1                       Bob
2                       Bob
3                       Bob
4                       Bob
Name: userID, Length: 1157472, dtype: object

In [None]:
# NOTE(review): max(userID_ndx) + 1 is one past the largest index the user label
# encoder produced. NCF sizes its user embedding table with len(le.classes_), so this
# index appears to be out of range for the model - likely why the training cell that
# uses newData records a RuntimeError. TODO confirm.
newData["userID_ndx"] = np.max(small_data["userID_ndx"]) + 1
# NOTE(review): every new row receives the same (largest existing) item index rather
# than the index of its own itemID - confirm whether le_item.transform(newData["itemID"])
# was intended here.
newData["itemID_ndx"] = np.max(small_data["itemID_ndx"]) 

In [None]:
le.transform(["RNm_RWkcd02Li2mKPRe7Eg"])

array([32474])

In [None]:
newData

Unnamed: 0,userID,itemID,rating,userID_ndx,itemID_ndx
0,Bob,xGXzsc-hzam-VArK6eTvtw,2.0,73133,129770
1,Bob,2ILe62hVJfiZEOcKxFYKDA,5.0,73133,129770
2,Bob,LQ8rprZFKPuhlj8ndEPb0w,1.0,73133,129770
3,Bob,TLka3vszLhYp2OZtRD6_8g,4.0,73133,129770
4,Bob,Y1-cXsWZ1v2rRj5MMtFsqA,3.0,73133,129770


In [None]:
newX = newData[["userID_ndx", "itemID_ndx"]]
newY = newData["rating"]
print(newX)
print(newY)

   userID_ndx  itemID_ndx
0       73133      129770
1       73133      129770
2       73133      129770
3       73133      129770
4       73133      129770
0    2.0
1    5.0
2    1.0
3    4.0
4    3.0
Name: rating, dtype: float64


In [None]:
# NCF_model = NCF()
NCF_model = training_loop(newX, newY, NCF_model,1)

RuntimeError: ignored

In [None]:
# Run Test Loop
test_loop(X_test, y_test, NCF_model)

Test RMSE: 1.2777


1.2777408755460384

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
torch.save(NCF_model.state_dict(), 'ncf_all_statedict_64d300e.pt')

In [None]:
import dill as dill
torch.save(NCF_model, "./trainedyelprec_allData_64d300e.pt", pickle_module=dill)

In [None]:
NCF_model.eval()

NCF(
  (user_emb): Embedding(73133, 16)
  (item_emb): Embedding(129771, 16)
  (fc1): Linear(in_features=32, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
)

In [None]:
enc = OneHotEncoder(handle_unknown='ignore')

In [None]:
enc.fit(small_data[["userID"]])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [None]:
enc.categories_

[array(['---2PmXbF47D870stH1jqA', '---7FS-3SMy-cnnIPHcW9w',
        '--0DrQkM0FT-yCQRWw82uQ', ..., 'zzzCg-_lpyYE82TlJCdwdw',
        'zzz_yXv7Z_JRT3tQmEo5rw', 'zzzcuxFaP_FvdIB-fbP9iA'], dtype=object)]

In [None]:
small_data.head()

Unnamed: 0,userID,itemID,rating,userID_ndx,itemID_ndx
0,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,269348,64036
1,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,186784,1990
2,SFQ1jcnGguO0LYWnbbftAA,1OweB1FS_Q21b7IyqewGGQ,4.0,186784,4095
3,SFQ1jcnGguO0LYWnbbftAA,ss9d8bJaPBBFih4KYAC81w,4.0,186784,95732
4,SFQ1jcnGguO0LYWnbbftAA,3Y2rw1yonra6wEDiLadKhw,2.0,186784,7697


In [None]:
enc.transform([['---2PmXbF47D870stH1jqA']])

<1x408198 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [None]:
len(small_data)

811376

In [None]:
len(np.unique(small_data["userID"]))

408198

In [None]:
usr = torch.LongTensor([186784]).cuda()
item = torch.LongTensor([1990]).cuda()


In [None]:
NCF_model((usr, item))

tensor([[2.7993]], device='cuda:0', grad_fn=<ReluBackward0>)

In [None]:
from collections import Counter
Counter(small_data["userID"])

Counter({'RNm_RWkcd02Li2mKPRe7Eg': 20,
         'hn0ZbitvmlHnF--KJGJ6_A': 26,
         'JHXQEayrDHOWGexs0dCviA': 60,
         'd01SZKYmReEar7varZB0HQ': 352,
         'ffJp-ZN80M4sSkDL8Ra18w': 63,
         'SIP1Dfwy1M0HWzdgGg9H9g': 247,
         'pRPT3vqhqpU7kHgmKJamvw': 5,
         'Df1kUPqooB1FrnwvXi5HsQ': 11,
         '_soZ9DRjCF7Op7Us8bxSSA': 19,
         'LfyHQWb_kFyxbdNFb8kQGA': 89,
         'AyRa2eaaD2EOkSxdkSrNEA': 304,
         'u0x3SXagjYDbI2N4sgJ0Tw': 181,
         'rC7hTkOfJ91_LPSxk-ABlQ': 35,
         'kV7kmmBw_8IPLyI5M6wgTw': 20,
         'bUVFpVxZdnFOh2BZzCoLYw': 22,
         'YxvuZGAYcJIEQJmgtRCi3g': 34,
         'udclAXyUh3kka5uCXT0ITg': 59,
         '_zsiwKQoiAK6KiPUho1hFQ': 7,
         'pDIZYh9dIWKc2qUHO7j9Ww': 5,
         '0DmuCPKJ5l4otcf5ar_sew': 139,
         'EfD24GxaAAA4J79PvdxQGQ': 17,
         '80MUDP_Ny_J8jeShVxzdlw': 23,
         '4ym-LUiVrGQN49_7P_3uow': 5,
         'LZLTN6KxDEsJHEOJBgL-EQ': 24,
         '0iTlg0tKm2toM139TFYVCg': 19,
         'fiJOXfXKnQ-y0D

In [None]:
#Cold case functions
def cold_rating(items, user):
  """Return one placeholder rating per entry of ``items``.

  Each rating is a random integer from 1 to 5 drawn with ``np.random.randint``.
  ``user`` is accepted but not used yet.
  """
  # TODO: display the restaurant name for each item (get_details() in the main API)
  # and read the user's rating from a dropdown; random numbers stand in for now.
  return [np.random.randint(1, 6) for _ in items]

def new_user():
  """Return the ID of the cold-start user (fixed placeholder until real user input is wired in)."""
  placeholder_user_id = "bob"
  return placeholder_user_id

In [None]:
# Cold start: give a brand-new user placeholder ratings on the most-rated items
# and append those rows to a copy of the ratings table.

# Ten most frequently rated items (raw IDs, as a list of strings) and their encoded indices.
most_rated = small_data['itemID'].value_counts()[:10].index.tolist()
top_itemndx = le_item.transform(most_rated).tolist()

new_userID = new_user()
# Keep the index under its own name: rebinding `new_user` here would shadow the
# function and make this cell fail the second time it is run.
# NOTE(review): this reuses the largest *existing* user index, so the new user shares
# an embedding row with a real user - confirm that this is intended.
new_user_ndx = int(np.max(small_data["userID_ndx"]))
new_ratings = cold_rating(most_rated, new_userID)

# Build all new rows at once and concatenate a single time. DataFrame.append was
# removed in pandas 2.0, and calling it in a loop copied the full table on every pass.
# Indices stay integers so the *_ndx columns remain usable with torch.LongTensor.
cold_rows = pd.DataFrame({
  'userID': new_userID,
  'itemID': most_rated,
  'rating': new_ratings,
  'userID_ndx': new_user_ndx,
  'itemID_ndx': top_itemndx,
})
trial = pd.concat([small_data, cold_rows], ignore_index=True)

# The new rows are at the end of the frame, so show the tail rather than the head.
trial.tail(len(most_rated))