In [2]:
import pandas as pd
import numpy as np
import re

## NCF Paper data

In [6]:
# NOTE(review): machine-specific absolute path -- point this at the local
# directory that holds the cleaned ratings file before running.
NCF_DATA_PATH="/home/kolassc/Desktop/ucsd_course_materials/CSE291/project/code/"

# The file is read as header-less, so the four column names are assigned by hand.
df_ratings = pd.read_csv(NCF_DATA_PATH+"cleaned_data_u20_enc.csv", sep=",", header=None)
df_ratings.columns = ["UserID","ItemID","Rating","Timestamp"]

# Every item id present in the full ratings table (computed before any split).
train_items = set(df_ratings.ItemID.values)

# (distinct users, distinct items). Kept as the LAST expression of the cell so
# the notebook actually displays it; in the middle of the cell it was silently
# computed and discarded.
len(df_ratings.UserID.unique()), len(df_ratings.ItemID.unique())

In [7]:
# Preview the first rows to confirm the four column names were applied.
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,1,1,4.0,1124582400
1,1,2,5.0,1253059200
2,1,3,3.0,992995200
3,1,4,1.0,1120089600
4,1,5,1.0,1229472000


In [8]:
# Reorder the ratings newest-first (largest Timestamp on top); the hold-out
# selection further down walks the frame in this order.
df_ratings = df_ratings.sort_values(by="Timestamp", ascending=False)

# Show the most recent ratings as the cell output.
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
928397,15715,75118,5.0,1406073600
751548,12829,39739,3.0,1406073600
595123,10061,6929,5.0,1406073600
358911,5992,17361,5.0,1406073600
224826,3729,59570,5.0,1406073600


In [9]:
from collections import defaultdict

# Number of rows to hold out per user.
count_per_user = 1

# df_ratings has been sorted by Timestamp descending, so the first
# `count_per_user` rows of each user's group are that user's latest ratings.
# GroupBy.head returns those rows in the frame's existing order, which is the
# same selection (and the same ordering) the row-by-row iterrows() scan
# produced -- without a Python-level loop over every rating.
_held_out = df_ratings.groupby("UserID", sort=False).head(count_per_user)

# Index labels of the held-out rows, in newest-first order.
rows = _held_out.index.tolist()

# Held-out rows per user; still a defaultdict(int) so unknown users read as 0.
user_counts = defaultdict(int, _held_out["UserID"].value_counts().to_dict())

In [10]:
# Test split: exactly the held-out index labels collected in `rows`.
df_test_ratings = df_ratings.loc[rows]

# Train split: every remaining index label. The labels pass through a set
# difference, so the row order of the training frame follows set iteration
# order rather than the timestamp ordering of df_ratings.
train_index = list(set(df_ratings.index)-set(df_test_ratings.index))
df_train_ratings = df_ratings.loc[train_index]

In [12]:
# Preview the held-out (test) rows.
df_test_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
928397,15715,75118,5.0,1406073600
751548,12829,39739,3.0,1406073600
595123,10061,6929,5.0,1406073600
358911,5992,17361,5.0,1406073600
224826,3729,59570,5.0,1406073600


In [13]:
# Preview the remaining (train) rows.
df_train_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,1,1,4.0,1124582400
1,1,2,5.0,1253059200
2,1,3,3.0,992995200
3,1,4,1.0,1120089600
4,1,5,1.0,1229472000


# Write both splits as CSV; index=False leaves the row index out of the files.
for split_frame, csv_path in (
    (df_test_ratings, "data/movielens.test.ratings"),
    (df_train_ratings, "data/movielens.train.ratings"),
):
    split_frame.to_csv(csv_path, index=False)

### Implicit Feedback and Negative Sampling

In [14]:
# How many negative items are drawn for every observed (positive) rating.
num_negs_per_positive = 5

# Largest item id in the data; used as the exclusive upper bound when sampling.
num_items = df_ratings["ItemID"].max()

# user id -> set of item ids already associated with that user.
# It starts out holding every rated (positive) item.
user_item_pairs = defaultdict(set)
for user_id, item_id in df_ratings[["UserID", "ItemID"]].values:
    user_item_pairs[user_id].add(item_id)

def get_negs(u):
    """Draw `num_negs_per_positive` negative item ids for user `u`.

    Candidates are sampled uniformly from ``[0, num_items)`` and redrawn
    until one is found that is not already in ``user_item_pairs[u]``.
    Every accepted id is added to that set, so the same negative is never
    handed out twice for one user, including across calls.

    Args:
        u: user id, used as the key into the module-level ``user_item_pairs``.

    Returns:
        list of the sampled item ids, of length ``num_negs_per_positive``.

    Raises:
        ValueError: if every id in ``[0, num_items)`` is already taken for
            ``u``, so no further negative can be drawn.

    NOTE(review): ``num_items`` is the maximum ItemID, so the id equal to
    ``num_items`` is never sampled while 0 can be -- confirm whether item ids
    are 0- or 1-based.
    """
    taken = user_item_pairs[u]
    negs = []
    for _ in range(num_negs_per_positive):
        # Without this guard the rejection loop below would spin forever once
        # the user's set covers the whole sampling range.
        if len(taken) >= num_items and all(i in taken for i in range(num_items)):
            raise ValueError("no unseen items left to sample for user %r" % (u,))
        j = np.random.randint(num_items)
        # The set stores bare item ids, so membership has to be tested on `j`
        # itself; testing the tuple (u, j) never matched and let positives
        # (and repeated negatives) through.
        while j in taken:
            j = np.random.randint(num_items)
        negs.append(j)
        taken.add(j)
    return negs

# Attach a list of sampled negatives to every row. The test frame is handled
# first and the train frame second; the order matters because get_negs draws
# from the global NumPy random state and records each draw in user_item_pairs.
for ratings_split in (df_test_ratings, df_train_ratings):
    ratings_split["Negatives"] = ratings_split["UserID"].apply(get_negs)

In [15]:
# Preview the test rows together with their new "Negatives" column.
df_test_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp,Negatives
928397,15715,75118,5.0,1406073600,"[10038, 89863, 17955, 16054, 22450]"
751548,12829,39739,3.0,1406073600,"[38415, 73755, 90351, 95308, 70427]"
595123,10061,6929,5.0,1406073600,"[114311, 47601, 41245, 43836, 35527]"
358911,5992,17361,5.0,1406073600,"[88349, 53458, 107847, 68230, 112527]"
224826,3729,59570,5.0,1406073600,"[49037, 85798, 75003, 27870, 58881]"


In [21]:
# Persist both splits (including the list-valued "Negatives" column) as pickles.
# NOTE(review): these files are named amazon.* while the CSV export in this
# notebook uses movielens.* -- confirm which dataset name is intended.
for split_frame, pickle_path in (
    (df_test_ratings, "data/amazon.test.data"),
    (df_train_ratings, "data/amazon.train.data"),
):
    split_frame.to_pickle(pickle_path)

In [18]:
# List the contents of the data/ output directory.
!ls data/

amazon.test.data  amazon.train.data


In [19]:
# Row counts of the two splits, as (train, test).
len(df_train_ratings), len(df_test_ratings)

(937541, 16141)