In [1]:
import pandas as pd
import numpy as np
import re

## Movie Lens original data

In [2]:
DATA_PATH="ml-1m/ml-1m/"
df_movies = pd.read_csv(DATA_PATH+"movies.dat", sep="::", header=None)
df_movies.columns = ["ItemID","Title","Genres"]
df_movies["Year"] = df_movies.Title.apply(lambda x: re.match(".*\((\w+)\)", x)[1])
df_movies.head()

  


Unnamed: 0,ItemID,Title,Genres,Year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [3]:
map_occ = {0:  "other",
1:  "academic/educator",
2:  "artist",
3:  "clerical/admin",
4:  "college/grad student",
5:  "customer service",
6:  "doctor/health care",
7:  "executive/managerial",
8:  "farmer",
9:  "homemaker",
10:  "K-12 student",
11:  "lawyer",
12:  "programmer",
13:  "retired",
14:  "sales/marketing",
15:  "scientist",
16:  "self-employed",
17:  "technician/engineer",
18:  "tradesman/craftsman",
19:  "unemployed",
20:  "writer"}



df_users = pd.read_csv(DATA_PATH+"users.dat", sep="::", header=None)
df_users.columns = ["UserID","Gender","Age","Occupation","Zip-code"]
#df["Year"] = df.Title.applylambda x: re.match".*\\w+)\)", x)[1])
df_users["Occupation2"] = df_users.Occupation.apply(lambda x: map_occ[x])
df_users.head(10)



Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,Occupation2
0,1,F,1,10,48067,K-12 student
1,2,M,56,16,70072,self-employed
2,3,M,25,15,55117,scientist
3,4,M,45,7,2460,executive/managerial
4,5,M,25,20,55455,writer
5,6,F,50,9,55117,homemaker
6,7,M,35,1,6810,academic/educator
7,8,M,25,12,11413,programmer
8,9,M,25,17,61614,technician/engineer
9,10,F,35,1,95370,academic/educator


In [4]:
df_ratings = pd.read_csv(DATA_PATH+"/ratings.dat", sep="::", header=None)
df_ratings.columns = ["UserID","ItemID","Rating","Timestamp"]
df_ratings["time"] = df_ratings.Timestamp.apply(lambda x: pd.datetime.fromtimestamp(x))


  """Entry point for launching an IPython kernel.


In [5]:
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp,time
0,1,1193,5,978300760,2000-12-31 14:12:40
1,1,661,3,978302109,2000-12-31 14:35:09
2,1,914,3,978301968,2000-12-31 14:32:48
3,1,3408,4,978300275,2000-12-31 14:04:35
4,1,2355,5,978824291,2001-01-06 15:38:11


## NCF Paper data

In [6]:
NCF_DATA_PATH="neucolfilt/neural_collaborative_filtering/Data/"
df = pd.read_csv(NCF_DATA_PATH+"ml-1m.train.rating", sep="\t", header=None)
df.columns = ["UserID","ItemID","Rating","Timestamp"]
len(df.UserID.unique()), len(df.ItemID.unique())
train_items = set(df.ItemID.values)

In [7]:
df = pd.read_csv(NCF_DATA_PATH+"ml-1m.test.rating", sep="\t", header=None)
df.columns = ["UserID","ItemID","Rating","Timestamp"]
len(df.UserID.unique()), len(df.ItemID.unique())
test_items = set(df.ItemID.values)

In [8]:
test_items-train_items

{1805, 3569}

## Movie Lens Data - split train test data

In [9]:
# generate test data - ratings
df_ratings = pd.read_csv(DATA_PATH+"ratings.dat", sep="::", header=None)
df_ratings.columns = ["UserID","ItemID","Rating","Timestamp"]
#df_ratings["time"] = df_ratings.Timestamp.apply(lambda x: pd.datetime.fromtimestamp(x))

  


df1 = df_ratings.groupby("UserID").count()
users = set(df1[df1["ItemID"]>=300].index)
df_ratings["condition"] = df_ratings["UserID"].apply(lambda x: True if x in users else False)
df_ratings[df_ratings["condition"]]

In [10]:
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
df_ratings = df_ratings.sort_values("Timestamp", ascending=False)
df_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
825603,4958,1924,4,1046454590
825724,4958,3264,4,1046454548
825731,4958,2634,3,1046454548
825438,4958,1407,5,1046454443
825793,4958,2399,1,1046454338


In [12]:
from collections import defaultdict

count_per_user = 1
user_counts = defaultdict(int)
rows = []
for index, row in df_ratings.iterrows():
    if user_counts[row["UserID"]] < count_per_user:
        rows += [index]
        user_counts[row["UserID"]] +=1

In [13]:
df_test_ratings = df_ratings.loc[rows]
df_train_ratings = df_ratings.loc[list(set(df_ratings.index)-set(df_test_ratings.index))]

In [14]:
df_test_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
825603,4958,1924,4,1046454590
879288,5312,3267,4,1046444711
983967,5948,3098,4,1046437932
511804,3158,2648,4,1046393499
59497,403,1036,4,1046388675


In [15]:
df_train_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


df_test_ratings.to_csv("data/movielens.test.ratings", index=False)
df_train_ratings.to_csv("data/movielens.train.ratings", index=False)

### Implict Feedback and Negative Sampling

In [16]:
num_negs_per_positive = 5
user_item_pairs = defaultdict(set)

for x in df_ratings[["UserID", "ItemID"]].values:
    # add all +ve to set
    user_item_pairs[x[0]].add(x[1])

num_items = df_ratings["ItemID"].max()

def get_negs(u):
    negs = []
    for _i in range(num_negs_per_positive):
        # generate negatives and add to dict
        j = np.random.randint(num_items)
        while (u,j) in user_item_pairs[u]:
            j = np.random.randint(num_items)
        negs += [j]
        user_item_pairs[u].add(j)
    return negs

df_test_ratings["Negatives"] = df_test_ratings["UserID"].apply(lambda x: get_negs(x))
df_train_ratings["Negatives"] = df_train_ratings["UserID"].apply(lambda x: get_negs(x))

In [17]:
df_test_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp,Negatives
825603,4958,1924,4,1046454590,"[3930, 2070, 980, 2977, 1238]"
879288,5312,3267,4,1046444711,"[1379, 2383, 3668, 1935, 1565]"
983967,5948,3098,4,1046437932,"[667, 2136, 1886, 359, 3816]"
511804,3158,2648,4,1046393499,"[1534, 3423, 3636, 1593, 189]"
59497,403,1036,4,1046388675,"[3613, 1917, 330, 1437, 1365]"


In [18]:
df_test_ratings.to_csv("data/movielens.test.data", index=False)
df_train_ratings.to_csv("data/movielens.train.data", index=False)

In [23]:
!ls data/

movielens.test.data
movielens.train.data


In [22]:
len(df_train_ratings), len(df_test_ratings)

(994169, 6040)