In [1]:
import pandas as pd
import re
import numpy as np

In [7]:
df_movies_features = pd.read_pickle("movie_features.pkl")

In [11]:
movie_ids = set(df_movies_features["movieId"].unique())

In [20]:
df_ratings = pd.read_csv("ml-20m/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [22]:
# filter by movieId
df_ratings = df_ratings[df_ratings["movieId"].apply(lambda x: x in movie_ids)]

In [57]:
# fitler by timestamp
#2005 to #2015
df_ratings2 = df_ratings[df_ratings["timestamp"]>=1356998400]

In [58]:
len(df_ratings2.userId.unique())

14182

In [59]:
users_set = set((df_ratings2.groupby("userId").count()["movieId"]>=20).index)

In [60]:
df_ratings2 = df_ratings2[df_ratings2["userId"].apply(lambda x: x in users_set)]

In [61]:
df_ratings2.shape

(1047762, 4)

In [64]:
len(df_ratings2.movieId.unique()),len(df_ratings2.userId.unique())

(9127, 14182)

In [92]:
df_ratings2.columns=["UserID","ItemID","Rating","Timestamp"]

In [93]:
df_ratings2.to_csv("movielense20M.ratings.data.csv", index=None)
df_ratings2.to_pickle("movielense20M.ratings.data.pkl")

### Test-train split

In [81]:
df = df_ratings2
df.columns=["UserID","ItemID","Rating","Timestamp"]

In [82]:
df = df.sort_values("Timestamp", ascending=False)
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [83]:
from collections import defaultdict

count_per_user = 1
count_per_user_valid = 0
user_counts = defaultdict(int)
rows = []
rows_valid = []
for index, row in df.iterrows():
    if user_counts[row["UserID"]] < count_per_user:
        rows += [index]
        user_counts[row["UserID"]] +=1
    elif user_counts[row["UserID"]] < count_per_user+count_per_user_valid:
        rows_valid += [index]
        #user_counts[row["UserID"]] +=1

In [84]:
df_test_ratings = df.loc[rows]
df_train_ratings = df.loc[list(set(df.index)-set(df_test_ratings.index))]

In [85]:
df_test_ratings = df_test_ratings.sort_values(["UserID","ItemID"])
df_test_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
3537,31,364,3.0,1424844807
4504,42,5952,4.5,1411668080
4965,49,2858,3.5,1367549529
7000,59,1,4.5,1380400521
8037,71,104337,3.5,1417454493


In [86]:
df_train_ratings = df_train_ratings.sort_values(["UserID","ItemID"])
df_train_ratings.head()

Unnamed: 0,UserID,ItemID,Rating,Timestamp
3534,31,1,3.0,1424733487
3535,31,110,5.0,1424733473
3538,31,527,0.5,1424733598
3539,31,588,3.0,1424842869
3549,31,2571,4.5,1424736532


In [87]:
df_test_ratings.to_pickle("movielens20M.test.data")
df_train_ratings.to_pickle("movielens20M.train.data")

In [80]:
!ls movielens20M*

movielens20M.test.data
movielens20M.train.data


### Negatives

In [91]:
%%time
user_item_pairs = defaultdict(set)

for x in df[["UserID", "ItemID"]].values:
    # add all +ve to set
    user_item_pairs[x[0]].add(x[1])

num_items = df["ItemID"].max()

def get_negs(u):
    negs = []
    for _i in range(num_negs_per_positive):
        # generate negatives and add to dict
        j = np.random.randint(num_items)
        while (u,j) in user_item_pairs[u]:
            j = np.random.randint(num_items)
        negs += [j]
        #user_item_pairs[u].add(j)
    return negs

num_negs_per_positive = 99
df_test_ratings["Negatives"] = df_test_ratings["UserID"].apply(lambda x: get_negs(x))
#num_negs_per_positive = 99
#df_valid_ratings["Negatives"] = df_valid_ratings["UserID"].apply(lambda x: get_negs(x))
for x in df_test_ratings[["UserID", "ItemID"]].values:
    # add all +ve to set
    user_item_pairs[x[0]].add(x[1])

num_negs_per_positive = 7
df_train_ratings["Negatives"] = df_train_ratings["UserID"].apply(lambda x: get_negs(x))

Wall time: 21.5 s


In [106]:
'''
Created on Aug 8, 2016
Processing datasets. 


'''
import scipy.sparse as sp
import numpy as np
import pandas as pd
from collections import defaultdict
import copy
class Dataset(object):
    '''
    classdocs
    '''

    def __init__(self, path, prep_data=False,count_per_user_test=1,count_per_user_validation=0,num_negatives_train=4):
        '''
        Constructor
        '''
        if prep_data:
            self.trainData = self.load_file(path)
            self.split_train_test(count_per_user_test=count_per_user_test,count_per_user_validation=count_per_user_validation)
            self.negative_sampling(num_negatives_train=num_negatives_train)
        else:
            self.trainData = self.load_file(path + ".train.data.original")
            self.testData = self.load_file(path + ".test.data.original")
            #self.validData = self.load_file(path + ".valid.data")
            self.validData = self.testData
        
        
        self.num_users = self.trainData["UserID"].max()+1
        self.num_items = self.trainData["ItemID"].max()+1
        #self.trainData = self.trainData.sample(1000)
        
    def load_file(self, filename):        
        return pd.read_pickle(filename)
    
    def split_train_test(self,count_per_user_test=1,count_per_user_validation=0):
        df = self.trainData
        df = df.sort_values("Timestamp", ascending=False)
       
        user_counts = defaultdict(int)
        rows = []
        rows_valid = []
        for index, row in df.iterrows():
            if user_counts[row["UserID"]] < count_per_user_test:
                rows += [index]
                user_counts[row["UserID"]] +=1
            elif user_counts[row["UserID"]] < count_per_user_test+count_per_user_validation:
                rows_valid += [index]
                user_counts[row["UserID"]] +=1
            
        df_test_ratings = df.loc[rows]
        df_validation_ratings = df.loc[rows_valid]
        df_train_ratings = df.loc[list(set(df.index)-set(rows)-set(rows_valid))]
    
        self.trainData = df_train_ratings
        self.testData  = df_test_ratings
        self.validData = df_validation_ratings
    
    def negative_sampling(self, num_negatives_train=4):
        user_item_pairs = defaultdict(set)
        
        for x in self.testData[["UserID", "ItemID"]].values:
            user_item_pairs[x[0]].add(x[1])
            
        for x in self.trainData[["UserID", "ItemID"]].values:
            user_item_pairs[x[0]].add(x[1])
            
        for x in self.validData[["UserID", "ItemID"]].values:
            user_item_pairs[x[0]].add(x[1])
        
        
        num_items = self.trainData["ItemID"].max()+1
        user_item_pairs2 = copy.deepcopy(user_item_pairs)
        
        def get_negs(u):
            negs = []
            for _i in range(num_negs_per_positive):
                # generate negatives and add to dict
                j = np.random.randint(num_items)
                while (u,j) in user_item_pairs[u]:
                    j = np.random.randint(num_items)
                negs += [j]
                #user_item_pairs[u].add(j)
                user_item_pairs2[u].add(j)
            return negs
        
        num_negs_per_positive = 99
        self.testData["Negatives"] = self.testData["UserID"].apply(lambda x: get_negs(x))
        user_item_pairs = user_item_pairs2
        user_item_pairs2 = copy.deepcopy(user_item_pairs)
        
        
        num_negs_per_positive = 99
        self.validData["Negatives"] = self.validData["UserID"].apply(lambda x: get_negs(x))
        user_item_pairs = user_item_pairs2
        user_item_pairs2 = copy.deepcopy(user_item_pairs)
    
        num_negs_per_positive = num_negatives_train
        self.trainData["Negatives"] = self.trainData["UserID"].apply(lambda x: get_negs(x))
        
    def save(self, path):
        self.trainData.to_pickle(path+".train.data")
        self.testData.to_pickle(path+".test.data")
        self.validData.to_pickle(path+".valid.data")

In [107]:
d = Dataset("movielense20M.ratings.data.pkl", prep_data=True, count_per_user_validation=1)