In [1]:
import os
import boto3
import sagemaker
import numpy as np
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Read both raw tables from the local dataset folder.
ratings_df = pd.read_csv('./dataset/Ratings.csv', low_memory=False)
books_df = pd.read_csv('./dataset/Books.csv', low_memory=False)

# The cover-image URL columns are not used anywhere below.
books_df = books_df.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

# Attach book metadata to each rating, discard rows with any missing value
# (while the metadata columns are still present), then keep only the
# user id, rating and title columns.
df = (
    ratings_df
    .merge(books_df, how='inner', on='ISBN')
    .dropna()
    .drop(columns=['ISBN', 'Book-Author', 'Year-Of-Publication', 'Publisher'])
)

In [3]:
# Preview the first ten rows of the merged ratings/books frame.
df.head(10)

Unnamed: 0,User-ID,Book-Rating,Book-Title
0,276725,0,Flesh Tones: A Novel
1,2313,5,Flesh Tones: A Novel
2,6543,0,Flesh Tones: A Novel
3,8680,5,Flesh Tones: A Novel
4,10314,9,Flesh Tones: A Novel
5,23768,0,Flesh Tones: A Novel
6,28266,0,Flesh Tones: A Novel
7,28523,0,Flesh Tones: A Novel
8,39002,0,Flesh Tones: A Novel
9,50403,9,Flesh Tones: A Novel


In [34]:
# Number of ratings per title. The single column of this frame is named
# "Book-Title" on pandas < 2.0 and "count" on pandas >= 2.0, so it is selected
# by position instead of by label to work on both.
comment_counts = pd.DataFrame(df["Book-Title"].value_counts())
ratings_per_title = comment_counts.iloc[:, 0]

# Titles with fewer than 250 ratings are considered rare and are filtered out.
rare_books = comment_counts[ratings_per_title < 250].index

# Rename by label rather than by position so the mapping cannot silently
# mislabel columns if their order changes, and take an explicit copy so later
# column assignments act on an independent frame (no SettingWithCopyWarning).
common_books = (
    df.loc[~df["Book-Title"].isin(rare_books)]
    .rename(columns={'User-ID': 'userId', 'Book-Rating': 'rating', 'Book-Title': 'book'})
    .loc[:, ['userId', 'book', 'rating']]
    .copy()
)

common_books

Unnamed: 0,userId,book,rating
62,276727,The Notebook,0
63,278418,The Notebook,0
64,638,The Notebook,0
65,3363,The Notebook,0
66,7158,The Notebook,10
...,...,...,...
1025422,264317,The Queen of the Damned (Vampire Chronicles (P...,0
1026724,266865,The Catcher in the Rye,10
1028777,271284,The Rainmaker,0
1029070,271705,Fahrenheit 451,0


In [35]:
from sklearn.preprocessing import LabelEncoder

# Replace each book title with an integer id. `encoder` keeps the fitted
# title <-> id mapping (see `encoder.classes_` / `encoder.inverse_transform`).
encoder = LabelEncoder()

# `assign` builds a new frame instead of writing a column into a frame that
# was derived by filtering, which avoids pandas' SettingWithCopyWarning.
common_books = common_books.assign(book=encoder.fit_transform(common_books['book']))

common_books

Unnamed: 0,userId,book,rating
62,276727,146,0
63,278418,146,0
64,638,146,0
65,3363,146,0
66,7158,146,10
...,...,...,...
1025422,264317,154,0
1026724,266865,116,10
1028777,271284,155,0
1029070,271705,41,0


In [36]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
df_train, df_test = train_test_split(common_books, test_size=0.2, random_state=42)

In [37]:
def negative_sampling(user_ids, book, items, n_neg, seed=None):
    """Create n_neg negative (rating 0) samples for every positive pair.

    For each unique (user, book) pair, book ids are drawn uniformly at random
    from `items` and kept only when the user has no recorded interaction with
    them among the supplied pairs. Users who have already interacted with
    every id in `items` have no valid negative; they are skipped (with a
    warning) instead of resampling forever.

    @param user_ids: sequence of user ids, aligned with `book`
    @param book: sequence of book ids, aligned with `user_ids`
    @param items: unique list of book ids to draw negatives from
    @param n_neg: number of negative labels to sample per positive pair
    @param seed: optional integer seed for reproducible sampling; when None
        (the default) NumPy's global random state is used

    @return df_neg: negative sample dataframe with columns
        ['userId', 'book', 'rating'] where rating is always 0
    """
    import warnings

    items = np.asarray(items)
    # With a seed, use a private generator so results do not depend on (or
    # disturb) the global NumPy random state; without one, keep using the
    # global state exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    records = set(zip(user_ids, book))
    item_pool = set(items.tolist())

    # Collect the books each user already interacted with so that users with
    # no remaining candidate can be detected before the rejection loop.
    seen = {}
    for u, i in records:
        seen.setdefault(u, set()).add(i)
    exhausted = {u for u, books_seen in seen.items() if item_pool <= books_seen}

    neg = []
    # for every positive label case
    for (u, i) in records:
        if u in exhausted:
            # every candidate is a positive for this user: nothing to sample
            continue
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = rng.choice(items)
            # resample while the drawn book is a known positive for the user
            while (u, j) in records:
                j = rng.choice(items)
            neg.append([u, j, 0])

    if exhausted:
        warnings.warn(
            f'skipped {len(exhausted)} user(s) with no unseen item to sample from'
        )

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['userId', 'book', 'rating'])

    return df_neg

# create negative samples for training set:
# 5 negatives per unique positive (user, book) pair, with candidates drawn from
# every book id in `common_books` (not only those present in the training split)
# NOTE(review): only the training pairs are passed as known positives, so a
# sampled "negative" may coincide with a held-out test interaction -- confirm
# that this is intended.
neg_train = negative_sampling(
    user_ids=df_train.userId.values, 
    book=df_train.book.values,
    items=common_books.book.unique(),
    n_neg=5
)

In [38]:
# Report how many negative rows were generated (thousands separator via `:,`).
print(f'created {neg_train.shape[0]:,} negative samples')

created 295,375 negative samples


In [39]:
# Every observed interaction becomes a positive example (rating = 1),
# whatever the original rating value was; only the id columns are kept.
# NOTE: this cell is not idempotent -- running it a second time relabels the
# previously appended negatives as positives and appends `neg_train` again.
# Re-run the split and sampling cells first if it has to be re-executed.
label_cols = ['userId', 'book']

df_test = df_test[label_cols].assign(rating=1)

# Training data = positives followed by the sampled negatives, re-indexed 0..n-1.
df_train = pd.concat(
    [df_train[label_cols].assign(rating=1), neg_train],
    ignore_index=True,
)

In [40]:
def get_unique_count(df):
    """Return the number of distinct users and distinct books in `df`.

    @param df: dataframe with `userId` and `book` columns

    @return: tuple (unique user count, unique book count)
    """
    n_users = df['userId'].nunique()
    n_books = df['book'].nunique()
    return n_users, n_books

In [41]:
# (unique users, unique books) in the filtered data before the train/test split
get_unique_count(common_books)

(23917, 186)

In [42]:
# Note: despite the "shape" label, the printed tuples are
# (unique users, unique books), not DataFrame shapes.
print('training set shape', get_unique_count(df_train))
print('testing set shape', get_unique_count(df_test))

training set shape (20695, 186)
testing set shape (8138, 186)


In [56]:
# number of unique users and number of unique items (books) in the training set
# NOTE(review): the counts come from `df_train` only, and `userId` values are
# not re-indexed anywhere in this notebook (only `book` is label-encoded). If
# the training notebook uses `n_user` to size a lookup/embedding table, raw
# user ids may exceed it -- confirm against that notebook.
n_user, n_item = get_unique_count(df_train)

print("number of unique users", n_user)
print("number of unique items", n_item)

# save the variables for the model training notebook
# -----
# read about the `store` magic here:
# https://ipython.readthedocs.io/en/stable/config/extensions/storemagic.html

%store n_user
%store n_item

number of unique users 20695
number of unique items 186
Stored 'n_user' (int)
Stored 'n_item' (int)


In [44]:
# S3 bucket used as the default bucket of the SageMaker session below
bucket = 'gcu-ml2-005-bucket'
sess = sagemaker.Session(
    default_bucket = bucket
)

# persist the bucket name with the `store` magic (presumably so the model
# training notebook can restore it, like `n_user` / `n_item`)
%store bucket

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Stored 'bucket' (str)


In [53]:
# save data locally first
dest = 'book'

# Create the output directory up front: `to_csv` does not create missing
# parent directories, so on a fresh checkout the writes below would fail.
# `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(dest, exist_ok=True)

train_path = os.path.join(dest, 'train.csv')
test_path = os.path.join(dest, 'test.csv')

# Written without header or index: each line is just userId,book,rating.
df_train.to_csv(train_path, header=False, index=False)
df_test.to_csv(test_path, header=False, index=False)


In [54]:
# Show where the training CSV was written locally.
print(train_path)

book/train.csv


In [55]:
# upload to S3 bucket (see the bucket name above)
# Reuse the paths the CSVs were written to instead of repeating them as
# literals, so the upload cannot drift from the save location if `dest` changes.
# The last call's return value (the S3 URI of the test file) is the cell output.
sess.upload_data(path=train_path, key_prefix='data')
sess.upload_data(path=test_path, key_prefix='data')

's3://gcu-ml2-005-bucket/data/test.csv'

In [21]:
# NOTE(review): `encoded` is not assigned in any cell visible in this notebook,
# and this cell's execution count (21) is older than the cells around it --
# presumably a leftover from an earlier session. On a fresh kernel this would
# likely raise NameError; confirm, then define `encoded` or remove the cell.
encoded

<74673x186 sparse matrix of type '<class 'numpy.float64'>'
	with 74673 stored elements in Compressed Sparse Row format>