In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Data preperation
# dataset books
books = pd.read_csv("./dataset/Books.csv", low_memory=False)
ratings = pd.read_csv("./dataset/Ratings.csv", low_memory=False)
users = pd.read_csv("./dataset/Users.csv", low_memory=False)

# drop unnecessary columns
books.drop(axis=1, labels=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)

# dataset books
print(books.head())

# dataset ratings
print(ratings.head())

# dataset users
print(users.head())

print('Number of book data:', len(books.ISBN.unique()))
print('Total book rating data from readers:', len(ratings.ISBN.unique()))
print('Amount of user data:', len(users['User-ID'].unique()))
# Grouping Book-Author' and count the number of books written by each author

author_counts = books.groupby('Book-Author')['Book-Title'].count()
# Sort authors in descending order
sorted_authors = author_counts.sort_values(ascending=False)
# Select the top 10 authors
top_10_authors = sorted_authors.head(10)

         ISBN                                         Book-Title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

            Book-Author Year-Of-Publication                   Publisher  
0    Mark P. O. Morford                2002     Oxford University Press  
1  Richard Bruce Wright                2001       HarperFlamingo Canada  
2          Carlo D'Este                1991             HarperPerennial  
3      Gina Bari Kolata                1999        Farrar Straus Giroux  
4       E. J. W. Barber                1999  W. W. Norton &amp; Company  
   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3  

In [2]:

# Data preprocessing
# Merging Files and Determining the Total Number of Ratings
# Merging dataframe ratings with books based on ISBN values
books = pd.merge(ratings, books, on='ISBN', how='left')
print(books)

# Checking missing value using isnull() function
print(books.isnull().sum())

all_books_clean = books.dropna()
print(all_books_clean.isnull().sum())

# Sort books by ISBN then put them in the fix_books variable
fix_books = all_books_clean.sort_values('ISBN', ascending=True)
print(fix_books)

preparation = fix_books.drop_duplicates('ISBN')
print(preparation)

# convert the 'ISBN' data series into list form
isbn_id = preparation['ISBN'].tolist()

# convert the 'Book-Title' data series into list form
book_title = preparation['Book-Title'].tolist()

# convert the 'Book-Author' data series into list form
book_author = preparation['Book-Author'].tolist()

# convert the 'Year-Of-Publication' data series into list form
year_of_publication = preparation['Year-Of-Publication'].tolist()

# convert the 'Publisher' data series into list form
publisher = preparation['Publisher'].tolist()

print(len(isbn_id))
print(len(book_title))
print(len(book_author))
print(len(year_of_publication))
print(len(publisher))

         User-ID         ISBN  Book-Rating  \
0         276725   034545104X            0   
1         276726   0155061224            5   
2         276727   0446520802            0   
3         276729   052165615X            3   
4         276729   0521795028            6   
...          ...          ...          ...   
1149775   276704   1563526298            9   
1149776   276706   0679447156            0   
1149777   276709   0515107662           10   
1149778   276721   0590442449           10   
1149779   276723  05162443314            8   

                                                Book-Title        Book-Author  \
0                                     Flesh Tones: A Novel         M. J. Rose   
1                                         Rites of Passage         Judith Rae   
2                                             The Notebook    Nicholas Sparks   
3                                           Help!: Level 1      Philip Prowse   
4        The Amsterdam Connection : Level 

In [3]:
#create a dictionary to determine key-value pairs for the
#isbn_id, book_title, book_author, year_of_publication, and publisher data
books_new = pd.DataFrame({
    'isbn': isbn_id,
    'book_title': book_title,
    'book_author': book_author,
    'year_of_publication': year_of_publication,
    'publisher': publisher
})
books_new = books_new[:20000]

In [4]:
# Collaborative Filtering
# convert User-ID to a list without matching values
user_ids = ratings['User-ID'].unique().tolist()
print('list userIDs: ', user_ids)

# perform User-ID encoding
user_to_user_encoded = {x: i for i, x in enumerate(user_ids)}
print('encoded userID: ', user_to_user_encoded)

# carry out the process of encoding numbers into User-ID
user_encoded_to_user = {i: x for i, x in enumerate(user_ids)}
print('encoded number to userID: ', user_encoded_to_user)

# convert ISBNs to a list without matching values
isbn_id = ratings['ISBN'].unique().tolist()

# perform ISBN encoding
isbn_to_isbn_encoded = {x: i for i, x in enumerate(isbn_id)}

# carry out the process of encoding numbers to ISBN
isbn_encoded_to_isbn = {i: x for i, x in enumerate(isbn_id)}

# Disable the SettingWithCopyWarning warning
pd.options.mode.chained_assignment = None # "warn" or "raise" to turn it back on

# Mapping User-ID to user dataframe
ratings['user'] = ratings['User-ID'].map(user_to_user_encoded)

# Mapping ISBN to book title dataframe
ratings['book_title'] = ratings['ISBN'].map(isbn_to_isbn_encoded)

# get the number of users
num_users = len(user_to_user_encoded)
print(num_users)

# get the number of book titles
num_book_title = len(isbn_to_isbn_encoded)
print(num_book_title)

# convert the rating to a float value
ratings['Book-Rating'] = ratings['Book-Rating'].values.astype(np.float32)

# minimum rating value
min_rating = min(ratings['Book-Rating'])

# maximum rating value
max_rating = max(ratings['Book-Rating'])
print('Number of Users: {}, Number of Books: {}, Min Rating: {}, Max Rating: {}'.format(
    num_users, num_book_title, min_rating, max_rating
))

data = books_new
data.sample(5)


list userIDs:  [276725, 276726, 276727, 276729, 276733, 276736, 276737, 276744, 276745, 276746, 276747, 276748, 276751, 276754, 276755, 276760, 276762, 276765, 276768, 276772, 276774, 276780, 276786, 276788, 276796, 276798, 276800, 276803, 276804, 276806, 276808, 276811, 276812, 276813, 276814, 276817, 276820, 276822, 276827, 276828, 276830, 276832, 276833, 276835, 276837, 276838, 276840, 276842, 276847, 276848, 276850, 276852, 276853, 276854, 276856, 276857, 276859, 276861, 276862, 276863, 276866, 276869, 276870, 276872, 276873, 276875, 276878, 276879, 276884, 276887, 276888, 276889, 276890, 276896, 276904, 276905, 276911, 276912, 276915, 276916, 276925, 276927, 276928, 276929, 276934, 276936, 276939, 276943, 276946, 276949, 276950, 276953, 276954, 276957, 276959, 276963, 276964, 276965, 276975, 276981, 276984, 276986, 276988, 276989, 276990, 276992, 276994, 276997, 276998, 277002, 277007, 277009, 277010, 277012, 277018, 277019, 277022, 277023, 277028, 277031, 277032, 277035, 277036, 

105283
340556
Number of Users: 105283, Number of Books: 340556, Min Rating: 0.0, Max Rating: 10.0


Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
9528,61059455,Piercing the Darkness : Undercover with Vampir...,Katherine Ramsland,1999,HarperTorch
18003,140266712,Listening to Prozac,Peter D. Kramer,1997,Penguin Books
3665,60148691,Pictorial Guide to the Planets,Joseph Hollister Jackson,1981,Harpercollins Childrens Books
2748,30343984,Economics : Private and Public Choice with Xtr...,James D. Gwartney,2002,South-Western College Pub
7641,60937114,First Mothers: The Women Who Shaped the Presid...,Bonnie Angelo,2001,Perennial


In [5]:
#TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tf = TfidfVectorizer()

# Perform IDF calculations on book_author data
tf.fit(data['book_author'])

# Mapping array from integer index features to name features
tf.get_feature_names_out()

# Performs a fit and then transforms it into matrix form
tfidf_matrix = tf.fit_transform(data['book_author'])

tfidf_matrix.todense()

pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tf.get_feature_names_out(),
    index=data.book_title
).sample(15, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,lydon,rosie,dunwich,edgerton,akl,mastoon,rickie,branon,mcmullan,beaver,tenuta,warrick,gendler,peterson,rosita
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
All About Love: New Visions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Time to Sing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Girl Who Trod On a Loaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Little Little,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Pageant of World History,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Moon Under Her Feet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Giants in the Earth : A Saga of the Prairie (Perennial Classics),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Portrait in Sepia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KGB: The Inside Story of Its Foreign Operations from Lenin to Gorbachev,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Mystery of Edwin Drood (Penguin Classics),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#Cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Calculating cosine similarity on the tf-idf matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Create a dataframe from the cosine_sim variable with rows and columns in the form of book titles
cosine_sim_df = pd.DataFrame(cosine_sim, index=data['book_title'], columns=data['book_title'])
print('Shape:', cosine_sim_df.shape)

# View the similarity matrix for each book title
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

#Get recommendation
def book_recommendation(book_title, similarity_data=cosine_sim_df, items=data[['book_title', 'book_author']], k=5):
    # Retrieve data by using argpartition to partition indirectly along a given axis
    # Dataframe converted to numpy
    # Range(start, stop, step)
    index = similarity_data.loc[:,book_title].to_numpy().argpartition(range(-1, -k, -1))
    
    # Retrieve data with the greatest similarity from the existing index
    closest = similarity_data.columns[index[-1:-(k+2):-1]]
    
    # Drop book_title so that the name of the book you are looking for does not appear in the recommendation list
    closest = closest.drop(book_title, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

book_title_test = "Entering the Silence : Becoming a Monk and a Writer (The Journals of Thomas Merton, V. 2)" # book title example
data[data.book_title.eq(book_title_test)]

# Get recommendations for similar book titles
book_recommendation(book_title_test)

#Splitting data for training and validation
df_rating = ratings.sample(frac=1, random_state=42)
print(df_rating)

#Next, the process of dividing the data into training and validation data is carried out with a 90:10 composition. However, before that
# create a variable x to match user data and book title into one value
x = df_rating[['user', 'book_title']].values

# create a y variable to create a rating of the results
y = df_rating['Book-Rating'].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

# divide into 90% train data and 10% validation data
train_indices = int(0.9 * df_rating.shape[0])
X_train, X_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:]
)



Shape: (20000, 20000)
         User-ID        ISBN  Book-Rating   user  book_title
178554     38781  0373259131          0.0  15560       99291
533905    128835  0811805905          8.0  49582       59185
1091374   261829  037324486X          0.0  99796      121427
1036247   247747  0531303306          0.0  94309      320740
309523     74076  0316812404          0.0  28854       32411
...          ...         ...          ...    ...         ...
110268     25458  0142000191          0.0  10260       69256
259178     60146  0060964049          8.0  23699         527
131932     30509  1857230655          0.0  12254       79598
671155    163307  0446314145          0.0  62388      243103
121958     28150  0312195516          0.0  11319        1365

[1149780 rows x 5 columns]


In [None]:
np.save('x_train.npy', x_train)
np.save('x_val.npy', x_val)
np.save('y_train.npy', y_train)
np.save('y_val.npy', y_val)

In [7]:
import sagemaker
import numpy as np

bucket = 'gcu-ml2-005-bucket'
sagemaker_session = sagemaker.Session(
    default_bucket = bucket
)


np.save('X_train.npy', X_train)
np.save('X_val.npy', X_val)
np.save('y_train.npy', y_train)
np.save('y_val.npy', y_val)


data_path = sagemaker_session.upload_data(path='X_train.npy')
val_data_path = sagemaker_session.upload_data(path='X_val.npy')
train_labels_path = sagemaker_session.upload_data(path='y_train.npy')
val_labels_path = sagemaker_session.upload_data(path='y_val.npy')


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [22]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

# Define your S3 bucket and paths to data
bucket_name = 'gcu-ml2-005-bucket'
training_data_uri = 's3://{}'.format(bucket_name)

# Create a SageMaker session
sagemaker_session = sagemaker.Session(
    default_bucket = bucket_name
)
role = get_execution_role()
distributed_training_spec = {'parameter_server': {'enabled': True}}


estimator = TensorFlow(
    entry_point='RecommenderNet.py',
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    framework_version='2.1.0',
    py_version='py3',
    distribution=distributed_training_spec,
    hyperparameters={
        'num_users': num_users,
        'num_book_title': num_book_title,
        'embedding_size': 50,
        'dropout_rate': 0.2,
        'epochs': 10,
    },
    
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [23]:
%run RecommenderNet.py
import os
import json

training_data_uri = os.path.join(f's3://{bucket_name}', 'data') 
estimator.fit(training_data_uri)

TypeError: the JSON object must be str, bytes or bytearray, not NoneType

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Amazon SageMaker Debugger does not currently support Parameter Server distribution
INFO:sagemaker:Amazon SageMaker Debugger does not currently support Parameter Server distribution
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2023-12-04-07-12-53-003


Using provided s3_resource
2023-12-04 07:12:53 Starting - Starting the training job...
2023-12-04 07:13:10 Starting - Preparing the instances for training......
2023-12-04 07:14:11 Downloading - Downloading input data......
2023-12-04 07:15:23 Training - Training image download completed. Training in progress.
2023-12-04 07:15:23 Uploading - Uploading generated training model[34m2023-12-04 07:15:15,448 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2023-12-04 07:15:15,461 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-12-04 07:15:15,697 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-12-04 07:15:15,713 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-12-04 07:15:15,729 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-12-04 07:15:15,739 sagemaker-containers INFO     I

UnexpectedStatusException: Error for Training job tensorflow-training-2023-12-04-07-12-53-003: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/bin/python3 RecommenderNet.py --dropout_rate 0.2 --embedding_size 50 --epochs 10 --model_dir s3://sagemaker-ap-northeast-2-629515838455/tensorflow-training-2023-12-04-07-12-53-003/model --num_book_title 340556 --num_users 105283", exit code: 1

In [None]:
import sagemaker 
from sagemaker import get_execution_role 
from sagemaker.model import Model 

# SageMaker 세션 생성
sagemaker_session = sagemaker.Session() 
# 실행 역할 가져오기 
role = get_execution_role() 
# 모델 데이터의 S3 경로 
model_data = 's3://gcu-ml2-005-bucket'
# 프레임워크별 모델 클래스 사용 (예: TensorFlow, PyTorch 등)
# 여기서는 TensorFlow 모델을 예로 들겠습니다.
from sagemaker.tensorflow.model import TensorFlowModel 
# 모델 인스턴스 생성 
model = TensorFlowModel(model_data=model_data,
                        role=role, 
                        framework_version='2.3.0', # 사용하는 TensorFlow 버전에 맞게 설정                        
                        sagemaker_session=sagemaker_session)


In [None]:
endpoint_name = 'book-endpoint-ncf-005'
model_name = 'book-recommend-keras-model-005'

predictor = estimator.deploy(
    initial_instance_count=1, 
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name,
    model_name=model_name,
)

In [None]:
%run RecommenderNet.py

model = RecommenderNet(num_users, num_book_title, 50)

model.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = keras.optimizers.Adam(learning_rate=1e-4),
    metrics = [tf.keras.metrics.RootMeanSquaredError()]
)
# start the training process
history = model.fit(
    x = x_train,
    y = y_train,
    batch_size = 50,
    epochs =10,
    validation_data = (x_val, y_val)
)