In [1]:
import pandas as pd

data = pd.read_csv("books_data.csv", nrows=1000)

In [None]:
data.columns

In [None]:
data.info()

In [4]:
# Convert 'average_rating' to a numeric data type
data['ratingsCount'] = pd.to_numeric(data['ratingsCount'], 
                                       errors='coerce')

In [5]:
# Create a new column 'book_content' by combining 'title' and 'authors'
data['book_content'] = (data['Title'] + ' ') * 2 + data['description'] + ' ' + data['authors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

### Text Preprocessing

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['book_content'].values.astype('U'))

# Convert the texts to sequences and pad them
sequences = tokenizer.texts_to_sequences(data['book_content'].values.astype('U'))
padded_sequences = pad_sequences(sequences, maxlen=500)  # Adjust maxlen as necessary

In [None]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


### CNN Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=5000, output_dim=128, input_length=500))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))  # Extra convolutional layer
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(64, activation='relu'))  # Extra Dense layer for more complex embeddings
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.layers import Input, Concatenate
from tensorflow.keras.models import Model
import tensorflow as tf

data['categories'] = data['categories'].fillna('Unknown')  # Ensure 'categories' column exists and has no NaNs
labels = []
pairs = []

# Create pairs of books and assign labels
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        if data['categories'].iloc[i] == data['categories'].iloc[j]:
            labels.append(1)  # Similar books (same categories)
        else:
            labels.append(0)  # Dissimilar books (different categoriess)
        pairs.append((padded_sequences[i], padded_sequences[j]))

# Convert pairs and labels to numpy arrays
import numpy as np
labels = np.array(labels)
pairs = np.array(pairs)

# Separate input into two arrays: one for each book in the pair
X_train_1 = np.array([p[0] for p in pairs])
X_train_2 = np.array([p[1] for p in pairs])

# Define the CNN branch to be shared between both inputs
def create_cnn_model():
    input_layer = Input(shape=(500,))
    embedding_layer = Embedding(input_dim=5000, output_dim=128, input_length=500)(input_layer)
    conv_layer1 = Conv1D(filters=128, kernel_size=5, activation='relu')(embedding_layer)
    conv_layer2 = Conv1D(filters=128, kernel_size=5, activation='relu')(conv_layer1)
    pooling_layer = GlobalMaxPooling1D()(conv_layer2)
    dense_layer = Dense(128, activation='relu')(pooling_layer)
    return Model(inputs=input_layer, outputs=dense_layer)

# Create two inputs for the pairs
input_1 = Input(shape=(500,))
input_2 = Input(shape=(500,))

# Create the CNN branch and share it between both inputs
cnn_branch = create_cnn_model()

# Get the embeddings for both inputs
output_1 = cnn_branch(input_1)
output_2 = cnn_branch(input_2)

# Concatenate the outputs and add the Dense layers
concatenated = Concatenate()([output_1, output_2])
dense_layer1 = Dense(128, activation='relu')(concatenated)
dense_layer2 = Dense(64, activation='relu')(dense_layer1)
output_layer = Dense(1, activation='sigmoid')(dense_layer2)

# Build the Siamese model
siamese_model = Model(inputs=[input_1, input_2], outputs=output_layer)
siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the pairs of sequences
with tf.device('/GPU:0'):
    siamese_model.fit([X_train_1, X_train_2], labels, epochs=5, batch_size=32)

### Train the model

In [None]:
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

book_embeddings = cnn_branch.predict(padded_sequences)
normalized_book_embeddings = normalize(book_embeddings)
cosine_sim_matrix = cosine_similarity(normalized_book_embeddings)

In [11]:
def recommend_books(book_title, threshold, cosine_sim_matrix):
    # Get the index of the book that matches the title
    idx = data[data['Title'] == book_title].index[0]

    # Get the cosine similarity scores for all books with this book
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = [(i, "{:.5f}".format(score)) for i, score in sim_scores[0:] if score >= threshold]

    # Get the book titles and their similarity scores
    book_recommendations = [(data['Title'].iloc[i[0]], i[1]) for i in sim_scores]

    return book_recommendations

### Recommend book using CNN

In [12]:
from IPython.display import clear_output
book_title = ''
while (book_title != 'q'):
  clear_output(wait=True)
  book_title = input("Enter the title of a book: ")
  # recommended_books = recommend_books(book_title, threshold=0.1)
  recommended_books = recommend_books(book_title, 0.2, cosine_sim_matrix)
  f = open('output.txt', 'w')
  f.write('Counts: ' + str(len(recommended_books)) + '\n\n')
  for book in recommended_books:
    f.write(book[1] + ' | ' + str(book[0]) + '\n')

  print('Found: ' + str(len(recommended_books)))

IndexError: index 0 is out of bounds for axis 0 with size 0