In [1]:
import pandas as pd

# Read only the first 50,000 rows of the books metadata file.
data = pd.read_csv(filepath_or_buffer="books_data.csv", nrows=50000)

In [2]:
# Display the column labels of the loaded frame.
data.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount'],
      dtype='object')

In [3]:
# Print per-column dtype and non-null counts plus overall memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Title          49999 non-null  object 
 1   description    36968 non-null  object 
 2   authors        46548 non-null  object 
 3   image          41330 non-null  object 
 4   previewLink    48665 non-null  object 
 5   publisher      35100 non-null  object 
 6   publishedDate  48247 non-null  object 
 7   infoLink       48665 non-null  object 
 8   categories     44053 non-null  object 
 9   ratingsCount   12669 non-null  float64
dtypes: float64(1), object(9)
memory usage: 3.8+ MB


In [4]:
# Ensure 'ratingsCount' is numeric; errors='coerce' replaces any value that
# cannot be parsed with NaN instead of raising.
# (data.info() reports this column as float64 already.)
data['ratingsCount'] = pd.to_numeric(data['ratingsCount'], 
                                       errors='coerce')

In [5]:
import ast


def _authors_to_text(value):
    """Turn one 'authors' cell into a plain space-separated string.

    Handles three cases:
      * an actual list/tuple of names -> names joined by spaces;
      * a string that looks like a Python list literal (e.g. "['A', 'B']")
        -> parsed with ast.literal_eval and joined by spaces;
      * any other string -> returned as-is.
    Anything else (NaN / None) yields an empty string.
    """
    if isinstance(value, (list, tuple)):
        return ' '.join(str(name) for name in value)
    if not isinstance(value, str):
        return ''
    text = value.strip()
    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a valid literal after all; keep the raw text.
            return text
        if isinstance(parsed, (list, tuple)):
            return ' '.join(str(name) for name in parsed)
    return text


# Create 'book_content' from the title (repeated twice so it carries more
# weight), the description and the authors.
# Missing values are replaced with '' first: string concatenation with NaN
# yields NaN, which would otherwise blank out the whole row's content.
title_text = data['Title'].fillna('').astype(str)
description_text = data['description'].fillna('').astype(str)
# Columns loaded by read_csv hold strings, never list objects, so the authors
# cell is converted explicitly instead of being gated on isinstance(x, list).
authors_text = data['authors'].apply(_authors_to_text)

data['book_content'] = (title_text + ' ').str.repeat(2) + description_text + ' ' + authors_text

### Text Preprocessing

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary size and padded sequence length. The Embedding layer in the model
# cell is configured with the same values (5000 and 500).
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 500

# Convert the column to a unicode array once and reuse it for both fitting
# and encoding. Missing values become the literal text 'nan'.
book_texts = data['book_content'].values.astype('U')

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(book_texts)

# Convert the texts to integer sequences and pad them to a fixed length.
sequences = tokenizer.texts_to_sequences(book_texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

### CNN Model

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Text encoder: token embedding -> two 1-D convolutions -> global max pooling
# -> two dense layers. The final layer emits a 64-dimensional ReLU vector,
# which a later cell reads via predict() and treats as the book embedding.
cnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=500),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Conv1D(filters=128, kernel_size=5, activation='relu'),  # second convolutional layer
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),  # embedding output
])
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dot

# Enumerating every (i, j) pair of 50,000 books yields ~1.25 billion pairs,
# which neither fits in memory nor finishes in reasonable time. Train on a
# fixed-size random sample of pairs instead.
N_TRAIN_PAIRS = 100_000
PAIR_SEED = 42  # fixed seed so the sampled pairs are reproducible

# Ensure 'categories' has no NaNs so equality comparisons are well defined.
data['categories'] = data['categories'].fillna('Unknown')
categories = data['categories'].to_numpy()

# Sample row positions for both sides of each pair; drop self-pairs.
rng = np.random.default_rng(PAIR_SEED)
first_idx = rng.integers(0, len(data), size=N_TRAIN_PAIRS)
second_idx = rng.integers(0, len(data), size=N_TRAIN_PAIRS)
distinct = first_idx != second_idx
first_idx, second_idx = first_idx[distinct], second_idx[distinct]

# Label 1 = same category (similar), 0 = different category (dissimilar).
# NOTE: random pairs are usually dominated by label 0; check labels.mean()
# and rebalance if the classes are too skewed.
labels = (categories[first_idx] == categories[second_idx]).astype('float32')

# Separate inputs: one array per book in the pair.
X_train_1 = padded_sequences[first_idx]
X_train_2 = padded_sequences[second_idx]

# Siamese wrapper: both inputs go through the SAME cnn_model (shared weights),
# the two embeddings are compared by cosine similarity, and a single sigmoid
# unit maps that similarity to P(same category). Training this wrapper is
# what updates cnn_model's weights, so cnn_model.predict() afterwards yields
# trained embeddings.
sequence_length = padded_sequences.shape[1]
input_first = Input(shape=(sequence_length,), dtype='int32')
input_second = Input(shape=(sequence_length,), dtype='int32')
similarity = Dot(axes=1, normalize=True)([cnn_model(input_first), cnn_model(input_second)])
same_category = Dense(1, activation='sigmoid')(similarity)

siamese_model = Model(inputs=[input_first, input_second], outputs=same_category)
siamese_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
siamese_model.fit([X_train_1, X_train_2], labels, epochs=5, batch_size=64)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x00000274B97BB550>>
Traceback (most recent call last):
  File "C:\Users\Kenneth\AppData\Roaming\Python\Python311\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Kenneth\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1503, in enumerate
    return list(_active.values()) + list(_limbo.values())
           ^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt: 


### Compute embeddings and cosine similarity

In [None]:
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# Encode every book with the CNN and L2-normalise each embedding row.
embeddings = cnn_model.predict(padded_sequences)
normalized_embeddings = normalize(embeddings)

# For unit-length rows the dot product IS the cosine similarity, so the
# matrix is computed directly instead of normalising a second time inside
# cosine_similarity().
# NOTE: the result is a dense (n_books x n_books) matrix; for 50,000 books
# that is 2.5 billion entries, so make sure it fits in memory.
cosine_sim_matrix = normalized_embeddings @ normalized_embeddings.T

In [10]:
def recommend_books(book_title, threshold, cosine_sim_matrix, books=None):
    """Return the books whose similarity to ``book_title`` is at least ``threshold``.

    Args:
        book_title: Exact value to look up in the 'Title' column.
        threshold: Minimum similarity score (inclusive) for a book to be kept.
        cosine_sim_matrix: Square matrix in which row ``i`` holds the
            similarity of the book at row position ``i`` to every book.
        books: Optional DataFrame with a 'Title' column whose row order
            matches the matrix. Defaults to the notebook-level ``data`` frame.

    Returns:
        A list of ``(title, score)`` tuples sorted by descending score, where
        ``score`` is a string formatted with five decimals. The queried row
        itself is not included. If no row has the given title, an empty list
        is returned. If several rows share the title, the first one is used.
    """
    if books is None:
        books = data

    titles = books['Title']

    # Resolve the title to a row POSITION (not an index label) so it can be
    # used to address the similarity matrix even if the frame's index is not
    # a default RangeIndex.
    matches = (titles == book_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return []
    idx = int(matches[0])

    # Keep candidates above the threshold, skipping the queried book itself
    # (its self-similarity would otherwise always rank first).
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim_matrix[idx])
        if pos != idx and score >= threshold
    ]

    # Most similar first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda item: item[1], reverse=True)

    return [(titles.iloc[pos], "{:.5f}".format(score)) for pos, score in sim_scores]

### Recommend books using the CNN embeddings

In [None]:
from IPython.display import clear_output
# Interactive lookup loop: prompt for a title, write the matching
# recommendations to OUTPUT_PATH (overwritten on every query) and print how
# many were found. Enter 'q' to quit.
OUTPUT_PATH = 'output.txt'
SIMILARITY_THRESHOLD = 0.2

book_title = ''
while True:
    clear_output(wait=True)
    book_title = input("Enter the title of a book: ")
    if book_title == 'q':
        # Stop before doing a lookup for the sentinel value.
        break

    try:
        recommended_books = recommend_books(book_title, SIMILARITY_THRESHOLD, cosine_sim_matrix)
    except IndexError:
        # recommend_books may raise IndexError when no row matches the
        # title; treat that as "nothing found" rather than ending the loop.
        recommended_books = []

    # The context manager guarantees the file is flushed and closed; explicit
    # UTF-8 avoids platform-dependent encoding errors on non-ASCII titles.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        f.write('Counts: ' + str(len(recommended_books)) + '\n\n')
        for title, score in recommended_books:
            f.write(score + ' | ' + str(title) + '\n')

    print('Found: ' + str(len(recommended_books)))