In [1]:
import warnings
import pandas as pd
import numpy as np
import pathlib
import os
from faker import Faker

In [3]:
fake = Faker()
n_patrons = 1000
print(f"Building {n_patrons} patron accounts")
values = tuple(
    zip([fake.name() for _ in range(n_patrons)],
        [fake.address() for _ in range(n_patrons)],
        [fake.unique.email() for _ in range(n_patrons)]))

## CHAT GPT SUGGESTED ##

# Function to simulate book selection
# Function to simulate book selection
def simulate_book_selection(persons, library):
    selections = []
    for person in persons:
        if np.random.rand() < person['probability_of_reading']:
            # Calculate weights based on hidden preferences
            weights = np.array([person['hidden_preferences'].get(genre, 0) for genre in library['Genre']])

            # Normalize weights to make them probabilities
            weights /= weights.sum()

            selected_book = np.random.choice(library['BookID'], p=weights)
            selections.append((person['person_id'], selected_book))
    return selections

# Generate a list of book genres
book_genres = ['Fiction', 'Mystery', 'Sci-Fi', 'Romance', 'Non-Fiction', 'Fantasy']

# Create a library of books with genres
num_books = 1000
library = pd.DataFrame({
    'BookID': np.arange(1, num_books + 1),
    'Genre': np.random.choice(book_genres, size=num_books)
})

# Create a list of persons with assigned genres and hidden preferences
num_persons = 100
persons = []
for person_id in range(1, num_persons + 1):
    assigned_genres = np.random.choice(book_genres, size=np.random.randint(1, len(book_genres) + 1), replace=False)

    # Generate hidden preferences for each person
    hidden_preferences = {genre: np.random.rand() for genre in book_genres}

    probability_of_reading = 0.01  # Very low probability of reading for each person
    persons.append({
        'person_id': person_id,
        'assigned_genres': assigned_genres,
        'hidden_preferences': hidden_preferences,
        'probability_of_reading': probability_of_reading
    })

# Simulate book selections for 100,000 iterations
num_iterations = 100000
selected_books = simulate_book_selection(persons, library)

# Display a sample of the selected books
print("Sample of selected books:")
print(selected_books[:10])


Building 1000 patron accounts
Sample of selected books:
[(96, 25)]


In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize the text and remove stop words
    tokens = [word.lower() for word in word_tokenize(text) if word.isalnum() and word.lower() not in stopwords.words('english')]
    return set(tokens)

def weighted_jaccard_similarity(book1, book2):
    # Define weights for each text field
    weights = {'title': 3, 'authors': 2, 'publisher': 1.5, 'description': 1, 'genres': 1}

    # Preprocess each text field
    preprocessed_fields1 = {field: preprocess_text(book1[field]) for field in weights.keys()}
    preprocessed_fields2 = {field: preprocess_text(book2[field]) for field in weights.keys()}

    # Calculate weighted Jaccard similarity for each text field
    similarities = [(field, weights[field] * len(preprocessed_fields1[field].intersection(preprocessed_fields2[field])) /
                     len(preprocessed_fields1[field].union(preprocessed_fields2[field])))
                    for field in weights.keys()]

    # Combine individual similarities using the assigned weights
    overall_similarity = sum(similarity for _, similarity in similarities)

    return overall_similarity

# Example usage:
book1 = {
    'title': 'The Catcher in the Rye',
    'authors': 'J.D. Salinger',
    'publisher': 'Little, Brown and Company',
    'publish_year': 1951,
    'description': 'A classic novel about a teenage boy',
    'genres': 'Fiction, Coming-of-age'
}

book2 = {
    'title': 'To Kill a Mockingbird',
    'authors': 'Harper Lee',
    'publisher': 'J.B. Lippincott & Co.',
    'publish_year': 1960,
    'description': 'A novel set in the American South during the 1930s',
    'genres': 'Fiction, Southern Gothic'
}

similarity_score = weighted_jaccard_similarity(book1, book2)
print(f"Similarity Score: {similarity_score}")



Similarity Score: 0.4583333333333333


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Morri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Morri\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsRegressor

tfidf = TfidfVectorizer(analyzer='word',
                      ngram_range=(1, 10),
                      stop_words = 'english')


Generating bookshelves...
Generated 363 bookshelves containing 40000 books


In [10]:
vecs = tfidf.fit_transform(books["Title"].drop_duplicates())


In [11]:
vecs

<26737x341661 sparse matrix of type '<class 'numpy.float64'>'
	with 458900 stored elements in Compressed Sparse Row format>

In [12]:
similarity_matrix = cosine_similarity(vecs, vecs)

MemoryError: Unable to allocate 5.33 GiB for an array with shape (26737, 26737) and data type float64

In [5]:
import populate_books
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_documents

books = populate_books.read_books_data()

titles =[TaggedDocument(doc, [i]) for i, doc in enumerate(preprocess_documents(books["Title"].drop_duplicates()))]


Generating bookshelves...
Generated 361 bookshelves containing 40000 books


In [6]:
titles[:5]

[TaggedDocument(words=['nation', 'danc', 'religion', 'ident', 'cultur', 'differ', 'caribbean'], tags=[0]),
 TaggedDocument(words=['jaquith', 'famili', 'america'], tags=[1]),
 TaggedDocument(words=['megaton', 'gambl'], tags=[2]),
 TaggedDocument(words=['sleep', 'boi'], tags=[3]),
 TaggedDocument(words=['dinner', 'antoin'], tags=[4])]

In [109]:

model = Doc2Vec(titles,
                vector_size=1000,
                window=5,
                min_count=1,
                workers=50,
                dm=0,
                epochs=1000) # PV-DBOW

In [123]:
model.save("d2v.model")

In [125]:
model = Doc2Vec.load("d2v.model")




AttributeError: 'numpy.ndarray' object has no attribute 'most_similar'

In [None]:
description =[TaggedDocument(doc, [i]) for i, doc in enumerate(preprocess_documents(books.drop_duplicates("Title")["description"]))]
model_description = Doc2Vec(description,
                vector_size=1000,
                window=5,
                min_count=1,
                workers=50,
                dm=0,
                epochs=100)

In [104]:
book_vecs = []
for book in books["Title"].drop_duplicates():
    book_vecs.append(model.infer_vector(preprocess_documents([book])[0]))

In [128]:
from sklearn.neighbors import NearestNeighbors
book_vecs = np.array([model.dv[i[0]] for _, i in titles])

book_vecs = np.array(book_vecs)
neigh = NearestNeighbors(n_neighbors=5, metric='cosine')
neigh.fit(book_vecs)

In [139]:
import joblib


joblib.dump(neigh, "../recommendation/neighbors.pkl")

['../recommendation/neighbors.pkl']

In [88]:
# print(book_vecs[:3])

In [129]:
sample = model.infer_vector(preprocess_documents(["Bible Jesus king"])[0])
dist, idxs = neigh.kneighbors([sample],5)


In [134]:
print (idxs)

[[12375  8367 13853 13270  7001]]


In [138]:
no_dupe = books.drop_duplicates("Title")
for i in idxs[0]:
    print(no_dupe.iloc[i]["Title"])
    for title, tag in titles:
        if i in tag:
            print(title)
            print(tag)
    # print(titles)


King Jesus,
['king', 'jesu']
[12375]
The Name of the King
['king']
[8367]
The Bible Jesus Read
['bibl', 'jesu', 'read']
[13853]
In the Name of Jesus
['jesu']
[13270]
The King's Coat
['king', 'coat']
[7001]


In [71]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

book_vec = []
b1 = books.iloc[36591]["Title"]
b2 = "Researchers Delight"
b3 = "Rugby and its implications"
book_vec.append(model.infer_vector(preprocess_documents([b1])[0]))
book_vec.append(model.infer_vector(preprocess_documents([b2])[0]))
book_vec.append(model.infer_vector(preprocess_documents([b3])[0]))
book_vec = np.array(book_vec)

In [72]:
def lower_triangle_average(matrix):
    lower_triangle = np.tril(matrix+3, -1)
    lower_triangle_values = lower_triangle.flatten()
    lower_triangle_values = lower_triangle[lower_triangle != 0]
    average = np.mean(lower_triangle_values-3)
    return average


In [73]:
book_vec_2 = []
for book in books[books["categories"] == "['Health & Fitness']"]["Title"]:
    book_vec_2.append(model.infer_vector(preprocess_documents([book])[0]))
book_vec_2 = np.array(book_vec_2)
sim_mat_2 = cosine_similarity(book_vec_2, book_vec_2)
print(lower_triangle_average(sim_mat_2),
lower_triangle_average(np.abs(sim_mat_2)),
np.sqrt(lower_triangle_average(np.square(sim_mat_2))))

0.21931298 0.23320505 0.28350484


In [74]:
book_vec_3 = []
for book in books[books["categories"] == '["Misc"]']["Title"]:
    book_vec_3.append(model.infer_vector(preprocess_documents([book])[0]))
book_vec_3 = np.array(book_vec_3)
sim_mat_3 = cosine_similarity(book_vec_3, book_vec_3)
print(lower_triangle_average(sim_mat_3),
lower_triangle_average(np.abs(sim_mat_3)),
np.sqrt(lower_triangle_average(np.square(sim_mat_3))))

0.1497528 0.1691976 0.21097985


In [75]:
lower_triangle_average(cosine_similarity(book_vec_2, book_vec_3))

0.1685921

In [37]:
cosine_similarity(book_vec, book_vec)


array([[1.0000001 , 0.5533638 , 0.68679124],
       [0.5533638 , 0.99999994, 0.1655393 ],
       [0.68679124, 0.1655393 , 1.        ]], dtype=float32)

In [15]:
books[books["categories"] != '["Misc"]']["categories"].head()

8361    ['Health & Fitness']
8362    ['Health & Fitness']
8363    ['Health & Fitness']
8364    ['Health & Fitness']
8365    ['Health & Fitness']
Name: categories, dtype: object

In [None]:
descriptions =[TaggedDocument(doc, [i]) for i, doc in enumerate(preprocess_documents(books["description"].drop_duplicates()))]


In [17]:
books[books["categories"]=="['Education']"]["Title"]

36089    Situations: A Casebook of Virtual Realities fo...
36090    Playing Along: 37 Group Learning Activities Bo...
36091          Learning To Bow - Inside The Heart Of Japan
36092    Boys Themselves: A Return to Single-Sex Education
36093    The Missing 'Gator of Gumbo Limbo: An Ecologic...
                               ...                        
36591                               The Nature of Research
36592    Grant Writing for Teachers: If You Can Write a...
36593         Kids in Print: Publishing a School Newspaper
36594    Cities of God And Nationalism Rome, Mecca, And...
36595                                         Odd girl out
Name: Title, Length: 507, dtype: object

In [None]:
# Persist a model to disk with:

