In [1]:
import pandas as pd
import re
from collections import Counter

In [2]:
df_filter = pd.read_csv('df_filter.csv')

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\13808\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\13808\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\13808\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\13808\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

In [6]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    additional_stopwords = {'translate', 'translation', 'already'}
    stop_words = stop_words.union(additional_stopwords)
    punctuation = set(string.punctuation)
    months = ["January", "February", "March", "April", "May", "June", 
              "July", "August", "September", "October", "November", "December",
              "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec"]
    months_regex = r'\b(?:' + '|'.join(months) + r')\.?\b'

    text = re.sub(months_regex, '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = text.replace('./', '')
    text = text.replace('./', '')
    
    words = word_tokenize(text)
    filtered_text = ' '.join([word for word in words if word.lower() not in stop_words and word not in punctuation])
    return filtered_text

In [8]:
df_filter['title'] = df_filter['title'].apply(lambda x: remove_stopwords(x) if isinstance(x, str) else x)
df_filter['alternative_title'] = df_filter['alternative_title'].apply(lambda x: remove_stopwords(x) if isinstance(x, str) else x)
df_filter['abstract'] = df_filter['abstract'].apply(lambda x: remove_stopwords(x) if isinstance(x, str) else x)

In [9]:
df_filter = df_filter.fillna("")

In [9]:
# Word2vec pretrain model
# https://code.google.com/archive/p/word2vec/

In [10]:
import gensim

wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [11]:
from tqdm import tqdm

def recommendation(input_uuid):
    input_row = df_filter[df_filter['identifier_uuid'] == input_uuid].to_numpy()

    if len(input_row) == 0:
        print(f"No book found with UUID: {input_uuid}")
        return []

    matrix_input_title_vocab = []
    for list_ in input_row:
        title_words = [word for word in list_[1].split() if word in wv.vocab] if isinstance(list_[1], str) else []
        alt_title_words = [word for word in list_[2].split() if word in wv.vocab] if isinstance(list_[2], str) else []
        abstract_words = [word for word in list_[3].split() if word in wv.vocab] if isinstance(list_[3], str) else []
        matrix_input_title_vocab.append([list_[0], title_words, alt_title_words, abstract_words])

    matrix_similarity = []

    for list1 in df_filter.to_numpy():
        if list1[0] == input_uuid:  
            continue

        list1_title_words = [word for word in list1[1].split() if word in wv.vocab] if isinstance(list1[1], str) else []
        list1_alt_title_words = [word for word in list1[2].split() if word in wv.vocab] if isinstance(list1[2], str) else []
        list1_abstract_words = [word for word in list1[3].split() if word in wv.vocab] if isinstance(list1[3], str) else []

        for list2 in matrix_input_title_vocab:
            score_title = score_alt_title = score_abstract = 0

            if list1_title_words and list2[1]:
                score_title = wv.n_similarity(list1_title_words, list2[1])
            if list1_alt_title_words and list2[2]:
                score_alt_title = wv.n_similarity(list1_alt_title_words, list2[2])
            if list1_abstract_words and list2[3]:
                score_abstract = wv.n_similarity(list1_abstract_words, list2[3])

            score_max_title = max(score_title, score_alt_title)
            final_score = score_max_title + score_abstract
            matrix_similarity.append([list1[0], final_score])

    sorted_similarities = sorted(matrix_similarity, key=lambda x: x[1], reverse=True)[:10]

    recommended_uuids = [uuid for uuid, score in sorted_similarities]
    return recommended_uuids

In [12]:
recommendation('000032e8-b5d5-43a1-8f0b-e410327580b6')

['020c14b6-c11f-4ead-b8b6-6b3a4e4073b3',
 '2fefcf62-1443-4b88-8ae8-4776f6495776',
 '3fbedac9-6a0c-4af3-9024-e603515023f3',
 '5062c101-452c-489e-8458-778c29d733b4',
 '64bd8e06-de2b-4f77-8ebe-4377226a7535',
 '693e2428-08e1-4970-89cb-1f1071a68330',
 '8a972717-f82c-45e3-8eb7-ddb3d06bfa38',
 '8afa792e-9033-40ae-917a-c59454e7766c',
 '8bdc72b1-a2da-404e-920a-6d8b485e0dd6',
 '9b0af09d-c80a-4322-9f02-7d623da1ef0a']

In [14]:
import numpy as np

def calculate_similarity_matrices(df):
    df = df.reset_index(drop=True)
    num_books = len(df)

    title_matrix = np.zeros((num_books, num_books))
    alt_title_matrix = np.zeros((num_books, num_books))
    abstract_matrix = np.zeros((num_books, num_books))

    titles = [np.array([wv[word] for word in title.split() if word in wv.vocab]) for title in df['title'].fillna('')]
    alt_titles = [np.array([wv[word] for word in alt_title.split() if word in wv.vocab]) for alt_title in df['alternative_title'].fillna('')]
    abstracts = [np.array([wv[word] for word in abstract.split() if word in wv.vocab]) for abstract in df['abstract'].fillna('')]

    def average_vector(vectors):
        if len(vectors) == 0:
            return np.zeros(300)
        return np.mean(vectors, axis=0)

    for i in tqdm(range(num_books)):
        for j in range(i, num_books):
            avg_vec_i_title = average_vector(titles[i])
            avg_vec_j_title = average_vector(titles[j])
            avg_vec_i_alt_title = average_vector(alt_titles[i])
            avg_vec_j_alt_title = average_vector(alt_titles[j])
            avg_vec_i_abstract = average_vector(abstracts[i])
            avg_vec_j_abstract = average_vector(abstracts[j])

            title_similarity = np.dot(avg_vec_i_title, avg_vec_j_title) / (np.linalg.norm(avg_vec_i_title) * np.linalg.norm(avg_vec_j_title) + 1e-10)
            title_matrix[i, j] = title_matrix[j, i] = title_similarity

            alt_title_similarity = np.dot(avg_vec_i_alt_title, avg_vec_j_alt_title) / (np.linalg.norm(avg_vec_i_alt_title) * np.linalg.norm(avg_vec_j_alt_title) + 1e-10)
            alt_title_matrix[i, j] = alt_title_matrix[j, i] = alt_title_similarity

            abstract_similarity = np.dot(avg_vec_i_abstract, avg_vec_j_abstract) / (np.linalg.norm(avg_vec_i_abstract) * np.linalg.norm(avg_vec_j_abstract) + 1e-10)
            abstract_matrix[i, j] = abstract_matrix[j, i] = abstract_similarity

    return title_matrix, alt_title_matrix, abstract_matrix

In [15]:
title_matrix, alt_title_matrix, abstract_matrix = calculate_similarity_matrices(df_filter)

100%|█████████████████████████████████████████████████████████████████████████| 42261/42261 [20:47:28<00:00,  1.77s/it]


In [20]:
def get_top_recommendations(title_matrix, alt_title_matrix, abstract_matrix, top_n=10):
    num_books = title_matrix.shape[0]
    top_recommendations = {}

    for i in range(num_books):
        scores = np.maximum(title_matrix[i], alt_title_matrix[i]) + abstract_matrix[i]
        scores[i] = -np.inf
        top_indices = np.argpartition(scores, -top_n)[-top_n:]
        top_indices = top_indices[np.argsort(-scores[top_indices])]
        top_recommendations[i] = [(index, scores[index]) for index in top_indices]

    return top_recommendations

recommendations = get_top_recommendations(title_matrix, alt_title_matrix, abstract_matrix)

In [36]:
recommendations[0]

[(34274, 1.9556227562495014),
 (39748, 1.955416939265599),
 (39856, 1.95190068252902),
 (13258, 1.951762379952316),
 (41649, 1.9515244518848027),
 (41827, 1.9514178615534066),
 (8560, 1.9513294376520687),
 (38548, 1.9493518347922212),
 (8910, 1.9488663768467793),
 (6417, 1.9472349684426402)]

In [37]:
df_filter.to_numpy()[0]

array(['00000c04-4b98-4c0f-935f-3e9227cd71bb', 'Original report',
       'Primary production bulletin',
       "`` Origin Bulletin '' monthly publication Singapore Agri-Food Veterinary Authority introducing technical knowledge agriculture animal husbandry guiding application agricultural science management knowledge 66th issue content including quail breeding agricultural Q flower cultivation Newcastle disease poultry"],
      dtype=object)

In [38]:
df_filter.to_numpy()[6417]

array(['299d8cb4-f571-41f5-9cd2-ac000c694b51', 'Original report',
       'Primary production bulletin Village produce news Primary produce report',
       "`` 'Origin Report monthly publication Singapore Agri-Food Veterinary Authority introducing technical knowledge agriculture animal husbandry guiding application agricultural science management knowledge 41st issue content including tomato cultivation new agricultural knowledge agricultural Q prevention piglet diseases introduction dairy cows ''"],
      dtype=object)

In [None]:
'''
import json

np.save('title_matrix.npy', title_matrix)
np.save('alt_title_matrix.npy', alt_title_matrix)
np.save('abstract_matrix.npy', abstract_matrix)

with open('recommendations.json', 'w') as f:
    recommendations_to_save = {str(k): [(int(index), float(score)) for index, score in v] for k, v in recommendations.items()}
    json.dump(recommendations_to_save, f, indent=4)
'''