In [49]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/large-books-metadata-dataset-50-mill-entries/books.json/books.json
/kaggle/input/large-books-metadata-dataset-50-mill-entries/series.json/series.json
/kaggle/input/large-books-metadata-dataset-50-mill-entries/list.json/list.json
/kaggle/input/large-books-metadata-dataset-50-mill-entries/authors.json/authors.json


In [174]:
import os
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


chunk_size = 250000
chunks = pd.read_json("/kaggle/input/large-books-metadata-dataset-50-mill-entries/books.json/books.json", lines=True, chunksize=chunk_size)

offset = 22
chunk_limit = 1
chunk_ctr = 0

df_list = []
for chunk in chunks:
    if chunk_ctr == offset:
        df_list.append(chunk)
        if len(df_list) >= chunk_limit:
            break
    else:
        chunk_ctr += 1

In [179]:
df_books = pd.concat(df_list, ignore_index=True)
filtered_df_books = df_books[
    [
        'id', 
        'title', 
        'author_id', 
        'author_name', 
        'author_id', 
        'average_rating', 
        'publication_date', 
        'shelves'
    ]
]
filtered_df_books.rename(
    columns={
        'average_rating': 'rating', 
        'shelves': 'genre'
    }, inplace=True
)
print(f'sample: {filtered_df_books.iloc[0]}')
print(f'total rows: {filtered_df_books.shape[0]}')

sample: id                                                            7555155
title               God Speaks in Dreams: Connect with Him and Eac...
author_id                                                      865891
author_name                                            Carol Oschmann
author_id                                                      865891
rating                                                            5.0
publication_date                                           2009-08-11
genre                                                              []
Name: 0, dtype: object
total rows: 250000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_books.rename(


In [141]:
# limit to small space of supported categories only for now, as we can't afford a big dataset due to resource constraints
# genres to support
# fiction, literature, plays, contemporary, romance, classic, politics,
# literary-fiction, historical, general-fiction, 
# theatre, literary, modern-classics
supported_genres = {
    'fiction': 1,
    'literature': 1,
    'plays': 1,
    'contemporary': 1,
    'romance': 1,
    'classic': 1,
    'politics': 1,
    'historical': 1,
    'theatre': 1,
    'literary': 1,
    'film': 1,
    'school-books': 1,
    'modern-classics': 1,
    'general-fiction': 1,
    'literary-fiction': 1
}

In [180]:
book_data = []
for index, row in filtered_df_books.iterrows():
    # limit the category to just 
    ctr = 0
    genres = []
    for genre in row['genre']:
        name = genre['name']
        if supported_genres.get(name, None):
            genres.append(name)
            if len(genres) > 3:
                break    
    if len(genres) > 0:
        book_data.append({
            'id': row['id'],
            'title': row['title'],
            'author_id': row['author_id'],
            'author_name': row['author_name'],
            'genre': genres,
            'rating': row['rating']
        })
    
for_training_books_df = pd.DataFrame(book_data)
print(f'sample: {for_training_books_df.iloc[0]}')
print(f'total rows: {for_training_books_df.shape[0]}')

df_books_file_name = f"dataset/df_books_{offset}.json"
for_training_books_df.to_json(df_books_file_name, orient='records', lines=True)
! tree -DU -h dataset

sample: id                                                       7555156
title          The Big Rich: The Rise and Fall of the Greates...
author_id      author_id    32587
author_id    32587
Name: 1,...
author_name                                       Bryan Burrough
genre                                                 [politics]
rating                                                      4.02
Name: 0, dtype: object
total rows: 54395
[01;34mdataset[00m
|-- [7.9M Aug 29 14:05]  df_books_5.json
|-- [9.1M Aug 29 15:35]  df_books_20.json
|-- [9.3M Aug 29 13:53]  df_books_4.json
|-- [6.3M Aug 29 14:18]  df_books_9.json
|-- [3.7M Aug 29 15:10]  df_books_17.json
|-- [2.8M Aug 29 14:40]  df_books_12.json
|-- [2.3M Aug 29 14:50]  df_books_14.json
|-- [4.2M Aug 29 14:28]  df_books_11.json
|-- [ 15M Aug 29 13:35]  df_books_0.json
|-- [1.9M Aug 29 14:45]  df_books_13.json
|-- [7.1M Aug 29 14:09]  df_books_7.json
|-- [8.9M Aug 29 13:45]  df_books_2.json
|-- [7.7M Aug 29 17:00]  df_books_21.json


In [218]:
import json
import traceback

# merge all datasets to one json file (1M)
merged_data = []
for _ in range(23):
    file_name = f'dataset/df_books_{_}.json'
    print(file_name)
    with open(file_name, 'r') as f:
        for item in f.read().split("\n"):
            row = item.strip()
            if not row: continue
            try:
                data = json.loads(row)
                merged_data.append(data)
            except:
                print(traceback.format_exc())

for_training_books_df_1m = pd.DataFrame(merged_data)
for_training_books_df_1m.to_json("df_books_1M.json", orient='records', lines=True)

dataset/df_books_0.json
dataset/df_books_1.json
dataset/df_books_2.json
dataset/df_books_3.json
dataset/df_books_4.json
dataset/df_books_5.json
dataset/df_books_6.json
dataset/df_books_7.json
dataset/df_books_8.json
dataset/df_books_9.json
dataset/df_books_10.json
dataset/df_books_11.json
dataset/df_books_12.json
dataset/df_books_13.json
dataset/df_books_14.json
dataset/df_books_15.json
dataset/df_books_16.json
dataset/df_books_17.json
dataset/df_books_18.json
dataset/df_books_19.json
dataset/df_books_20.json
dataset/df_books_21.json
dataset/df_books_22.json


In [221]:
# !! NEW !! train from 1M clean datapoints
for_training_books_df = pd.read_json("df_books_1M.json", lines=True)

In [226]:
print(f"total datapoints : {for_training_books_df.shape[0]}")

total datapoints : 1022335


In [227]:
# genre feature
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(for_training_books_df['genre'])

In [228]:
# rating feature
for_training_books_df['scaled_rating'] = StandardScaler().fit_transform(for_training_books_df[['rating']])

In [56]:
# author_name feature
ohe = OneHotEncoder()
author_features = ohe.fit_transform(for_training_books_df[['author_name']]).toarray()

In [229]:
# final training features (genre, author, rating)
# train_features = np.hstack([
#     genre_features, author_features,
#     for_training_books_df[['scaled_rating']].values
# ])

train_features = np.hstack([
    genre_features, for_training_books_df[['scaled_rating']].values
])
print(for_training_books_df.iloc[0])
print(train_features[0])

id                                                               2
title            Harry Potter and the Order of the Phoenix (Har...
author_id                                       [1077326, 1077326]
author_name                                           J.K. Rowling
genre                                                    [fiction]
rating                                                         4.5
scaled_rating                                             2.036048
Name: 0, dtype: object
[0.         0.         1.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         2.03604789]


In [230]:
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(train_features)

In [231]:
offset = "1m"
file_path = f'model_{offset}'
os.makedirs(file_path)

In [236]:
df_books_file_name = f"{file_path}/df_books_{offset}.json"
for_training_books_df.to_json(df_books_file_name, orient='records', lines=True)

with open(f'{file_path}/knn_model_{offset}.pkl', 'wb') as f:
    pickle.dump(knn, f)

with open(f'{file_path}/mlb_{offset}.pkl', 'wb') as f:
    pickle.dump(mlb, f)

In [240]:
! tree -DU -h model_1m
# ! rm -rf model*

[01;34mmodel_1m[00m
|-- [ 481 Aug 29 17:52]  mlb_1m.pkl
|-- [125M Aug 29 17:52]  knn_model_1m.pkl
`-- [189M Aug 29 17:52]  df_books_1m.json

0 directories, 3 files


In [242]:
import numpy as np
import pickle
import pandas as pd
import json


def recommend(genres, offset):

    # Load the books data, mlb, and knn model
    books_df = pd.read_json(f'model_{offset}/df_books_{offset}.json', orient='records', lines=True)

    with open(f'model_{offset}/mlb_{offset}.pkl', 'rb') as f:
        mlb = pickle.load(f)

    with open(f'model_{offset}/knn_model_{offset}.pkl', 'rb') as f:
        knn = pickle.load(f)

    def create_query_vector(genres, num_features):
        query_genre_features = mlb.transform([genres])
        # Fill the rest of the vector with zeros to match the expected feature size
        query_vector = np.hstack([query_genre_features, np.zeros((1, num_features - query_genre_features.shape[1]))])
        return query_vector
    
    # Assuming 2703 features based on the error message\
    num_features = knn.n_features_in_
    query_vector = create_query_vector(genres, num_features)

    # Get the nearest neighbors
    distances, indices = knn.kneighbors(query_vector)
    recommended_books = books_df.iloc[indices[0]]

    # Optionally, exclude the query title from recommendations
    query_title = None  # Replace with the title you want to exclude
    if query_title:
        recommended_books = recommended_books[recommended_books['title'] != query_title]

    # Format the result
    result = recommended_books[
        [
            'title', 
            'rating', 
            'author_name',
            'genre', 
         ]
    ].to_dict(orient='records')
    final_data = {
        "model": f"knn_model_{offset}",
        "datapoints": books_df.shape[0],
        "features": num_features,
        "data": result
    }
    print(json.dumps(final_data, indent=2))

In [245]:
import numpy as np
import pickle
import pandas as pd
import json


offest = "1m"

# Load the books data, mlb, and knn model
books_df = pd.read_json(f'model_{offset}/df_books_{offset}.json', orient='records', lines=True)

with open(f'model_{offset}/mlb_{offset}.pkl', 'rb') as f:
    mlb = pickle.load(f)

with open(f'model_{offset}/knn_model_{offset}.pkl', 'rb') as f:
    knn = pickle.load(f)

In [246]:
def create_query_vector(genres, num_features):
    query_genre_features = mlb.transform([genres])
    # Fill the rest of the vector with zeros to match the expected feature size
    query_vector = np.hstack([query_genre_features, np.zeros((1, num_features - query_genre_features.shape[1]))])
    return query_vector

def recommend(genres, offset):
    
    # Assuming 2703 features based on the error message\
    num_features = knn.n_features_in_
    query_vector = create_query_vector(genres, num_features)

    # Get the nearest neighbors
    distances, indices = knn.kneighbors(query_vector)
    recommended_books = books_df.iloc[indices[0]]

    # Optionally, exclude the query title from recommendations
    query_title = None  # Replace with the title you want to exclude
    if query_title:
        recommended_books = recommended_books[recommended_books['title'] != query_title]

    # Format the result
    result = recommended_books[
        [
            'title', 
            'rating', 
            'author_name',
            'genre', 
         ]
    ].to_dict(orient='records')
    final_data = {
        "model": f"knn_model_{offset}",
        "datapoints": books_df.shape[0],
        "features": num_features,
        "data": result
    }
    print(json.dumps(final_data, indent=2))

In [254]:
recommend(['fiction', 'politics'], '1m')

{
  "model": "knn_model_1m",
  "datapoints": 1022335,
  "features": 16,
  "data": [
    {
      "title": "Crossover (Cassandra Kresnov, #1)",
      "rating": 3.84,
      "author_name": "Joel Shepherd",
      "genre": [
        "fiction",
        "politics"
      ]
    },
    {
      "title": "Moving Mars (Queen of Angels, #3)",
      "rating": 3.84,
      "author_name": "Greg Bear",
      "genre": [
        "fiction",
        "politics"
      ]
    },
    {
      "title": "Singularity Sky (Eschaton, #1)",
      "rating": 3.84,
      "author_name": "Charles Stross",
      "genre": [
        "fiction",
        "politics"
      ]
    },
    {
      "title": "Coyote",
      "rating": 3.84,
      "author_name": "Allen M. Steele",
      "genre": [
        "fiction",
        "politics"
      ]
    },
    {
      "title": "To Be Real: Telling the Truth and Changing the Face of Feminism",
      "rating": 3.84,
      "author_name": "Rebecca Walker",
      "genre": [
        "fiction",
        "p