In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load dataset
try:
    df = pd.read_csv('cleaned_books_data.xls')
    print(f"Dataset loaded successfully, rows: {len(df)}")
except FileNotFoundError:
    print("Error: 'books_data_file.csv' not found, please check the file path.")
    exit()

# Check if required columns exist
required_columns = ['title', 'description']
if not all(col in df.columns for col in required_columns):
    print(f"Error: Dataset missing columns: {[col for col in required_columns if col not in df.columns]}")
    exit()

# Select columns and preprocess
df = df[required_columns].drop_duplicates().dropna()

print(f"Rows after preprocessing: {len(df)}")

# Combine title and description
df['content'] = df['title'] + ' ' + df['description']
print(f"Content column sample: {df['content'].head(2)}")

# Build TF-IDF model
tfidf = TfidfVectorizer(stop_words='english', max_features=500, min_df=5)
tfidf_matrix = tfidf.fit_transform(df['content'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"TF-IDF matrix memory usage: {tfidf_matrix.data.nbytes / 1e6:.2f} MB")

# Build NearestNeighbors model
nn = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute', n_jobs=-1)  # 11 to account for self
nn.fit(tfidf_matrix)
print("NearestNeighbors model fitted successfully")

# Create title-to-index mapping
indices = pd.Series(df.index, index=df['title']).drop_duplicates(keep='first')
print(f"Indices sample: {indices.head()}")

# Recommendation function
def recommend_books(title, nn_model=nn, top_n=10):
    print(f"Recommending for title: {title}")
    
    # Check if title exists
    if title not in indices:
        print(f"Title '{title}' not in dataset.")
        return None
    
    # Get index
    idx = indices[title]
    print(f"Index for title '{title}': {idx}")
    
    # Ensure idx is an integer
    if isinstance(idx, (int, np.integer)):
        idx = idx
    else:
        idx = idx[0]  # Take first if multiple
    
    # Get nearest neighbors
    distances, book_indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=top_n + 1)
    book_indices = book_indices.flatten()[1:]  # Exclude self
    distances = distances.flatten()[1:]  # Exclude self
    similarities = 1 - distances  # Convert distance to similarity
    
    print(f"Recommended book indices: {book_indices}")
    print(f"Similarity scores sample: {similarities[:5]}")
    
    # Return results
    result = df.iloc[book_indices][['title','description']].copy()
    result['similarity_score'] = similarities
    return result


Dataset loaded successfully, rows: 77168
Rows after preprocessing: 76542
Content column sample: 0    zap!_i'm_a_mind_reader whod thought mind readi...
1    the_original_adventures_of_hank_the_cowdog_(ha...
Name: content, dtype: object
TF-IDF matrix shape: (76542, 500)
TF-IDF matrix memory usage: 9.25 MB
NearestNeighbors model fitted successfully
Indices sample: title
zap!_i'm_a_mind_reader                                              0
the_original_adventures_of_hank_the_cowdog_(hank_the_cowdog,_#1)    1
lara_and_the_gray_mare_(hoofbeats:_lara_and_the_gray_mare,_#1)      2
the_gunny_sack_man                                                  3
gobbolino,_the_witch's_cat                                          4
dtype: int64


In [16]:
recommendations = recommend_books("zap!_i'm_a_mind_reader")
if recommendations is None:
    print("Recommendation result is None, please check debug information.")
else:
    print("Recommendation results:")
    print(recommendations)

Recommending for title: zap!_i'm_a_mind_reader
Index for title 'zap!_i'm_a_mind_reader': 0
Recommended book indices: [22949 47780 23053  9420 57186 37470 53139 40052 53460 68972]
Similarity scores sample: [0.61824968 0.49646353 0.4859334  0.47090507 0.45016153]
Recommendation results:
                                                  title  \
23037                                      the_castaway   
48057                                     the_paw_thing   
23141                           the_mean_old_mean_hyena   
9450                                     the_silly_book   
57528                                    this_is_silly!   
37678                             the_big_book_of_silly   
53453                                  the_silly_gooses   
40278           smelly_locker:_silly_dilly_school_songs   
53777         the_mean_and_vulgar_bits._kjartan_poskitt   
69404  the_mean_and_vulgar_bits:_fractions_and_averages   

                                             description  simila

In [17]:
# evaluate the accuray based on the similarity book
# since the result is only return top 10

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load dataset
try:
    df = pd.read_csv('cleaned_books_data.xls')
    print(f"Dataset loaded successfully, rows: {len(df)}")
except FileNotFoundError:
    print("Error: 'cleaned_books_data.xls' not found, please check the file path.")
    exit()

# Check if required columns exist
required_columns = ['book_id', 'title', 'description', 'similar_books']
if not all(col in df.columns for col in required_columns):
    print(f"Error: Dataset missing columns: {[col for col in required_columns if col not in df.columns]}")
    exit()

# Preprocess dataset
df = df[required_columns].drop_duplicates(subset=['book_id']).dropna(subset=['title', 'description'])
print(f"Rows after preprocessing: {len(df)}")

# Parse similar_books
def parse_similar_books(similar_books):
    try:
        if isinstance(similar_books, str):
            return ast.literal_eval(similar_books)
        return similar_books
    except (ValueError, SyntaxError):
        return []

df['similar_books'] = df['similar_books'].apply(parse_similar_books)
df = df[df['similar_books'].apply(len) > 0]
print(f"Rows with valid similar_books: {len(df)}")

df = df.reset_index(drop=True)

# Combine title and description
df['content'] = df['title'] + ' ' + df['description']
print(f"Content column sample: {df['content'].head(2)}")

# TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=500, min_df=5)
tfidf_matrix = tfidf.fit_transform(df['content'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"TF-IDF matrix memory usage: {tfidf_matrix.data.nbytes / 1e6:.2f} MB")

# Nearest Neighbors model
nn = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute', n_jobs=-1)
nn.fit(tfidf_matrix)
print("NearestNeighbors model fitted successfully")

# Mappings
indices = pd.Series(df.index, index=df['title']).drop_duplicates(keep='first')
book_id_to_index = pd.Series(df.index, index=df['book_id']).drop_duplicates(keep='first')
index_to_book_id = pd.Series(df['book_id'].values, index=df.index)

# Recommendation function
def recommend_books(title, nn_model=nn, top_n=10):
    if title not in indices:
        return None, None
    idx = indices[title]
    if isinstance(idx, (int, np.integer)):
        idx = idx
    else:
        idx = idx.iloc[0]
    distances, book_indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=top_n + 1)
    book_indices = book_indices.flatten()[1:]  # Exclude self
    recommended_book_ids = index_to_book_id.iloc[book_indices].values
    result = df.iloc[book_indices][['book_id', 'title', 'description']].copy()
    result['similarity_score'] = 1 - distances.flatten()[1:]
    return result, recommended_book_ids

# Evaluation function
def evaluate_recommendations(df, nn_model, top_n=10):
    precisions = []
    for idx, row in df.iterrows():
        title = row['title']
        ground_truth = set(row['similar_books'])
        recommendations, recommended_book_ids = recommend_books(title, nn_model, top_n)
        if recommendations is None:
            continue
        recommended_set = set(recommended_book_ids)
        intersection = recommended_set.intersection(ground_truth)
        precision = len(intersection) / top_n if top_n > 0 else 0
        precisions.append(precision)
        if (idx + 1) % 100 == 0:
            print(f"Evaluated {idx + 1} books, current mean precision: {np.mean(precisions):.4f}")
    mean_precision = np.mean(precisions) if precisions else 0
    return mean_precision, precisions

# Function to calculate precision@K for a specific book
def precision_at_k(title, df, nn_model, k=10):
    recommendations, recommended_book_ids = recommend_books(title, nn_model, top_n=k)
    if recommendations is None:
        print(f"Title '{title}' not found in the dataset.")
        return None
    idx = indices[title]
    ground_truth = set(df.loc[idx, 'similar_books'])
    recommended_set = set(recommended_book_ids)
    intersection = recommended_set.intersection(ground_truth)
    precision = len(intersection) / k
    return precision, intersection, ground_truth, recommended_set, recommendations

# Run evaluation on entire dataset
mean_precision, precisions = evaluate_recommendations(df, nn, top_n=10)
print(f"\nEvaluation complete!")
print(f"Mean Precision@10: {mean_precision:.4f}")
print(f"Number of books evaluated: {len(precisions)}")
print(f"Precision distribution: min={np.min(precisions):.4f}, max={np.max(precisions):.4f}, std={np.std(precisions):.4f}")

# Evaluate one specific book
title = "zap!_i'm_a_mind_reader"
result = precision_at_k(title, df, nn, k=10)
if result:
    precision, intersection, ground_truth, recommended_set, recommendations = result
    print(f"\nPrecision@10 for '{title}': {precision:.4f}")
    print("Recommendations:")
    print(recommendations[['book_id', 'title', 'similarity_score']])
    print(f"\nGround truth similar_books: {ground_truth}")
    print(f"Recommended book_ids: {recommended_set}")
    print(f"Intersection (correct matches): {intersection}")


Dataset loaded successfully, rows: 77168
Rows after preprocessing: 77168
Rows with valid similar_books: 35222
Content column sample: 0    zap!_i'm_a_mind_reader whod thought mind readi...
1    the_original_adventures_of_hank_the_cowdog_(ha...
Name: content, dtype: object
TF-IDF matrix shape: (35222, 500)
TF-IDF matrix memory usage: 4.44 MB
NearestNeighbors model fitted successfully
Evaluated 100 books, current mean precision: 0.0000
Evaluated 200 books, current mean precision: 0.0000
Evaluated 300 books, current mean precision: 0.0000
Evaluated 400 books, current mean precision: 0.0000
Evaluated 500 books, current mean precision: 0.0000
Evaluated 600 books, current mean precision: 0.0000
Evaluated 700 books, current mean precision: 0.0000
Evaluated 800 books, current mean precision: 0.0000
Evaluated 900 books, current mean precision: 0.0000
Evaluated 1000 books, current mean precision: 0.0000
Evaluated 1100 books, current mean precision: 0.0000
Evaluated 1200 books, current mean precis