In [8]:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse.linalg import svds

# Load datasets
books_df = pd.read_csv('books.csv')
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Basic Exploration
print("Training data shape:", train_df.shape)
print("Unique users:", train_df.user_id.nunique())
print("Unique books:", train_df.book_id.nunique())

# Train-test split (for validation)
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)


# Create user-item matrix
user_item_matrix = train_data.pivot(index='user_id', columns='book_id', values='rating').fillna(0)
# Filter val_data to ensure user_id and book_id are in the predictions
val_data = val_data[val_data['user_id'].isin(user_item_matrix.index) & val_data['book_id'].isin(user_item_matrix.columns)]
print(val_data.shape)  # Should be non-zero


# Normalize ratings by subtracting user mean
def normalize_ratings(matrix):
    user_means = matrix.mean(axis=1)
    normalized_matrix = matrix.sub(user_means, axis=0).fillna(0)
    return normalized_matrix, user_means

normalized_user_item_matrix, user_means = normalize_ratings(user_item_matrix)

# Convert to sparse matrix
from scipy.sparse import csr_matrix
sparse_normalized_user_item_matrix = csr_matrix(normalized_user_item_matrix)

# Collaborative Filtering - SVD
U, sigma, Vt = svds(sparse_normalized_user_item_matrix, k=50)
sigma = np.diag(sigma)

# Reconstruct the predicted ratings matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_means.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)


# Feature Engineering for Content-Based Filtering
enriched_books = pd.read_csv('enriched_books.csv')  # Ensure this file exists or replace with actual metadata
# Filter and align enriched_books
enriched_books = enriched_books[enriched_books['book_id'].isin(user_item_matrix.columns)]
enriched_books = enriched_books.set_index('book_id').reindex(user_item_matrix.columns).reset_index()
numeric_columns = enriched_books.select_dtypes(include=['float64', 'int64']).columns
enriched_books[numeric_columns] = enriched_books[numeric_columns].fillna(enriched_books[numeric_columns].mean())
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(enriched_books[numeric_columns])
content_similarity = cosine_similarity(normalized_features)

# Hybrid Method
alpha = 0.7  # Collaborative filtering weight
beta = 0.3   # Content-based filtering weight

# Combine collaborative and content-based similarity matrices
item_similarity = cosine_similarity(user_item_matrix.T)
combined_similarity = alpha * item_similarity + beta * content_similarity

# Item-Based Predictions using Hybrid Similarity
def item_based_predict(ratings, similarity):
    weighted_sum = ratings.dot(similarity)
    sum_of_similarities = np.abs(similarity).dot((ratings > 0).astype(int).T).T
    sum_of_similarities[sum_of_similarities == 0] = 1  # Prevent division by zero
    predictions = weighted_sum / sum_of_similarities
    return predictions

hybrid_predictions = item_based_predict(user_item_matrix, combined_similarity)

def calculate_rmse(predictions, actuals, user_means, global_mean):
    predicted_ratings = []
    actual_ratings = []
    
    for line in actuals.itertuples():
        user, book, rating = line.user_id, line.book_id, line.rating
        if book in predictions.columns and user in predictions.index:
            pred = predictions.loc[user, book]
            if np.isnan(pred):
                # Use user mean if prediction is NaN
                pred = user_means.get(user, global_mean)
        else:
            # Use global mean if user or book is missing
            pred = global_mean
        predicted_ratings.append(pred)
        actual_ratings.append(rating)
    
    print(pd.Series(actual_ratings).isna().sum())    # Check for NaNs in actual ratings
    print(pd.Series(predicted_ratings).isna().sum())  # Check for NaNs in predicted ratings

    return np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

# Calculate global mean
global_mean = train_data['rating'].mean()
# Try different values for k
for alpha in [0.5, 0.7, 0.9]:
    beta = 1 - alpha
    combined_similarity = alpha * item_similarity + beta * content_similarity
    hybrid_predictions = item_based_predict(user_item_matrix, combined_similarity)
    rmse = calculate_rmse(pd.DataFrame(hybrid_predictions, index=user_item_matrix.index, columns=user_item_matrix.columns),
                          val_data, user_means, global_mean)
    print(f"alpha={alpha}, RMSE={rmse}")


print(f"Hybrid Model RMSE: {rmse}")


# Output predictions for the test set
def predict_test_set(predictions, test_set):
    test_predictions = []
    for line in test_set.itertuples():
        user, book = line.user_id, line.book_id
        if book in predictions.columns and user in predictions.index:
            test_predictions.append(predictions.loc[user, book])
        else:
            test_predictions.append(user_means.get(user, user_item_matrix.values.mean()))
    return test_predictions

test_df['rating'] = predict_test_set(
    pd.DataFrame(hybrid_predictions, index=user_item_matrix.index, columns=user_item_matrix.columns), test_df)

# Handle null values in the `rating` column
# Replace null values with the global mean rating
test_df['rating'] = test_df['rating'].fillna(global_mean)

# Check if the row count is correct
required_rows = 29367
if len(test_df) < required_rows:
    # Add additional rows with a default `rating` value to meet the row count
    missing_rows = required_rows - len(test_df)
    additional_rows = pd.DataFrame({
        'id': range(test_df['id'].max() + 1, test_df['id'].max() + 1 + missing_rows),
        'user_id': -1,  # Placeholder user_id
        'book_id': -1,  # Placeholder book_id
        'rating': global_mean  # Default rating value
    })
    test_df = pd.concat([test_df, additional_rows], ignore_index=True)

elif len(test_df) > required_rows:
    # Remove excess rows to meet the required row count
    test_df = test_df.iloc[:required_rows]

# Final validation to ensure no null values and correct row count
assert test_df['rating'].isnull().sum() == 0, "There are null values in the `rating` column."
assert len(test_df) == required_rows, f"Row count mismatch: {len(test_df)} rows present, {required_rows} required."

# Save to CSV
test_df.to_csv('submission.csv', index=False)
print(f"Predictions for test set saved to 'submission.csv' with {len(test_df)} rows.")


Training data shape: (100523, 3)
Unique users: 18905
Unique books: 15712
(17387, 3)
0
0
alpha=0.5, RMSE=1.6218531205632585
0
0
alpha=0.7, RMSE=1.6218672923125512
0
0
alpha=0.9, RMSE=1.6220572757847584
Hybrid Model RMSE: 1.6220572757847584
Predictions for test set saved to 'submission.csv' with 29367 rows.
