In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [4]:
#DATA PRECPROCESSING-----------------------------------------------------------------------------------------------

# File paths
users_file = 'Users.csv'
books_file = 'Books.csv'
ratings_file = 'Ratings.csv'

# Load the datasets
users_df = pd.read_csv(users_file)
books_df = pd.read_csv(books_file)
ratings_df = pd.read_csv(ratings_file)

# Users Data Preprocessing
# Handle missing values in 'Age' and 'Location'
users_df['Age'].fillna(users_df['Age'].median(), inplace=True)
users_df['Location'].fillna('Unknown', inplace=True)



# Assuming 'Age' is the column containing ages in users_df
age_counter = Counter(users_df['Age'])

# Sort the age counter items by age
sorted_age_counter = sorted(age_counter.items(), key=lambda x: x[0])

# Print the sorted age counter
for age, count in sorted_age_counter:
    print(f"Age: {age}, Count: {count}")



# Books Data Preprocessing
# Convert 'Year-Of-Publication' to numeric and handle non-numeric values
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'].fillna(books_df['Year-Of-Publication'].median(), inplace=True)
# Handle missing values in 'Book-Author' and 'Publisher'
books_df['Book-Author'].fillna('Unknown', inplace=True)
books_df['Publisher'].fillna('Unknown', inplace=True)

# Ratings Data Preprocessing
# Merge 'Ratings' with 'Books' to filter out books not in the 'Books' dataset
ratings_df = ratings_df[ratings_df['ISBN'].isin(books_df['ISBN'])]

# Data Splitting
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings_df, test_size=0.9, random_state=42)

# Export Processed Data
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Checking the transformations and missing values after preprocessing
print("\nData after preprocessing:")
print("Unique values in 'Year-Of-Publication':", books_df['Year-Of-Publication'].unique())
print("\nMissing values after preprocessing:")
print("Users:", users_df.isnull().sum())
print("Books:", books_df.isnull().sum())
print("Ratings:", ratings_df.isnull().sum())



  books_df = pd.read_csv(books_file)


Age: 0.0, Count: 416
Age: 1.0, Count: 288
Age: 2.0, Count: 105
Age: 3.0, Count: 45
Age: 4.0, Count: 28
Age: 5.0, Count: 26
Age: 6.0, Count: 18
Age: 7.0, Count: 27
Age: 8.0, Count: 54
Age: 9.0, Count: 62
Age: 10.0, Count: 84
Age: 11.0, Count: 121
Age: 12.0, Count: 192
Age: 13.0, Count: 885
Age: 14.0, Count: 1962
Age: 15.0, Count: 2383
Age: 16.0, Count: 2570
Age: 17.0, Count: 3044
Age: 18.0, Count: 3703
Age: 19.0, Count: 3950
Age: 20.0, Count: 4056
Age: 21.0, Count: 4438
Age: 22.0, Count: 4714
Age: 23.0, Count: 5456
Age: 24.0, Count: 5687
Age: 25.0, Count: 5618
Age: 26.0, Count: 5547
Age: 27.0, Count: 5383
Age: 28.0, Count: 5347
Age: 29.0, Count: 5293
Age: 30.0, Count: 4778
Age: 31.0, Count: 4665
Age: 32.0, Count: 115543
Age: 33.0, Count: 4699
Age: 34.0, Count: 4656
Age: 35.0, Count: 4194
Age: 36.0, Count: 3896
Age: 37.0, Count: 3582
Age: 38.0, Count: 3404
Age: 39.0, Count: 3234
Age: 40.0, Count: 3142
Age: 41.0, Count: 2919
Age: 42.0, Count: 2758
Age: 43.0, Count: 2816
Age: 44.0, Count: 

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix



# Label Encoding for User-ID and ISBN
le_user = LabelEncoder()
le_book = LabelEncoder()

# Use label encoder for all values with numbers
users_df['User-ID'] = le_user.fit_transform(users_df['User-ID'])
books_df['ISBN'] = le_book.fit_transform(books_df['ISBN'])
train_data['User-ID'] = le_user.transform(train_data['User-ID'])
train_data['ISBN'] = le_book.transform(train_data['ISBN'])
test_data['User-ID'] = le_user.transform(test_data['User-ID'])
test_data['ISBN'] = le_book.transform(test_data['ISBN'])

# Collaborative Filtering
# Create user-item matrix
user_item_train_sparse = csr_matrix((train_data['Book-Rating'], (train_data['User-ID'], train_data['ISBN'])))

# Calculate user similarity using cosine similarity
user_similarity = cosine_similarity(user_item_train_sparse)

# Content-Based Filtering
# Combine relevant book information into a single string
books_df['Book-Info'] = books_df['Book-Title'] + ' ' + books_df['Book-Author'] + ' ' + books_df['Publisher']

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
book_tfidf_matrix = tfidf_vectorizer.fit_transform(books_df['Book-Info'])

# Calculate book similarity using cosine similarity
book_similarity = cosine_similarity(book_tfidf_matrix)

# # Hybrid Model
# def hybrid_recommendation(user_id, book_title):
#     # Collaborative Filtering
#     user_index = le_user.transform([user_id])[0]
#     book_index = le_book.transform([book_title])[0]

#     user_similarities = user_similarity[user_index]
#     user_ratings = train_data.iloc[user_index]

#     # Content-Based Filtering
#     book_similarities = book_similarity[book_index]
#     book_ratings = train_data[book_index]

#     # Combine predictions using a weighted average
#     hybrid_scores = 0.7 * user_similarities * user_ratings + 0.3 * book_similarities * book_ratings

#     # Exclude already rated books
#     rated_books = train_data[train_data['User-ID'] == user_id]['ISBN'].values
#     hybrid_scores[rated_books] = 0

#     # Get book recommendations
#     recommended_books = hybrid_scores.sort_values(ascending=False).index

#     return recommended_books

# # Example of making recommendations for a user
# user_id_to_recommend = 100
# book_title_to_recommend = 'The Da Vinci Code'

# recommendations = hybrid_recommendation(user_id_to_recommend, book_title_to_recommend)
# print("Hybrid Recommendations:", recommendations)


NameError: ignored