In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [2]:
#DATA PRECPROCESSING-----------------------------------------------------------------------------------------------

# File paths
users_file = 'Users.csv'
books_file = 'Books.csv'
ratings_file = 'Ratings.csv'

# Load the datasets
users_df = pd.read_csv(users_file)
books_df = pd.read_csv(books_file)
ratings_df = pd.read_csv(ratings_file)

# Users Data Preprocessing
# Handle missing values in 'Age' and 'Location'
users_df['Age'].fillna(users_df['Age'].median(), inplace=True)
users_df['Location'].fillna('Unknown', inplace=True)


# Books Data Preprocessing
# Convert 'Year-Of-Publication' to numeric and handle non-numeric values
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'].fillna(books_df['Year-Of-Publication'].median(), inplace=True)
# Handle missing values in 'Book-Author' and 'Publisher'
books_df['Book-Author'].fillna('Unknown', inplace=True)
books_df['Publisher'].fillna('Unknown', inplace=True)

# Ratings Data Preprocessing
# Merge 'Ratings' with 'Books' to filter out books not in the 'Books' dataset
ratings_df = ratings_df[ratings_df['ISBN'].isin(books_df['ISBN'])]

# Data Splitting
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings_df, test_size=0.7, random_state=42)

# Export Processed Data
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

print(train_data)
# Checking the transformations and missing values after preprocessing
print("\nData after preprocessing:")
print("Unique values in 'Year-Of-Publication':", books_df['Year-Of-Publication'].unique())
print("\nMissing values after preprocessing:")
print("Users:", users_df.isnull().sum())
print("Books:", books_df.isnull().sum())
print("Ratings:", ratings_df.isnull().sum())



  books_df = pd.read_csv(books_file)


        User-ID        ISBN  Book-Rating
612733   148199  0671256084            5
264381    60858  0140013997            7
854111   206534  0701207639            0
83532     17443  1568581505            8
591287   142524  0874835186            0
...         ...         ...          ...
289630    69211  0312971397            8
409224    98391  0515132640            8
148815    33580  0446522856            0
750906   181944  0060185708            0
137978    31315  0446356751            0

[309340 rows x 3 columns]

Data after preprocessing:
Unique values in 'Year-Of-Publication': [2002. 2001. 1991. 1999. 2000. 1993. 1996. 1988. 2004. 1998. 1994. 2003.
 1997. 1983. 1979. 1995. 1982. 1985. 1992. 1986. 1978. 1980. 1952. 1987.
 1990. 1981. 1989. 1984.    0. 1968. 1961. 1958. 1974. 1976. 1971. 1977.
 1975. 1965. 1941. 1970. 1962. 1973. 1972. 1960. 1966. 1920. 1956. 1959.
 1953. 1951. 1942. 1963. 1964. 1969. 1954. 1950. 1967. 2005. 1957. 1940.
 1937. 1955. 1946. 1936. 1930. 2011. 1925. 1948. 

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Collaborative Filtering based on users' age
def collaborative_filtering(user_age, train_data, users_df):
    # Filter users based on age
    filtered_users = users_df[users_df['Age'] == user_age]

    # Filter training data based on selected users and non-zero ratings
    filtered_train_data = train_data[(train_data['User-ID'].isin(filtered_users['User-ID'])) & (train_data['Book-Rating'] != 0)]

    # Create a user-item matrix for collaborative filtering
    user_item_matrix = filtered_train_data.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
    user_item_matrix = user_item_matrix.fillna(0)

    print(user_item_matrix)
    # Calculate cosine similarity between users
    cosine_sim_users = cosine_similarity(user_item_matrix, user_item_matrix)

    return cosine_sim_users

# Content-Based Filtering based on book ratings
def content_based_filtering(train_data, books_df):
    # Get the top N books with the highest average ratings
    top_books = train_data.groupby('ISBN')['Book-Rating'].mean().reset_index()
    top_books = top_books.sort_values(by='Book-Rating', ascending=False).head(20000)

    # Merge with books_df to get additional book information
    top_books = pd.merge(top_books, books_df, on='ISBN')

    # Create a TF-IDF vectorizer for book titles
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(top_books['Book-Title'])

    # Apply Truncated SVD for dimensionality reduction
    svd = TruncatedSVD(n_components=100)
    tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

    # Calculate the cosine similarity between book titles
    cosine_sim_books = cosine_similarity(tfidf_matrix_reduced, tfidf_matrix_reduced)
    print(cosine_sim_books)
    return cosine_sim_books


# Hybrid Recommender System
def hybrid_recommender(user_age, user_id, train_data, books_df, users_df):
    # Collaborative Filtering
    collaborative_results = collaborative_filtering(user_age, train_data, users_df)

    # Content-Based Filtering
    content_based_results = content_based_filtering(train_data, books_df)

    # Combine the results using a weighted sum
    # Note: Here, we use the mean rating from collaborative filtering
    # and the cosine similarity values from content-based filtering
    collaborative_results_df = pd.DataFrame(collaborative_results)
    collaborative_results_df = collaborative_results_df.fillna(0)

    hybrid_results = pd.DataFrame(index=train_data['User-ID'], columns=train_data['ISBN'])
    hybrid_results['Hybrid-Score'] = (0.5 * collaborative_results_df.values +
                                       0.5 * content_based_results.loc[collaborative_results_df.index, collaborative_results_df.columns].fillna(0))

    # Sort the results by Hybrid-Score in descending order
    recommended_books = hybrid_results.unstack().reset_index(name='Hybrid-Score')
    user_rated_books = train_data[train_data['User-ID'] == user_id]['ISBN']
    recommended_books = recommended_books[~recommended_books['ISBN'].isin(user_rated_books)]

    return recommended_books[['ISBN', 'Hybrid-Score']].nlargest(5, 'Hybrid-Score')

# Example of using the hybrid recommender system
user_id = 123  # Replace with the actual user ID
user_age = 25  # Replace with the user's age

recommendations = hybrid_recommender(user_age, user_id, train_data, books_df, users_df)

# Display the top 5 recommended books
print("Top 5 Recommended Books:")
print(recommendations)


ISBN     0003300277  0006149995  0006240038  0006512674  0006550835  \
User-ID                                                               
843             0.0         0.0         0.0         0.0         0.0   
1137            0.0         0.0         0.0         0.0         0.0   
1167            0.0         0.0         0.0         0.0         0.0   
1511            0.0         0.0         0.0         0.0         0.0   
1725            0.0         0.0         0.0         0.0         0.0   
...             ...         ...         ...         ...         ...   
277085          0.0         0.0         0.0         0.0         0.0   
277205          0.0         0.0         0.0         0.0         0.0   
277619          0.0         0.0         0.0         0.0         0.0   
278378          0.0         0.0         0.0         0.0         0.0   
278401          0.0         0.0         0.0         0.0         0.0   

ISBN     0020442203  0020442505  0020442602  0020446500  0020518501  ...  \
