In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [5]:
#DATA PRECPROCESSING-----------------------------------------------------------------------------------------------

# File paths
users_file = 'Users.csv'
books_file = 'Books.csv'
ratings_file = 'Ratings.csv'

# Load the datasets
users_df = pd.read_csv(users_file)
books_df = pd.read_csv(books_file)
ratings_df = pd.read_csv(ratings_file)

# Users Data Preprocessing
# Handle missing values in 'Age' and 'Location'
users_df['Age'].fillna(users_df['Age'].median(), inplace=True)
users_df['Location'].fillna('Unknown', inplace=True)


# Books Data Preprocessing
# Convert 'Year-Of-Publication' to numeric and handle non-numeric values
books_df['Year-Of-Publication'] = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
books_df['Year-Of-Publication'].fillna(books_df['Year-Of-Publication'].median(), inplace=True)
# Handle missing values in 'Book-Author' and 'Publisher'
books_df['Book-Author'].fillna('Unknown', inplace=True)
books_df['Publisher'].fillna('Unknown', inplace=True)

# Ratings Data Preprocessing
# Merge 'Ratings' with 'Books' to filter out books not in the 'Books' dataset
ratings_df = ratings_df[ratings_df['ISBN'].isin(books_df['ISBN'])]

# Data Splitting
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(ratings_df, test_size=0.9, random_state=42)

# Export Processed Data
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Checking the transformations and missing values after preprocessing
print("\nData after preprocessing:")
print("Unique values in 'Year-Of-Publication':", books_df['Year-Of-Publication'].unique())
print("\nMissing values after preprocessing:")
print("Users:", users_df.isnull().sum())
print("Books:", books_df.isnull().sum())
print("Ratings:", ratings_df.isnull().sum())



  books_df = pd.read_csv(books_file)



Data after preprocessing:
Unique values in 'Year-Of-Publication': [2002. 2001. 1991. 1999. 2000. 1993. 1996. 1988. 2004. 1998. 1994. 2003.
 1997. 1983. 1979. 1995. 1982. 1985. 1992. 1986. 1978. 1980. 1952. 1987.
 1990. 1981. 1989. 1984.    0. 1968. 1961. 1958. 1974. 1976. 1971. 1977.
 1975. 1965. 1941. 1970. 1962. 1973. 1972. 1960. 1966. 1920. 1956. 1959.
 1953. 1951. 1942. 1963. 1964. 1969. 1954. 1950. 1967. 2005. 1957. 1940.
 1937. 1955. 1946. 1936. 1930. 2011. 1925. 1948. 1943. 1947. 1945. 1923.
 2020. 1939. 1926. 1938. 2030. 1911. 1904. 1949. 1932. 1928. 1929. 1927.
 1931. 1914. 2050. 1934. 1910. 1933. 1902. 1924. 1921. 1900. 2038. 2026.
 1944. 1917. 1901. 2010. 1908. 1906. 1935. 1806. 2021. 2012. 2006. 1909.
 2008. 1378. 1919. 1922. 1897. 2024. 1376. 2037.]

Missing values after preprocessing:
Users: User-ID     0
Location    0
Age         0
dtype: int64
Books: ISBN                   0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher           

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the preprocessed data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Collaborative Filtering based on users' age and location
def collaborative_filtering(user_age, user_location, train_data):
    # Filter users based on age and location
    filtered_users = users_df[(users_df['Age'] == user_age) & (users_df['Location'] == user_location)]

    # Filter training data based on selected users
    filtered_train_data = train_data[train_data['User-ID'].isin(filtered_users['User-ID'])]

    # Group by book ISBN and calculate the mean rating
    book_ratings = filtered_train_data.groupby('ISBN')['Book-Rating'].mean().reset_index()

    return book_ratings

# Content-Based Filtering based on book ratings
def content_based_filtering(train_data, books_df):
    # Merge training data with book information
    merged_data = pd.merge(train_data, books_df, on='ISBN')

    # Group by book ISBN and calculate the mean rating
    book_ratings = merged_data.groupby('ISBN')['Book-Rating'].mean().reset_index()

    # Merge with books_df to get additional book information
    book_data = pd.merge(book_ratings, books_df, on='ISBN')

    # Create a TF-IDF vectorizer for book titles
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(book_data['Book-Title'])

    # Calculate the cosine similarity between book titles
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    return cosine_sim

# Hybrid Recommender System
def hybrid_recommender(user_age, user_location, user_id, train_data, books_df):
    # Collaborative Filtering
    collaborative_results = collaborative_filtering(user_age, user_location, train_data)

    # Content-Based Filtering
    content_based_results = content_based_filtering(train_data, books_df)

    # Combine the results using a weighted average or any other method
    # For simplicity, we will use the mean rating from collaborative filtering
    # and the mean rating from content-based filtering
    hybrid_results = pd.merge(collaborative_results, content_based_results, on='ISBN')
    hybrid_results['Hybrid-Score'] = (hybrid_results['Book-Rating_x'] + hybrid_results['Book-Rating_y']) / 2

    # Sort the results by Hybrid-Score in descending order
    recommended_books = hybrid_results.sort_values(by='Hybrid-Score', ascending=False)

    # Exclude books that the user has already rated
    user_rated_books = train_data[train_data['User-ID'] == user_id]['ISBN']
    recommended_books = recommended_books[~recommended_books['ISBN'].isin(user_rated_books)]

    return recommended_books

# Example of using the hybrid recommender system
user_id = 123  # Replace with the actual user ID
user_age = 25  # Replace with the user's age
user_location = 'New York'  # Replace with the user's location

recommendations = hybrid_recommender(user_age, user_location, user_id, train_data, books_df)

# Display the top N recommended books
top_n = 10
print(f"Top {top_n} Recommended Books:")
print(recommendations.head(top_n))