In [1]:
import numpy as np
import pandas as pd

### Train data

In [2]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0


In [3]:
print(df_train.isnull().sum())

book_id    0
user_id    0
rating     0
dtype: int64


In [4]:
n_users = df_train.user_id.nunique()
n_items = df_train.book_id.nunique()
print ('Number of users = ' + str(n_users) + ' | Number of books = ' + str(n_items))

Number of users = 18905 | Number of books = 15712


In [5]:
df_train.describe()

Unnamed: 0,book_id,user_id,rating
count,100523.0,100523.0,100523.0
mean,49799.616217,45473.754852,2.361738
std,82711.354018,26921.5636,1.106863
min,1.0,37.0,1.0
25%,3754.0,22696.0,1.5
50%,11916.0,44711.0,2.0
75%,37735.0,68147.0,3.0
max,249242.0,94399.0,5.0


index not starting at zero -> id much higher than nb of line/columns 

### Test data

In [6]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,book_id,user_id
0,0,3786,40484
1,1,1985,47039
2,2,2290,60111
3,3,118657,64447
4,4,1560,2953


In [7]:
print(df_test.isnull().sum())

id         0
book_id    0
user_id    0
dtype: int64


In [8]:
n_users_test = df_test.user_id.unique().shape[0]
n_items_test = df_test.book_id.unique().shape[0]
print ('Number of users = ' + str(n_users_test) + ' | Number of books = ' + str(n_items_test))

Number of users = 6519 | Number of books = 9371


### Recommender

In [None]:
def map_ids(data, column_name):
    """Map data IDs to zero-based indices"""
    unique_ids = data[column_name].unique()
    id_to_index = {old: new for new, old in enumerate(unique_ids)}
    return data[column_name].map(id_to_index), id_to_index

# Applying the mapping function to both train and test data
df_train['user_id'], user_mapping = map_ids(df_train, 'user_id')
df_train['book_id'], book_mapping = map_ids(df_train, 'book_id')

n_users = df_train['user_id'].nunique()
n_books = df_train['book_id'].nunique()

In [10]:
df_train.head()

Unnamed: 0,book_id,user_id,rating
0,0,0,3.5
1,1,1,4.0
2,2,2,1.0
3,3,3,3.0
4,4,4,5.0


In [11]:
df_train.describe()

Unnamed: 0,book_id,user_id,rating
count,100523.0,100523.0,100523.0
mean,4985.189409,4149.39773,2.361738
std,4030.433657,4360.990111,1.106863
min,0.0,0.0,1.0
25%,1536.0,919.0,1.5
50%,3951.0,2559.0,2.0
75%,7761.0,5870.0,3.0
max,15711.0,18904.0,5.0


now value of id max is same as number of line/columns

In [None]:
def create_data_matrix(data, n_users, n_books):
    """Create user-item matrix"""
    data_matrix = np.zeros((n_users, n_books))
    for line in data.itertuples():
        data_matrix[line[2], line[1]] = line[3]
    return data_matrix

train_data_matrix = create_data_matrix(df_train, n_users, n_books)


In [13]:
print(train_data_matrix.shape)

(18905, 15712)


In [14]:
train_data_matrix

array([[3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 4. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [15]:
## check for one value 
df_train[df_train["user_id"]==0]

Unnamed: 0,book_id,user_id,rating
0,0,0,3.5
1412,1259,0,2.5
11445,2989,0,2.0
19016,5684,0,1.5
22810,2641,0,2.5
46391,2482,0,2.0
52041,2389,0,2.5
54017,1556,0,2.0
59832,2742,0,3.5
61883,103,0,2.0


In [16]:
train_data_matrix[0][2989]

2.0

### Item-based similarity

Item-based better because each user will likely rate only a small subset of the available books --> user-item matrix will be sparse

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Similarity with data without feature addition 
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import pairwise_distances

# Similary with enriched data 
enriched_books = pd.read_csv('enriched_books.csv')
enriched_books = enriched_books[enriched_books['book_id'].isin(book_mapping.keys())]
enriched_books['book_id'] = enriched_books['book_id'].map(book_mapping)
enriched_books = enriched_books.sort_values(by='book_id')
numeric_columns = enriched_books.select_dtypes(include=['float64', 'int64']).columns
enriched_books[numeric_columns] = enriched_books[numeric_columns].fillna(enriched_books[numeric_columns].mean())

scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(enriched_books[numeric_columns])

content_similarity = 1 - pairwise_distances(normalized_features, metric='cosine')

if item_similarity.shape != content_similarity.shape:
    raise ValueError("Shapes of item_similarity and content_similarity do not match. Please verify alignment.")

# Combine collaborative filtering similarity with content-based similarity
alpha = 0.5  # Weight for collaborative filtering
beta = 0.5   # Weight for content similarity
combined_similarity = alpha * item_similarity + beta * content_similarity


Content-based similarity computed successfully.


In [19]:
def item_based_predict(ratings, similarity):
    # Calculate the dot product between the ratings and the similarity matrix
    weighted_sum = ratings.dot(similarity)
    
    # Calculate the sum of similarities for each user and item pair
    sum_of_similarities = np.abs(similarity).dot((ratings > 0).astype(int).T).T
    
    # Prevent division by zero
    sum_of_similarities[sum_of_similarities == 0] = 1
    
    # Calculate the predicted ratings
    predictions = weighted_sum / sum_of_similarities
    
    return predictions

item_prediction = item_based_predict(train_data_matrix, item_similarity)
print(item_prediction)

[[2.69590358 0.         2.         ... 0.         0.         0.        ]
 [0.         4.         0.         ... 0.         0.         0.        ]
 [1.56190587 0.         1.82738198 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
from sklearn.metrics import mean_squared_error

# First try, comput the RMSE with the data without the features augmentation 
predictions = item_based_predict(train_data_matrix, item_similarity)

predicted_ratings = []
actual_ratings = []

for line in df_train.itertuples():
    user = line.user_id
    book = line.book_id
    actual_ratings.append(line.rating)
    predicted_ratings.append(predictions[user, book])

# Compute RMSE
rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"RMSE: {rmse}")


RMSE: 0.48127727312253343


In [None]:
# Second try, compute RMSE with feature augmentation
predictions_2 = item_based_predict(train_data_matrix, combined_similarity)

predicted_ratings = []
actual_ratings = []

for line in df_train.itertuples():
    user = line.user_id
    book = line.book_id
    actual_ratings.append(line.rating)
    predicted_ratings.append(predictions_2[user, book])

rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"RMSE: {rmse}")


RMSE: 0.6348777623546191


In [47]:
df_test ['user_id'], user_mapping = map_ids(df_test, 'user_id')
df_test['book_id'], book_mapping = map_ids(df_test, 'book_id')

n_users_test = df_test.user_id.unique().shape[0]
n_books_test = df_test.book_id.unique ().shape[0]

test_data_matrix = create_data_matrix(df_test, n_users_test, n_books_test)

IndexError: index 9371 is out of bounds for axis 1 with size 9371