In [1]:
import numpy as np
import pandas as pd


In [2]:
# Only the first N_ROWS rows of each file are used, to keep the notebook fast.
# Passing `nrows` stops parsing early instead of loading the whole file and
# slicing it afterwards; it also yields stand-alone frames rather than slices.
N_ROWS = 50000

# Column 3 of Books.csv is read as text; it is converted to a number further
# down in the notebook.
books_df = pd.read_csv("data/raw_data/Books.csv", dtype={3: "str"}, nrows=N_ROWS)
ratings_df = pd.read_csv("data/raw_data/Ratings.csv", nrows=N_ROWS)
users_df = pd.read_csv("data/raw_data/Users.csv", nrows=N_ROWS)


### Books

In [3]:
# Preview the first rows of the raw books table.
books_df.head()


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
# Sanity check on the raw books table: fully identical rows and distinct ISBNs.
duplicate_row_mask = books_df.duplicated()
duplicats = duplicate_row_mask.sum()
books_nunique = books_df["ISBN"].nunique()

print(
    f"Number of duplicates: {duplicats}",
    f"Number of unique books: {books_nunique}",
    sep="\n",
)


Number of duplicates: 0
Number of unique books: 50000


In [5]:
# Drop the cover-image URL columns and switch to snake_case column names.
# `errors="ignore"` keeps the cell re-runnable once the columns are gone;
# `rename` already ignores keys that are no longer present.
books_df = books_df.drop(
    columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"], errors="ignore"
).rename(
    columns={
        "ISBN": "isbn",
        "Book-Title": "book_title",
        "Book-Author": "book_author",
        "Year-Of-Publication": "year",
        "Publisher": "publisher",
    }
)

print(f"books_df.shape before = {books_df.shape}")

# Parse the publication year by column *name* (not by position, which silently
# targets another column if the column order ever changes).
# With errors="coerce", values that cannot be parsed become NaN; those rows are
# dropped before the column is cast to int.
books_df = (
    books_df.assign(year=pd.to_numeric(books_df["year"], errors="coerce"))
    .dropna(subset=["year"])
    .astype({"year": int})
)

print(f"books_df.shape after = {books_df.shape}")


books_df.shape before = (50000, 5)
books_df.shape after = (50000, 5)


In [6]:
# Summary (count / unique / top / freq) of the text columns.
books_df.describe(include="object")


Unnamed: 0,isbn,book_title,book_author,publisher
count,50000,50000,50000,50000
unique,50000,46241,23359,4994
top,195153448,Wuthering Heights,Stephen King,Harlequin
freq,1,9,192,1206


In [7]:
# Summary statistics of the numeric columns; used to check the range of `year`.
books_df.describe(include=np.number)


Unnamed: 0,year
count,50000.0
mean,1959.89884
std,260.465229
min,0.0
25%,1990.0
50%,1996.0
75%,2000.0
max,2030.0


In [8]:
COLLECTION_YEAR = 2004  # The dataset was collected in 2004

# A year of 0, or a year after the collection date, is treated as incorrect.
invalid_year_mask = (books_df["year"] == 0) | (books_df["year"] > COLLECTION_YEAR)

# Copy of the offending rows with the year replaced by a missing value.
filtered_df = books_df[invalid_year_mask].astype({"year": float})
filtered_df["year"] = np.nan

print(f"Number of books with incorrect year of publication = {len(filtered_df)}")
print(f"This represents {round(len(filtered_df)/len(books_df)*100)} percent of the total number of books.")

# Remove the rows with an incorrect year from the books table.
books_df = books_df.loc[~invalid_year_mask].copy()


Number of books with incorrect year of publication = 874
This represents 2 percent of the total number of books.


The missing values could instead be imputed after splitting the dataset into training and test sets, which would prevent data leakage.
However, since these rows make up only about 2 percent of the sample, for this test assignment I will simply delete them.

### Ratings

In [9]:
# Give the ratings table snake_case column names.
RATINGS_COLUMN_NAMES = {
    "ISBN": "isbn",
    "User-ID": "user_id",
    "Book-Rating": "rating",
}
ratings_df = ratings_df.rename(columns=RATINGS_COLUMN_NAMES)


In [10]:
# Show floats with two decimals. This is a global pandas display option, so it
# also affects every later cell of the notebook.
pd.set_option("display.float_format", "{:.2f}".format)
# Distribution of the raw rating values.
ratings_df['rating'].describe()


count   50000.00
mean        3.18
std         3.93
min         0.00
25%         0.00
50%         0.00
75%         7.00
max        10.00
Name: rating, dtype: float64

### Users


In [11]:
# Fixed seed so the 30% user sample (and every number derived from it further
# down) is the same on each Restart & Run All.
USER_SAMPLE_SEED = 42

# Keep only the user id and crop the table to a 30% random sample of users.
# `errors="ignore"` lets the drop succeed when the columns are already gone.
# NOTE: re-running this cell without re-loading `users_df` samples 30% of the
# already-cropped table again.
users_df = (
    users_df.rename(columns={"User-ID": "user_id"})
    .drop(columns=["Location", "Age"], errors="ignore")
    .sample(frac=0.3, random_state=USER_SAMPLE_SEED)  # crop dataframe
)


In [12]:
# Sanity check on the sampled users table: identical rows and distinct user ids.
duplicate_row_mask = users_df.duplicated()
duplicats = duplicate_row_mask.sum()
user_nunique = users_df["user_id"].nunique()

print(
    f"Number of duplicates: {duplicats}",
    f"Number of unique users: {user_nunique}",
    sep="\n",
)


Number of duplicates: 0
Number of unique users: 15000


In [13]:
# Inner join: keep only the ratings whose user_id is present in the sampled users table.
ratings_users = users_df.merge(right=ratings_df, how="inner", on="user_id")


In [14]:
# Let's keep only the rows that are in both datasets.
# Only the join key is taken from the books table, so no book attributes have
# to be dropped again after the merge.
known_isbns = books_df[["isbn"]]
ratings_users = ratings_users.merge(right=known_isbns, how="inner", on="isbn")


In [15]:
# Preview the merged (user_id, isbn, rating) table.
ratings_users.head()


Unnamed: 0,user_id,isbn,rating
0,6575,1714600,0
1,6575,28604199,0
2,6575,28606736,0
3,6575,30640075,0
4,6575,60002093,0


In [16]:
def __drop_zero_rating(data: pd.DataFrame, groupby_column: str) -> pd.DataFrame:
    """Drop every group whose ratings sum to zero.

    Args:
        data: Frame with a ``rating`` column and the ``groupby_column`` column.
        groupby_column: Column whose values define the groups.

    Returns:
        The rows of ``data`` that belong to groups with a non-zero rating sum.
    """
    # Select "rating" before aggregating: summing every column is wasted work
    # and fails on columns that cannot be summed.
    rating_sums = data.groupby(groupby_column)["rating"].sum()
    zero_rating_keys = rating_sums.index[rating_sums == 0]

    return data[~data[groupby_column].isin(zero_rating_keys)]


def __more_than_ratings(data: pd.DataFrame, groupby_column: str, min_ratings: int) -> pd.DataFrame:
    """Keep only the groups that have at least ``min_ratings`` rating rows.

    Args:
        data: Frame with a ``rating`` column and the ``groupby_column`` column.
        groupby_column: Column whose values define the groups.
        min_ratings: Minimum number of non-missing ratings a group needs.

    Returns:
        The rows of ``data`` that belong to sufficiently large groups.
    """
    rating_counts = data.groupby(groupby_column)["rating"].count()
    frequent_keys = rating_counts.index[rating_counts >= min_ratings]

    return data[data[groupby_column].isin(frequent_keys)]


def clean_up_data(
    data: pd.DataFrame, groupby_column: str, min_ratings: int
) -> pd.DataFrame:
    """Remove all-zero groups, then groups with fewer than ``min_ratings`` rows.

    Args:
        data: Frame with a ``rating`` column and the ``groupby_column`` column.
        groupby_column: Column to group by (e.g. ``"user_id"`` or ``"isbn"``).
        min_ratings: Minimum number of ratings a group must have to be kept.

    Returns:
        The filtered frame; the original index labels are preserved.
    """
    data = __drop_zero_rating(data, groupby_column)
    data = __more_than_ratings(data, groupby_column, min_ratings)

    return data


In [17]:
# First keep users with at least 10 ratings (and at least one non-zero rating),
# then keep books with at least 5 ratings (and at least one non-zero rating).
# NOTE(review): the user filter is not repeated after the book filter, so the
# book filter may leave some users with fewer than 10 rows — confirm this is intended.
ratings_users = clean_up_data(data=ratings_users, groupby_column="user_id", min_ratings=10)
ratings_users = clean_up_data(data=ratings_users, groupby_column="isbn", min_ratings=5)
ratings_users.shape


(400, 3)

As part of the test task, I will **remove** books that have **fewer than 5 ratings**. 44.7% of the books were rated only once. I don't want to keep these books for training the model, because the rating matrix would be extremely sparse, which would hurt the quality of the predictions. 

It is potentially possible to cluster books with a small number of ratings and use this clustering for users with specific preferences.

In [18]:
def train_test(
    data: pd.DataFrame, test_size: float, random_state=None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split ratings per user, holding out a share of each user's non-zero ratings.

    For every ``user_id`` group, ``int(n_rated * test_size)`` rows with
    ``rating > 0`` are drawn at random (without replacement) for the test set;
    all remaining rows of that user, including the zero ratings, go to the
    training set.

    Args:
        data: Frame with at least the columns ``user_id`` and ``rating``.
        test_size: Fraction (0..1) of each user's positive ratings to hold out.
        random_state: Optional integer seed. ``None`` (default) draws from
            NumPy's global random state, as before.

    Returns:
        ``(train_data, test_data)``, both with a fresh 0..n-1 index and the
        same columns and dtypes as ``data``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Collect the per-user pieces and concatenate once at the end; growing a
    # frame with concat inside the loop is quadratic, and starting from an empty
    # frame turns every column into dtype object.
    train_parts = []
    test_parts = []

    for _, user_data in data.groupby("user_id"):
        # Work with positions inside the group so duplicate index labels are harmless.
        rated_positions = np.flatnonzero((user_data["rating"] > 0).to_numpy())
        test_count = int(len(rated_positions) * test_size)

        held_out = np.zeros(len(user_data), dtype=bool)
        if test_count > 0:
            chosen = rng.choice(rated_positions, size=test_count, replace=False)
            held_out[chosen] = True
            test_parts.append(user_data.iloc[chosen])
        if not held_out.all():
            train_parts.append(user_data[~held_out])

    # Empty frame with the input's columns and dtypes, for sides with no rows.
    empty = data.iloc[0:0].reset_index(drop=True)
    train_data = pd.concat(train_parts, ignore_index=True) if train_parts else empty
    test_data = pd.concat(test_parts, ignore_index=True) if test_parts else empty.copy()

    return train_data, test_data

# Hold out 20% of every user's non-zero ratings for evaluation.
train_data, test_data = train_test(ratings_users, test_size=0.2)
# Disabled experiment: train on the first 100 rows only.
# crop_train_data = train_data.iloc[:100]


In [19]:
# Count the zero and non-zero ratings in the training split.
is_zero_rating = train_data["rating"] == 0
null_vals = int(is_zero_rating.sum())
not_null_vals = int((~is_zero_rating).sum())

print(f"There are {null_vals} zeros in the trimmed sample.")
print(f"There are {not_null_vals} non-zeros in the trimmed sample.")


There are 233 zeros in the trimmed sample.
There are 154 non-zeros in the trimmed sample.


In [20]:
# User x book rating matrix; (user, book) pairs without a training rating become 0.
# `pivot` raises if the same (user_id, isbn) pair occurs more than once.
A: pd.DataFrame = (
    train_data.pivot(values="rating", index="user_id", columns="isbn")
    .infer_objects()
    .fillna(0)
    .astype(int)
)

# Min-max scale with *scalar* bounds taken over the whole matrix.
# np.min(DataFrame) / np.max(DataFrame) return per-column Series on pandas < 2.0
# and scalars on pandas >= 2.0, so the bounds are computed on the array instead.
rating_min = A.to_numpy().min()
rating_max = A.to_numpy().max()
rating_range = rating_max - rating_min
if rating_range == 0:
    raise ValueError("All training ratings are identical; cannot min-max scale.")

A_norm: pd.DataFrame = (A - rating_min) / rating_range

# Put the held-out ratings on the same scale as the training matrix.
# NOTE: this overwrites test_data["rating"], so re-running the cell without
# re-creating test_data rescales the test ratings a second time.
test_data["rating"] = (test_data["rating"].astype(float) - rating_min) / rating_range


In [21]:
def SVD(A: np.ndarray, d: int, learning_rate: float, lambda_reg, n_iters, random_state=None):
    """Factorise a rating matrix as ``A ~ U @ V`` with stochastic gradient descent.

    Only entries with ``A[i, j] > 0`` are treated as observed ratings. Each
    iteration picks one observed entry at random and updates the matching row
    of ``U`` and column of ``V`` with an L2-regularised gradient step.

    Args:
        A: 2-D rating matrix (rows x columns); entries ``<= 0`` are ignored.
        d: Number of latent factors.
        learning_rate: SGD step size.
        lambda_reg: L2 regularisation strength.
        n_iters: Number of SGD steps (one observed entry per step).
        random_state: Optional integer seed. ``None`` (default) draws from
            NumPy's global random state, as before.

    Returns:
        ``(U, V, mse_start, mse)`` where ``U`` has shape ``(rows, d)``, ``V`` has
        shape ``(d, columns)``, ``mse_start`` is the error on the observed
        entries before training and ``mse`` is the list of errors after each
        of the ``n_iters`` steps.

    Raises:
        ValueError: If ``A`` has no positive entry to learn from.
    """
    A = np.asarray(A, dtype=float)
    observed = A > 0
    non_zero = (A != 0).sum()
    # Row-major list of observed (i, j) pairs, same order as a nested i/j loop.
    index = np.argwhere(observed)
    if non_zero == 0 or len(index) == 0:
        raise ValueError("A has no positive entries to fit.")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Mean of the non-zero entries; every factor starts at this value.
    mu = A.sum() / non_zero

    # Initialize matrices U and V with dimensions (rows of A, d) and (d, columns of A), filled with the mean value mu
    U = np.zeros((A.shape[0], d)) + mu
    V = np.zeros((d, A.shape[1])) + mu

    def _observed_mse():
        # Squared error over the observed entries, computed in one vectorised
        # pass instead of a Python loop over the whole matrix.
        residual = (A - U @ V)[observed]
        return np.sum(residual ** 2) / non_zero

    mse_start = _observed_mse()
    mse = []

    # Stochastic Gradient Descent loop over specified number of iterations
    for _ in range(n_iters):
        i, j = index[rng.randint(0, len(index))]

        # Update factors U and V for chosen element (i, j).
        # The error is re-evaluated before each update, so V[k, j] is updated
        # with the already-updated U[i, k] (kept exactly as in the original rule).
        for k in range(d):
            U[i, k] = U[i, k] + learning_rate * ((A[i, j] - np.dot(U[i, :], V[:, j])) * V[k, j] - lambda_reg * U[i, k])
            V[k, j] = V[k, j] + learning_rate * ((A[i, j] - np.dot(U[i, :], V[:, j])) * U[i, k] - lambda_reg * V[k, j])

        mse.append(_observed_mse())

    return U, V, mse_start, mse


In [22]:
# Fit 10 latent factors on the normalised rating matrix with 1000 SGD steps.
U, V, mse_start, mse = SVD(A_norm.values, 10, learning_rate=0.005, lambda_reg=0.02, n_iters=1000)


In [23]:
def evaluate(A: pd.DataFrame, test_data: pd.DataFrame, U: np.ndarray, V: np.ndarray):
    """Mean squared error of the factorisation ``U @ V`` on held-out ratings.

    Args:
        A: Training matrix; only its index (row labels) and columns (column
            labels) are used, to map test rows onto rows of ``U`` / columns of ``V``.
        test_data: Frame whose first three columns are, by position, the row
            label, the column label and the true rating.
        U: Factor matrix with one row per label in ``A.index``.
        V: Factor matrix with one column per label in ``A.columns``.

    Returns:
        The MSE over the test rows whose labels both exist in ``A``. Returns 0
        for an empty ``test_data`` and NaN when no test row can be matched.
    """
    import warnings

    if len(test_data) == 0:
        return 0

    U = np.asarray(U)
    V = np.asarray(V)

    # Translate labels to positions in one pass; -1 marks a label missing from A.
    row_pos = A.index.get_indexer(test_data.iloc[:, 0])
    col_pos = A.columns.get_indexer(test_data.iloc[:, 1])
    true_vals = test_data.iloc[:, 2].to_numpy(dtype=float)

    known = (row_pos >= 0) & (col_pos >= 0)
    n_unknown = int((~known).sum())
    if n_unknown:
        # A label absent from the training matrix has no learned factors.
        warnings.warn(
            f"{n_unknown} test rows reference labels missing from the training "
            "matrix and were skipped."
        )
    if not known.any():
        return float("nan")

    # Row-wise dot product of the matching U rows and V columns.
    predictions = np.sum(U[row_pos[known]] * V[:, col_pos[known]].T, axis=1)
    # Predictions are capped at 1 only.
    # NOTE(review): there is no lower clip at 0 — confirm this is intended.
    predictions = np.minimum(predictions, 1)

    return float(np.mean((true_vals[known] - predictions) ** 2))


In [24]:
# Error on the held-out ratings, measured on the normalised rating scale.
mse_test = evaluate(A_norm, test_data, U, V)
print(f"MSE on the test sample: {mse_test}")


MSE on the test sample: 0.06301236579139169
