- Popularity is defined by the number of reviews.

In [None]:
import pandas as pd
from IPython.display import display, HTML

# Rating matrix; the first CSV column is used as the row index.
Rmat = pd.read_csv('Rmat.csv', index_col=0)

# NOTE: `movies_url` and `small_image_url` are not assigned in this cell
# (only commented-out placeholders existed here); they must be defined
# before this cell is run.
movies = pd.read_csv(
    movies_url,
    sep="::",
    engine="python",
    header=None,
    names=["MovieID", "Title", "Genres"],
    encoding="latin1",
)
movies["MovieID"] = movies["MovieID"].astype(int)
movies["ImageURL"] = movies["MovieID"].map(
    lambda movie_id: f"{small_image_url}{movie_id}.jpg?raw=true"
)

# Popularity = number of non-missing ratings in each movie column.
popularity = Rmat.count(axis=0)
movie_ids = Rmat.columns.str.replace("m", "").astype(int)
popularity_df = pd.DataFrame(
    {"MovieID": movie_ids, "NumReviews": popularity.values}
).sort_values(by="NumReviews", ascending=False)

# Attach title / poster information to the ten most-reviewed movies.
popular_movies = popularity_df.head(10).merge(movies, on="MovieID")


def _movie_row_html(movie):
    """Render one movie (poster, title, id) as an HTML table row."""
    return f"""
    <tr>
        <td><img src="{movie['ImageURL']}" height="150"></td>
        <td>{movie['Title']}</td>
        <td>Movie ID: {movie['MovieID']}</td>
    </tr>
    """


html_content = (
    "<h2>Top 10 Popular Movies</h2><table>"
    + "".join(_movie_row_html(movie) for _, movie in popular_movies.iterrows())
    + "</table>"
)

display(HTML(html_content))


0,1,2
,American Beauty (1999),Movie ID: 2858
,Star Wars: Episode IV - A New Hope (1977),Movie ID: 260
,Star Wars: Episode V - The Empire Strikes Back (1980),Movie ID: 1196
,Star Wars: Episode VI - Return of the Jedi (1983),Movie ID: 1210
,Jurassic Park (1993),Movie ID: 480
,Saving Private Ryan (1998),Movie ID: 2028
,Terminator 2: Judgment Day (1991),Movie ID: 589
,"Matrix, The (1999)",Movie ID: 2571
,Back to the Future (1985),Movie ID: 1270
,"Silence of the Lambs, The (1991)",Movie ID: 593


# System 2

## 1. Normalize the rating matrix by centering each row.

In [None]:
import pandas as pd
import numpy as np
# Render floats with 7 decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.7f}'.format)

# `Rmat` is read from 'Rmat.csv' in an earlier cell; alternative loader kept for reference:
#Rmat = pd.read_csv(path + 'Rmat.csv', index_col=0)

def normalize_ratings(matrix):
    """Center every row of a rating matrix on that row's mean rating.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Rating matrix with one row per user; missing ratings are NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels in which each observed rating has
        had the mean of the observed ratings in its row subtracted.
        Missing entries stay NaN, a row with no ratings stays all-NaN, and
        the input frame is not modified.
    """
    # `mean` skips NaN by default, so each mean uses only the observed ratings.
    row_means = matrix.mean(axis=1)
    # axis=0 lines the per-row means up with the row labels, replacing the
    # former Python-level loop with one `.loc` assignment per user.
    return matrix.sub(row_means, axis=0)

# Row-centered ratings; this is the input of the similarity computation in step 2.
R_normalized = normalize_ratings(Rmat)

In [None]:
# Inspect the row-centered rating matrix.
R_normalized

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
u1,0.8113208,,,,,,,,,,...,,,,,,,,,,
u10,0.8852868,,,,,,,,,,...,,,,,,,,,,
u100,,,,,,,,,,,...,,,,,,,,,,
u1000,0.8690476,,,,,,,,,,...,,,,,,,,,,
u1001,0.3474801,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
u995,,,,,,,,,,,...,,,,,,,,,,
u996,0.0641892,,,,,,,,,,...,,,,,,,,,,-0.9358108
u997,0.0666667,,,,,,,,,,...,,,,,,,,,,
u998,,,,,,,,,,,...,,,,,,,,,,


## 2. Compute the (transformed) Cosine similarity among the 3,706 movies.

In [None]:
import numpy as np
import pandas as pd

# Remember the labels so the similarity matrix can be re-labelled afterwards,
# and hand the numeric routine a bare NumPy array.
Rmat_indices, Rmat_columns = R_normalized.index, R_normalized.columns
Rmat_normalized = R_normalized.to_numpy()

def compute_cosine_similarity(matrix, min_shared=3):
    """Transformed cosine similarity between every pair of columns.

    For columns ``i`` and ``j`` only the rows where both entries are
    non-NaN ("shared" rows) are used. The cosine of the two restricted
    vectors is mapped from [-1, 1] to [0, 1] via ``(1 + cos) / 2``.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, n_columns)
        Numeric data with NaN marking missing entries.
    min_shared : int, default 3
        Minimum number of shared rows a pair needs; pairs with fewer shared
        rows get NaN. This also applies to the diagonal.

    Returns
    -------
    numpy.ndarray of shape (n_columns, n_columns)
        Symmetric matrix of transformed similarities. An entry is NaN when
        the pair has fewer than ``min_shared`` shared rows or when either
        restricted vector has zero norm.

    Notes
    -----
    All pairs are evaluated at once with matrix products instead of a
    Python loop over every pair, so several dense
    ``n_columns x n_columns`` float arrays are held in memory at once.
    """
    values = np.asarray(matrix, dtype=np.float64)
    observed = ~np.isnan(values)
    mask = observed.astype(np.float64)
    filled = np.where(observed, values, 0.0)

    # shared_counts[i, j]: number of rows where both i and j are observed.
    shared_counts = mask.T @ mask
    # Zero-filled entries contribute nothing, so this is the dot product
    # restricted to the shared rows.
    dot_products = filled.T @ filled
    # sq_norms[i, j]: sum of squares of column i over the rows where column j
    # is observed, i.e. the squared norm of i restricted to the shared rows.
    sq_norms = (filled ** 2).T @ mask
    norm_products = np.sqrt(sq_norms) * np.sqrt(sq_norms).T

    # Zero-norm pairs are left as NaN instead of dividing by zero.
    similarity = np.full(norm_products.shape, np.nan)
    np.divide(dot_products, norm_products, out=similarity, where=norm_products != 0)

    transformed = (1 + similarity) / 2
    transformed[shared_counts < min_shared] = np.nan
    return transformed

# Similarity between every pair of movie columns of the centered ratings.
cosine_similarity_matrix = compute_cosine_similarity(Rmat_normalized)

# Re-attach the movie labels on both axes before persisting the result.
cosine_similarity_df = pd.DataFrame(
    data=cosine_similarity_matrix, index=Rmat_columns, columns=Rmat_columns
)
cosine_similarity_df.to_csv("cosine_similarity_matrix_corrected.csv")

In [None]:
# Inspect the labelled movie-by-movie similarity matrix.
cosine_similarity_df

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1.0000000,0.5121055,0.3919999,0.7296371,0.4052488,0.3443622,0.1934793,0.2920968,0.2757620,0.4342140,...,0.5256346,0.1678860,0.4382444,0.2044081,0.5517557,0.6834218,0.2906526,0.5140432,0.3837718,0.4145054
m10,0.5121055,1.0000000,0.5474583,0.4904717,,0.6109830,0.4237425,0.4606591,0.6576989,0.5495395,...,0.2617006,0.4658628,0.4480788,0.3857350,,0.4544643,0.5475044,0.6687327,0.4482895,0.6008116
m100,0.3919999,0.5474583,1.0000000,0.4829650,,0.8365839,0.6295382,0.5682818,0.8118070,0.4885245,...,0.4107531,0.6426157,0.4936404,0.1936714,0.8028437,0.3067432,0.6293738,0.2695757,0.4789227,0.6128149
m1000,0.7296371,0.4904717,0.4829650,1.0000000,,0.1807649,,,,0.7052228,...,,,0.2073925,0.9015211,,0.2260270,0.6684361,,0.7253362,0.6805737
m1002,0.4052488,,,,1.0000000,,,,,,...,,,,,,0.7227661,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,0.6834218,0.4544643,0.3067432,0.2260270,0.7227661,0.2517379,0.2271863,0.1402858,0.2490622,0.2743969,...,0.4011803,0.1486861,0.4705176,0.1928585,0.5397140,1.0000000,0.2155611,0.4490137,0.3078245,0.3985167
m996,0.2906526,0.5475044,0.6293738,0.6684361,,0.7908892,0.7119653,0.6911337,0.8060751,0.6216948,...,0.6181369,0.7796494,0.4780710,0.7975184,,0.2155611,1.0000000,0.0771135,0.5563784,0.6225577
m997,0.5140432,0.6687327,0.2695757,,,0.3660229,0.9327237,0.9492277,0.2144257,0.2100087,...,0.2157111,0.8661206,0.4162218,,0.4120181,0.4490137,0.0771135,1.0000000,0.6426355,0.4606457
m998,0.3837718,0.4482895,0.4789227,0.7253362,,0.4450076,0.8437724,0.6048153,0.3545709,0.5041464,...,,0.6983913,0.6629043,0.8523279,,0.3078245,0.5563784,0.6426355,1.0000000,0.6427270


In [None]:
# Write the similarity matrix to "Smat.csv", the file the following steps read back.
# (The same frame is also saved as "cosine_similarity_matrix_corrected.csv" in the cell that computes it.)
cosine_similarity_df.to_csv("Smat.csv")

## 3. Save Smat with top 30 information

In [None]:
# Reload the saved similarity matrix; the first CSV column holds the row (movie) labels.
Smat = pd.read_csv('Smat.csv', index_col=0)
Smat.head()

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,1.0,0.5121055,0.3919999,0.7296371,0.4052488,0.3443622,0.1934793,0.2920968,0.275762,0.434214,...,0.5256346,0.167886,0.4382444,0.2044081,0.5517557,0.6834218,0.2906526,0.5140432,0.3837718,0.4145054
m10,0.5121055,1.0,0.5474583,0.4904717,,0.610983,0.4237425,0.4606591,0.6576989,0.5495395,...,0.2617006,0.4658628,0.4480788,0.385735,,0.4544643,0.5475044,0.6687327,0.4482895,0.6008116
m100,0.3919999,0.5474583,1.0,0.482965,,0.8365839,0.6295382,0.5682818,0.811807,0.4885245,...,0.4107531,0.6426157,0.4936404,0.1936714,0.8028437,0.3067432,0.6293738,0.2695757,0.4789227,0.6128149
m1000,0.7296371,0.4904717,0.482965,1.0,,0.1807649,,,,0.7052228,...,,,0.2073925,0.9015211,,0.226027,0.6684361,,0.7253362,0.6805737
m1002,0.4052488,,,,1.0,,,,,,...,,,,,,0.7227661,,,,


In [None]:
# Blank out the diagonal so a movie is never treated as its own neighbour.
# Work on an explicit writable copy: writing through `Smat.values` only works
# while that array is a writable view of the frame, and it is read-only when
# pandas Copy-on-Write is active.
smat_values = Smat.to_numpy(copy=True)
np.fill_diagonal(smat_values, np.nan)
Smat = pd.DataFrame(smat_values, index=Smat.index, columns=Smat.columns)
Smat.head()

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,0.5121055,0.3919999,0.7296371,0.4052488,0.3443622,0.1934793,0.2920968,0.275762,0.434214,...,0.5256346,0.167886,0.4382444,0.2044081,0.5517557,0.6834218,0.2906526,0.5140432,0.3837718,0.4145054
m10,0.5121055,,0.5474583,0.4904717,,0.610983,0.4237425,0.4606591,0.6576989,0.5495395,...,0.2617006,0.4658628,0.4480788,0.385735,,0.4544643,0.5475044,0.6687327,0.4482895,0.6008116
m100,0.3919999,0.5474583,,0.482965,,0.8365839,0.6295382,0.5682818,0.811807,0.4885245,...,0.4107531,0.6426157,0.4936404,0.1936714,0.8028437,0.3067432,0.6293738,0.2695757,0.4789227,0.6128149
m1000,0.7296371,0.4904717,0.482965,,,0.1807649,,,,0.7052228,...,,,0.2073925,0.9015211,,0.226027,0.6684361,,0.7253362,0.6805737
m1002,0.4052488,,,,,,,,,,...,,,,,,0.7227661,,,,


In [None]:
def top_k_upper(matrix, k=30):
    """Keep the ``k`` largest upper-triangle values of every row.

    For row ``i`` only the columns to the right of the diagonal
    (positions ``i + 1`` onward) are considered. Among the non-NaN values
    found there, the ``k`` largest are copied to the result; every other
    cell of the result, including the diagonal and the whole lower
    triangle, is NaN.

    NOTE(review): because columns at or left of the diagonal are never
    ranked, this is a per-row top-k over the upper triangle only, not over
    the full row. Confirm that this is the intended neighbourhood definition.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Numeric matrix; NaN marks missing values.
    k : int, default 30
        Maximum number of values kept per row. Ties at the cut-off are
        resolved in favour of the left-most column.

    Returns
    -------
    pandas.DataFrame
        New float frame with the same labels as ``matrix``; the input is
        not modified.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    values = matrix.to_numpy(dtype=float)
    filtered = np.full(values.shape, np.nan)

    for i in range(values.shape[0]):
        tail = values[i, i + 1:]
        candidates = np.flatnonzero(~np.isnan(tail))
        if candidates.size == 0:
            continue
        # Stable descending order; only the winners are written, since every
        # other cell already holds NaN.
        order = np.argsort(-tail[candidates], kind="stable")[:k]
        keep = candidates[order] + i + 1
        filtered[i, keep] = values[i, keep]

    return pd.DataFrame(filtered, index=matrix.index, columns=matrix.columns)


# Filter the similarity matrix row by row; only cells right of the diagonal
# can be non-NaN in the result (see `top_k_upper`).
Smat_top30 = top_k_upper(Smat)
# Optional quick look at the filtered matrix:
#print(Smat_top30.head())

In [None]:
# Take a writable copy: `Smat_top30.values` may be a read-only view (it is
# under pandas Copy-on-Write), in which case the in-place write below fails.
matrix_array = Smat_top30.to_numpy(copy=True)

# Mirror the strict upper triangle onto the lower triangle. The right-hand
# side is materialised by fancy indexing before the write happens.
# NOTE(review): `top_k_upper` only ranks the columns right of the diagonal,
# so after mirroring a row holds its own upper-triangle picks plus whatever
# earlier rows picked for it, not that movie's own top 30 over the full row.
# Confirm this is the intended neighbourhood definition.
i_upper = np.triu_indices_from(matrix_array, k=1)
matrix_array.T[i_upper] = matrix_array[i_upper]

Smat_top30 = pd.DataFrame(matrix_array, index=Smat_top30.index, columns=Smat_top30.columns)

print(Smat_top30.head())

       m1  m10  m100  m1000  m1002  m1003  m1004  m1005  m1006  m1007  ...  \
m1    NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m10   NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m100  NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m1000 NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   
m1002 NaN  NaN   NaN    NaN    NaN    NaN    NaN    NaN    NaN    NaN  ...   

       m99  m990  m991  m992  m993  m994  m996  m997  m998  m999  
m1     NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
m10    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
m100   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
m1000  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
m1002  NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  

[5 rows x 3706 columns]


In [None]:
# Persist the filtered similarity matrix; the recommender cell reads it back from this file.
Smat_top30.to_csv("Smat_top30.csv")
Smat_top30

Unnamed: 0,m1,m10,m100,m1000,m1002,m1003,m1004,m1005,m1006,m1007,...,m99,m990,m991,m992,m993,m994,m996,m997,m998,m999
m1,,,,,,,,,,,...,,,,,,,,,,
m10,,,,,,,,,,,...,,,,,,,,,,
m100,,,,,,,,,,,...,,,,,,,,,,
m1000,,,,,,,,,,,...,,,,,,,,,,
m1002,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m994,,,,,,,,,,,...,0.4011803,0.1486861,0.4705176,0.1928585,0.5397140,,0.2155611,0.4490137,0.3078245,0.3985167
m996,,,,,,,,,,,...,0.6181369,0.7796494,0.4780710,0.7975184,,0.2155611,,0.0771135,0.5563784,0.6225577
m997,,,,,,,,,,,...,0.2157111,0.8661206,0.4162218,,0.4120181,0.4490137,0.0771135,,0.6426355,0.4606457
m998,,,,,,,,,,,...,,0.6983913,0.6629043,0.8523279,,0.3078245,0.5563784,0.6426355,,0.6427270


#### Display the pairwise similarity values from the 𝑆 matrix (you obtained at Step 2) for the specified movies:  
“m1”, “m10”, “m100”, “m1510”, “m260”, “m3212”. Please round the results to 7 decimal places.

In [None]:
import pandas as pd
import numpy as np

# Start again from the full similarity matrix saved at step 2.
Smat = pd.read_csv("Smat.csv", index_col=0)

# Blank the diagonal on an explicit writable copy; writing through
# `Smat.values` fails when that array is read-only, as it is under pandas
# Copy-on-Write.
smat_values = Smat.to_numpy(copy=True)
np.fill_diagonal(smat_values, np.nan)
Smat = pd.DataFrame(smat_values, index=Smat.index, columns=Smat.columns)

# Pairwise similarities among the requested movies, rounded to 7 decimals.
target_movies = ["m1", "m10", "m100", "m1510", "m260", "m3212"]
filtered_similarities = Smat.loc[target_movies, target_movies]
target_movies_similarities = filtered_similarities.round(7)

target_movies_similarities.to_csv("target_movies_similarities.csv")

In [None]:
# Show the rounded pairwise similarities for the selected movies.
target_movies_similarities

Unnamed: 0,m1,m10,m100,m1510,m260,m3212
m1,,0.5121055,0.3919999,,0.7411482,
m10,0.5121055,,0.5474583,,0.5343338,
m100,0.3919999,0.5474583,,,0.3296943,
m1510,,,,,,
m260,0.7411482,0.5343338,0.3296943,,,
m3212,,,,,,


## 4. Create a function `myIBCF`

In [None]:
import numpy as np
import pandas as pd

def myIBCF(newuser, similarity_matrix, R, top_k=10):
    """Recommend up to ``top_k`` movies with item-based collaborative filtering.

    For every movie ``i`` the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings::

        pred[i] = sum_j S[i, j] * w[j] / sum_j S[i, j]

    where ``j`` runs over the rated movies whose similarity ``S[i, j]`` is
    not NaN. Movies with no such neighbour, or whose weights sum to zero,
    get no prediction.

    Parameters
    ----------
    newuser : pandas.Series
        Ratings indexed by movie label; NaN (or a missing label) means
        "not rated".
    similarity_matrix : pandas.DataFrame
        Movie-by-movie similarities; row ``i`` holds the weights used to
        predict movie ``i``. NaN entries are ignored.
    R : pandas.DataFrame
        Rating matrix whose columns are movie labels; only used to rank
        movies by number of ratings when predictions run out.
    top_k : int, default 10
        Number of recommendations requested.

    Returns
    -------
    list
        Movie labels, best prediction first. If fewer than ``top_k`` movies
        have a prediction, the list is topped up with the most-rated movies
        of ``R`` that the user has not rated and that are not already
        recommended; it can still be shorter than ``top_k`` if ``R`` runs out.

    Notes
    -----
    Ties are broken deterministically, in the order of
    ``similarity_matrix.index`` (predictions) or ``R.columns`` (popularity).
    Ratings for movies without a column in ``similarity_matrix`` cannot act
    as neighbours and are ignored for prediction, but those movies are still
    never recommended.
    """
    rated = newuser.dropna()
    usable = rated[rated.index.isin(similarity_matrix.columns)]

    # One weight column per usable rated movie; NaN similarities become zero
    # weights so they drop out of both sums.
    weights = similarity_matrix.loc[:, usable.index].to_numpy(dtype=float)
    weights = np.where(np.isnan(weights), 0.0, weights)
    numerator = weights @ usable.to_numpy(dtype=float)
    denominator = weights.sum(axis=1)

    # Rows without neighbours (or with zero total weight) stay NaN.
    scores = np.full(len(similarity_matrix.index), np.nan)
    np.divide(numerator, denominator, out=scores, where=denominator != 0)

    pred = pd.Series(scores, index=similarity_matrix.index)
    # Never recommend something the user has already rated.
    pred[pred.index.isin(rated.index)] = np.nan

    ranked = pred.dropna().sort_values(ascending=False, kind="stable")
    recommendations = ranked.head(top_k).index.tolist()

    if len(recommendations) < top_k:
        by_popularity = (
            R.notna().sum(axis=0).sort_values(ascending=False, kind="stable").index
        )
        excluded = set(rated.index).union(recommendations)
        fillers = [movie for movie in by_popularity if movie not in excluded]
        recommendations += fillers[: top_k - len(recommendations)]

    return recommendations

# Inputs saved by the earlier steps: the rating matrix and the filtered similarities.
R = pd.read_csv("Rmat.csv", index_col=0)
Smat_top30 = pd.read_csv("Smat_top30.csv", index_col=0)

# Case 1: an existing user taken straight from the rating matrix.
user_ratings = R.loc["u1181"]
recomm_user1 = myIBCF(user_ratings, Smat_top30, R)
print(recomm_user1)

# Case 2: a hypothetical user who has rated only two movies.
new_user = pd.Series(np.nan, index=R.columns, dtype=float)
for movie_id, rating in {"m1613": 5, "m1755": 4}.items():
    new_user[movie_id] = rating
recomm_new = myIBCF(new_user, Smat_top30, R)
print(recomm_new)

['m749', 'm3899', 'm2554', 'm1235', 'm1253', 'm2793', 'm3288', 'm2082', 'm1995', 'm1039']
['m1017', 'm765', 'm74', 'm691', 'm2805', 'm3726', 'm2531', 'm2558', 'm2771', 'm2846']
