#Section 1:  Recommendation Based on Popularity

In [None]:
import pandas as pd
import numpy as np


movies_cols = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('ml-1m/movies.dat', sep='::', names=movies_cols, encoding='latin-1', engine='python')

rating_matrix = pd.read_csv('Rmat.csv')

rating_matrix.head(), rating_matrix.info()

rating_stats = {
    "num_users": rating_matrix.shape[0],
    "num_movies": rating_matrix.shape[1],
    "num_ratings": rating_matrix.count().sum(),
    "rating_density": rating_matrix.count().sum() / (rating_matrix.shape[0] * rating_matrix.shape[1])
}


ratings_filled = rating_matrix.fillna(0)

movie_rating_counts = ratings_filled.astype(bool).sum(axis=0)

movie_avg_ratings = rating_matrix.mean(axis=0, skipna=True)

popularity_df = pd.DataFrame({
    "MovieID": rating_matrix.columns,
    "RatingCount": movie_rating_counts,
    "AverageRating": movie_avg_ratings
}).sort_values(by=["RatingCount", "AverageRating"], ascending=[False, False])


popularity_df['MovieID'] = popularity_df['MovieID'].str.lstrip('m')
popularity_df['MovieID'] = popularity_df['MovieID'].astype(str)
movies['MovieID'] = movies['MovieID'].astype(str)
top_10_popular_movies = popularity_df.head(10)



top_10_popular_movies = popularity_df.head(10)
top_10_with_titles = pd.merge(top_10_popular_movies, movies, on='MovieID', how='left')

for _, row in top_10_with_titles.iterrows():
    print(f"Title: {row['Title']} (Rating: {row['AverageRating']:.2f}, Count: {row['RatingCount']})")
    display(Image(filename=f"./MovieImages/{row['MovieID']}.jpg"))


<class 'pandas.core.frame.DataFrame'>
Index: 6040 entries, u1 to u999
Columns: 3706 entries, m1 to m999
dtypes: float64(3706)
memory usage: 170.8+ MB
Title: American Beauty (1999) (Rating: 4.32, Count: 3428)


TypeError: 'module' object is not callable

In [3]:
import pandas as pd
import numpy as np

def normalize_ratings(R):
    row_means = R.mean(axis=1, skipna=True)
    R_normalized = R.sub(row_means, axis=0)
    return R_normalized, row_means


def compute_cosine_similarity(R_normalized):
    R_normalized = np.array(R_normalized)

    num_movies = R_normalized.shape[1]
    S = np.full((num_movies, num_movies), np.nan)  

    for i in range(num_movies):
        for j in range(i + 1, num_movies):  

            common_users = ~np.isnan(R_normalized[:, i]) & ~np.isnan(R_normalized[:, j])
            
            if np.sum(common_users) > 2:  
                R_i = R_normalized[common_users, i]
                R_j = R_normalized[common_users, j]
                numerator = np.sum(R_i * R_j)
                denominator = np.sqrt(np.sum(R_i**2)) * np.sqrt(np.sum(R_j**2))
                
                if denominator > 0:
                    similarity = 0.5 + 0.5 * (numerator / denominator)  
                    S[i, j] = similarity
                    S[j, i] = similarity  
                    
    return S



def adjust_similarity_matrix(S, top_k=30):
    S_copy = S.copy()

    for i, row in S_copy.iterrows():
        top_indices = row.dropna().nlargest(top_k).index
        S_copy.loc[i, ~S_copy.columns.isin(top_indices)] = np.nan

    return S_copy


In [4]:
R = pd.read_csv('Rmat.csv')  
R_normalized, row_means = normalize_ratings(R)  # Normalize ratings
#print(R_normalized)

In [5]:
S = compute_cosine_similarity(R_normalized) 
S_df = pd.DataFrame(S, columns=R_normalized.columns, index=R_normalized.columns)
#print(S_df)

In [6]:

movie_ids = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']

similarity_dict = {movie: [None] * len(movie_ids) for movie in movie_ids}

for i in range(len(movie_ids)):
    for j in range(i+1, len(movie_ids)): 
        movie_i = movie_ids[i]
        movie_j = movie_ids[j]
        
        similarity_value = S_df.loc[movie_i, movie_j]
        
     
        similarity_dict[movie_i][j] = similarity_value
        similarity_dict[movie_j][i] = similarity_value  

similarity_matrix = pd.DataFrame(similarity_dict, index=movie_ids, columns=movie_ids)

similarity_matrix_rounded = similarity_matrix.round(7)

pd.set_option('display.float_format', '{:,.7f}'.format)

# Display the similarity matrix
print(similarity_matrix_rounded)

             m1       m10      m100  m1510      m260  m3212
m1          NaN 0.5121055 0.3919999    NaN 0.7411482    NaN
m10   0.5121055       NaN 0.5474583    NaN 0.5343338    NaN
m100  0.3919999 0.5474583       NaN    NaN 0.3296943    NaN
m1510       NaN       NaN       NaN    NaN       NaN    NaN
m260  0.7411482 0.5343338 0.3296943    NaN       NaN    NaN
m3212       NaN       NaN       NaN    NaN       NaN    NaN


In [None]:
S_adjusted = adjust_similarity_matrix(S_df) 
# print(S_adjusted.head(10))
S_adjusted.to_csv('adjusted_similarity_matrix.csv')

In [8]:
def myIBCF(newuser, similarity_matrix, rating_matrix, popularity_df, top_k=10):
    newuser = np.array(newuser)
    
    if similarity_matrix.shape[0] != rating_matrix.shape[1]:
        raise ValueError("Similarity matrix and rating matrix dimensions do not align.")
    
    predictions = np.full(len(newuser), np.nan)
    

    for i in range(len(newuser)):
        if np.isnan(newuser[i]):
            similarities = similarity_matrix.iloc[i, :]
            rated_movies_indices = np.where(~np.isnan(newuser))[0]
            valid_similarities = similarities.iloc[rated_movies_indices]
            rated_movie_ratings = newuser[rated_movies_indices]
            
            non_na_mask = ~valid_similarities.isna()
            valid_similarities = valid_similarities[non_na_mask]
            rated_movie_ratings = rated_movie_ratings[non_na_mask]
            
            weighted_sum = np.sum(valid_similarities * rated_movie_ratings)
            similarity_sum = np.sum(np.abs(valid_similarities))
#             print(f"Similarities for movie {i}: {valid_similarities}")
#             print(f"Weighted sum for movie {i}: {weighted_sum}")

            if similarity_sum > 0:
                predictions[i] = weighted_sum / similarity_sum
                #print(f"Movie {i}: Weighted Sum = {weighted_sum}, Similarity Sum = {similarity_sum}, Predicted Rating = {predictions[i]}")
        
    movie_ids = rating_matrix.columns
    prediction_df = pd.DataFrame({
        "MovieID": movie_ids,
        "PredictedRating": predictions
    })
    
    already_rated_movies = set(np.where(~np.isnan(newuser))[0])
    prediction_df = prediction_df[~prediction_df.index.isin(already_rated_movies)]
    prediction_df = prediction_df.sort_values(
        by=["PredictedRating", "MovieID"],
        ascending=[False, True]
)

    recommended_movies = prediction_df.head(top_k).dropna(subset=["PredictedRating"])
    
    if len(recommended_movies) < top_k:
        remaining_movies = [
            movie for movie in popularity_df["MovieID"]
            if movie not in already_rated_movies
        ]
        remaining_df = pd.DataFrame({
            "MovieID": remaining_movies[:top_k - len(recommended_movies)],
            "PredictedRating": [None] * (top_k - len(recommended_movies))
        })
        recommended_movies = pd.concat([recommended_movies, remaining_df])

    return recommended_movies



def save_popularity_ranking(rating_matrix, output_file="popularity_ranking.csv"):

    movie_popularity = rating_matrix.notna().sum(axis=0)
    popularity_ranking = movie_popularity.sort_values(ascending=False).index.tolist()
    pd.DataFrame({"MovieID": popularity_ranking}).to_csv(output_file, index=False)
    return popularity_ranking


popularity_ranking = save_popularity_ranking(rating_matrix)


u1181_ratings = rating_matrix.loc['u1181'].values


hypothetical_user = np.full(R.shape[1], np.nan)
hypothetical_user[1613 - 1] = 5  
hypothetical_user[1755 - 1] = 4  

recommendations = myIBCF(u1181_ratings, S_adjusted, R, popularity_ranking)
print("Top 10 Recommendations for user1181:", recommendations.to_string(index=False))
recommendations = myIBCF(hypothetical_user, S_adjusted, R, popularity_ranking)
print("Top 10 Recommendations for hypothetical user:", recommendations.to_string(index=False))


Top 10 Recommendations for user1181: MovieID  PredictedRating
  m3732        5.0000000
   m749        4.5265592
  m3899        4.5260660
  m1039        4.0000000
  m1235        4.0000000
  m1253        4.0000000
  m1734        4.0000000
  m1914        4.0000000
  m2082        4.0000000
  m2361        4.0000000
Top 10 Recommendations for hypothetical user: MovieID  PredictedRating
  m1468        5.0000000
  m1518        5.0000000
  m2643        5.0000000
  m2688        5.0000000
  m2977        5.0000000
   m435        5.0000000
   m436        5.0000000
  m1003        5.0000000
   m117        5.0000000
  m1243        5.0000000
