# In [None]:
import pandas as pd
import numpy as np
import time

def load_data(input_file):
    """
    Read a tab-separated ratings file into a DataFrame.

    Arguments:
    ----------
    input_file : str or file-like
        Source with four unnamed columns per row:
        user id, movie id, rating, timestamp.

    Returns:
    --------
    pd.DataFrame
        Frame with the columns 'user_id', 'movie_id' and 'rating'
        (the timestamp column is not loaded).
    """
    ratings = pd.read_csv(
        input_file,
        sep='\t',
        engine='python',
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        usecols=[0, 1, 2],  # skip the trailing timestamp column
    )
    return ratings

def function_1(user_movie_matrix, movie_a, movie_b, user_threshold):
    """
    Find the users who rated both of two movies.

    Arguments:
    ----------
    user_movie_matrix : pd.DataFrame
        Ratings matrix indexed by movie ID with one column per user;
        missing ratings are NaN.
    movie_a : int
        ID of the first movie.
    movie_b : int
        ID of the second movie.
    user_threshold : int
        Minimum number of shared raters required.

    Returns:
    --------
    common_users : pd.Index or list
        Index of the users who rated both movies when there are at least
        `user_threshold` of them; otherwise an empty list.
    """
    # Users with a non-missing rating for each movie.
    raters_a = user_movie_matrix.loc[movie_a].dropna().index
    raters_b = user_movie_matrix.loc[movie_b].dropna().index
    shared = raters_a.intersection(raters_b)

    # Too few shared raters: signal "no usable overlap" with an empty list.
    if len(shared) < user_threshold:
        return []
    return shared

def function_2(ratings_a, ratings_b, mean_a, mean_b):
    """
    Compute the Adjusted Cosine Similarity between two movies.

    Arguments:
    ----------
    ratings_a : pd.Series
        Ratings of the first movie from the common users.
    ratings_b : pd.Series
        Ratings of the second movie from the common users.
    mean_a : float
        Mean rating subtracted from `ratings_a`.
    mean_b : float
        Mean rating subtracted from `ratings_b`.

    Returns:
    --------
    similarity_score : float
        Mean-centered cosine similarity of the two rating vectors, or 0
        when the denominator is zero.
    """
    # Center each rating vector on its supplied mean.
    centered_a = ratings_a - mean_a
    centered_b = ratings_b - mean_b

    dot_product = np.sum(centered_a * centered_b)
    norm_product = np.sqrt(np.sum(centered_a**2) * np.sum(centered_b**2))

    # A zero norm means at least one vector has no variance around its mean.
    if norm_product == 0:
        return 0
    return dot_product / norm_product

def compute_similarity(input_file, output_file, user_threshold=5):
    """
    Compute pairwise movie similarities and write them to a CSV file.

    Every unordered pair of movies that shares at least `user_threshold`
    raters (and at least one) is scored with the adjusted cosine similarity
    of the ratings given by those shared users, each centered on its mean
    over the shared users.

    Arguments:
    ----------
    input_file : str
        Path to the tab-separated ratings file read by `load_data`.
    output_file : str
        Path of the CSV file to write. It is only written when at least
        one movie pair qualifies.
    user_threshold : int
        Minimum number of common users required for a pair to be scored.

    Returns:
    --------
    None
        Results are written to `output_file` with the columns
        'movie_id_a', 'movie_id_b', 'similarity' and 'common_users';
        a status message is printed either way.

    Raises:
    -------
    ValueError
        Raised by `DataFrame.pivot` if the input contains more than one
        rating for the same (movie, user) combination.
    """
    df = load_data(input_file)

    # Rows are movies, columns are users; NaN marks a missing rating.
    user_movie_matrix = df.pivot(index='movie_id', columns='user_id', values='rating')

    similarities = []
    movie_ids = user_movie_matrix.index

    for movie_a in movie_ids:
        for movie_b in movie_ids:
            # Score each unordered pair exactly once.
            if movie_a >= movie_b:
                continue

            # function_1 returns an empty list when fewer than
            # `user_threshold` users rated both movies.
            common_users = function_1(user_movie_matrix, movie_a, movie_b, user_threshold)
            if len(common_users) == 0:
                continue

            ratings_a = user_movie_matrix.loc[movie_a, common_users]
            ratings_b = user_movie_matrix.loc[movie_b, common_users]

            # Center on the means over the common users only.
            similarity_score = function_2(
                ratings_a, ratings_b, ratings_a.mean(), ratings_b.mean()
            )

            similarities.append([movie_a, movie_b, similarity_score, len(common_users)])

    if similarities:
        print(f"Writing {len(similarities)} movie pairs to the output file.")
        similarity_df = pd.DataFrame(
            similarities,
            columns=['movie_id_a', 'movie_id_b', 'similarity', 'common_users'],
        )
        similarity_df.to_csv(output_file, index=False)
    else:
        print("No movie pairs with enough common users. Output file not created.")

if __name__ == "__main__":
    # Input ratings file and output CSV locations.
    input_file = r"C:\Users\Hayat\Documents\ml-100k\u.data"  # Path to the u.data file
    output_file = r"C:\Users\Hayat\Documents\movie_similarities.csv"  # Path for output

    # perf_counter() is monotonic, so the measured duration is not skewed
    # by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    compute_similarity(input_file, output_file, user_threshold=5)
    print(f"Movie similarities computed in {time.perf_counter() - start:.2f} seconds")
