In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load ratings.csv data
ratings_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\ratings.csv")

In [2]:

# Filter movies rated above 4 for each user and concatenate the DataFrames
high_rated_movies = pd.concat([group for _, group in ratings_df[ratings_df['rating'] > 4].groupby('userId')])

print(high_rated_movies.head())

    userId  movieId  rating   timestamp
0        1      296     5.0  1147880044
2        1      307     5.0  1147868828
3        1      665     5.0  1147878820
8        1     1237     5.0  1147868839
16       1     2351     4.5  1147877957


In [3]:
print(len(high_rated_movies))

5813013


In [4]:
# Filter movies rated 3.5 for each user and count the occurrences
count_4_5_ratings = high_rated_movies[high_rated_movies['rating'] == 4.5].groupby('userId').size()

print("Number of 4.5 rated movies for each user:")
print(count_4_5_ratings)

Number of 4.5 rated movies for each user:
userId
1          7
2         29
3         37
4         36
10         7
          ..
162534    37
162536    13
162538    30
162540    21
162541    27
Length: 96195, dtype: int64


In [5]:
# Calculate the total number of ratings
total_ratings = ratings_df.shape[0]

# Count the number of ratings for each movie ID
movie_ratings_count = ratings_df['movieId'].value_counts()

# Find movie IDs with fewer than 20% of the total number of ratings
threshold = 0.0001 * total_ratings
less_rated_movies = movie_ratings_count[movie_ratings_count < threshold].index

# Display the movie IDs
print("Movie IDs with fewer than 20% of the total number of ratings:")
print(len(less_rated_movies))

Movie IDs with fewer than 20% of the total number of ratings:
56935


In [6]:
# Calculate the total number of unique movie IDs in the original dataset
total_movie_ids = ratings_df['movieId'].nunique()

# Display the total number of movie IDs
print("Total number of unique movie IDs in the original dataset:", total_movie_ids)


Total number of unique movie IDs in the original dataset: 59047


In [7]:
# Filter out rows with movie IDs in less_rated_movies
high_rated_movies = high_rated_movies[~high_rated_movies['movieId'].isin(less_rated_movies)]

# Display the updated DataFrame
print("DataFrame after removing movie IDs with fewer than 0.1% of the total number of ratings:")
print(len(high_rated_movies))


DataFrame after removing movie IDs with fewer than 0.1% of the total number of ratings:
4933754


In [8]:
total_movie_ids2 = high_rated_movies['movieId'].nunique()
total_movie_ids2

2112

In [9]:
import numpy as np

# Create a pivot table with user IDs as rows and movie IDs as columns, and fill it with ratings
pivot_table = high_rated_movies.pivot_table(index='userId', columns='movieId', values='rating', fill_value=np.nan)

# Replace NaN values with 0
pivot_table.fillna(0, inplace=True)

# Function to convert ratings to the desired format
def convert_ratings(rating):
    if rating == 5:
        return 2
    elif rating == 4 or rating == 4.5:
        return 1
    else:
        return 0

# Apply the function to all elements of the pivot table
vectorized_ratings = pivot_table.applymap(convert_ratings)

# Convert the pivot table to a DataFrame
vectorized_ratings_df = pd.DataFrame(vectorized_ratings)

# Display the resulting DataFrame
print("Vectorized ratings for each user:")
print(vectorized_ratings_df.head())



Vectorized ratings for each user:
movieId  1       2       3       4       5       6       7       9       \
userId                                                                    
1             0       0       0       0       0       0       0       0   
2             0       0       0       0       0       0       0       0   
3             0       0       0       0       0       0       0       0   
4             0       0       0       0       0       0       0       0   
5             0       0       0       0       0       0       0       0   

movieId  10      11      ...  168252  171763  174055  176371  177593  177765  \
userId                   ...                                                   
1             0       0  ...       0       0       0       0       0       0   
2             0       0  ...       0       0       0       0       0       0   
3             0       0  ...       0       0       0       1       0       0   
4             0       0  ...       0    

In [11]:
vectorized_ratings_df.shape

(160220, 2112)

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you have the vectorized_ratings_df DataFrame with the shape (160220, 2112)

# Function to recommend movies for a given user
def recommend_movies(user_id, vectorized_ratings_df, n_recommendations=10):
    # Get the vector for the given user
    user_vector = vectorized_ratings_df.loc[user_id].values.reshape(1, -1)

    # Calculate cosine similarity between the user's vector and all other users' vectors
    similarities = cosine_similarity(user_vector, vectorized_ratings_df.values)

    # Find the most similar user
    most_similar_user_idx = np.argsort(similarities[0])[-2]  # Exclude the target user itself

    # Get movies watched by the most similar user but not by the target user
    similar_user_movies = vectorized_ratings_df.iloc[most_similar_user_idx]
    target_user_movies = vectorized_ratings_df.loc[user_id]
    new_movies = similar_user_movies[similar_user_movies != target_user_movies].index.tolist()

    # If there are fewer than n_recommendations different movies, get recommendations from the next most similar user
    if len(new_movies) < n_recommendations:
        second_most_similar_user_idx = np.argsort(similarities[0])[-3]  # Exclude the target user and the most similar user
        second_similar_user_movies = vectorized_ratings_df.iloc[second_most_similar_user_idx]
        additional_movies = second_similar_user_movies[~second_similar_user_movies.index.isin(new_movies)].index.tolist()
        new_movies.extend(additional_movies[:n_recommendations - len(new_movies)])

    # Return up to n_recommendations movies
    return new_movies[:n_recommendations]

# Example usage
user_id = input("Enter your user ID: ")  # Example user ID
user_id = int(user_id)  # Convert to integer if necessary
recommended_movies = recommend_movies(user_id, vectorized_ratings_df)
print("Recommended movies for User", user_id, ":")
print(recommended_movies)


Recommended movies for User 4 :
[541, 1136, 1197, 1580, 1732, 2028, 4963, 4993, 5110, 5618]


In [13]:
import pandas as pd

# Load movies.csv
movies_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\movies.csv")  # Replace "path_to_movies.csv" with the actual path

# Function to recommend movies for a given user
def recommend_movies(user_id, vectorized_ratings_df, movies_df, n_recommendations=10):
    # Assume vectorized_ratings_df is already defined and contains the user vectors

    # Get the vector for the given user
    user_vector = vectorized_ratings_df.loc[user_id].values.reshape(1, -1)

    # Calculate cosine similarity between the user's vector and all other users' vectors
    similarities = cosine_similarity(user_vector, vectorized_ratings_df.values)

    # Find the most similar user
    most_similar_user_idx = np.argsort(similarities[0])[-2]  # Exclude the target user itself

    # Get movies watched by the most similar user but not by the target user
    similar_user_movies = vectorized_ratings_df.iloc[most_similar_user_idx]
    target_user_movies = vectorized_ratings_df.loc[user_id]
    new_movies = similar_user_movies[similar_user_movies != target_user_movies].index.tolist()

    # If there are fewer than n_recommendations different movies, get recommendations from the next most similar user
    if len(new_movies) < n_recommendations:
        second_most_similar_user_idx = np.argsort(similarities[0])[-3]  # Exclude the target user and the most similar user
        second_similar_user_movies = vectorized_ratings_df.iloc[second_most_similar_user_idx]
        additional_movies = second_similar_user_movies[~second_similar_user_movies.index.isin(new_movies)].index.tolist()
        new_movies.extend(additional_movies[:n_recommendations - len(new_movies)])

    # Map movie IDs to movie names
    recommended_movies = movies_df[movies_df['movieId'].isin(new_movies)][['movieId', 'title']]

    # Return up to n_recommendations movies
    return recommended_movies

# Example usage
user_id = input("Enter your user ID: ")  # Example user ID
user_id = int(user_id)  # Convert to integer if necessary
recommended_movies = recommend_movies(user_id, vectorized_ratings_df, movies_df)
print("Recommended movies for User", user_id, ":")
print(recommended_movies)


Recommended movies for User 100 :
      movieId                               title
0           1                    Toy Story (1995)
1           2                      Jumanji (1995)
2           3             Grumpier Old Men (1995)
3           4            Waiting to Exhale (1995)
4           5  Father of the Bride Part II (1995)
699       714                     Dead Man (1995)
1201     1233        Boot, Das (Boat, The) (1981)
1319     1354           Breaking the Waves (1996)
1323     1358                  Sling Blade (1996)
1424     1466                Donnie Brasco (1997)
