In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("hadeeniyaf/movies")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/movies


In [2]:
import pandas as pd

# Load the datasets
movies = pd.read_csv("/kaggle/input/movies/movies.csv",nrows=100000)
ratings = pd.read_csv("/kaggle/input/movies/ratings.csv",nrows=100000)

# Display first few rows
print(movies.head())
print(ratings.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510


**Transforming data**

Popularity-Based Recommendation (Top Rated & Most Rated)

In [3]:
# Calculate average rating for each movie
movie_avg_rating = ratings.groupby('movieId')['rating'].mean().reset_index()
movie_avg_rating.columns = ['movieId', 'avg_rating']

# Count number of ratings per movie
movie_ratings_count = ratings.groupby('movieId').count()['rating'].reset_index()
movie_ratings_count.columns = ['movieId', 'num_ratings']
# Merge with movies
movies = movies.merge(movie_avg_rating, on='movieId')
movies = movies.merge(movie_ratings_count, on='movieId')
# Get top 10 popular movies
top_movies = movies.sort_values(by=['num_ratings', 'avg_rating'], ascending=[False, False])
print(top_movies[['title', 'num_ratings', 'avg_rating']].head(10))

                                          title  num_ratings  avg_rating
314                         Forrest Gump (1994)          370    4.072973
259                         Pulp Fiction (1994)          363    4.162534
279            Shawshank Redemption, The (1994)          352    4.414773
510            Silence of the Lambs, The (1991)          325    4.189231
1911                         Matrix, The (1999)          315    4.169841
417                        Jurassic Park (1993)          300    3.705000
228   Star Wars: Episode IV - A New Hope (1977)          292    4.183219
459                     Schindler's List (1993)          280    4.339286
99                            Braveheart (1995)          267    4.080524
507           Terminator 2: Judgment Day (1991)          255    3.972549


installing package if needed

In [4]:
!pip install scikit-surprise



**SVD Model training** (collaborative filtering)

In [5]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate

# Prepare data
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

# Train model
svd = SVD()
svd.fit(trainset)

# Make prediction for a user
pred = svd.predict(uid=1, iid=50)
print(f"Predicted Rating: {pred.est}")


Predicted Rating: 4.351308029870295


**Applying TF-IDF and cosine similarities for Genre**

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert genres back to strings
movies['genres_str'] = movies['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else 'Unknown')

# Compute TF-IDF for genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres_str'])

# Compute similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


**Recommandation Function**

In [7]:
def get_recommendations(user_id, top_n=10):
    # Get all unique movieIds
    all_movie_ids = movies["movieId"].unique()

    # Predict rating for each movie for the given user
    predictions = []
    for movie_id in all_movie_ids:
        prediction = svd.predict(user_id, movie_id)
        predictions.append((movie_id, prediction.est))

    # Sort movies based on predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Convert predictions to DataFrame
    recommended_movies = pd.DataFrame(predictions[:top_n], columns=['movieId', 'predicted_rating'])

    # Merge with movie titles
    recommended_movies = recommended_movies.merge(movies, on='movieId', how='left')

    return recommended_movies


**Genrating Recommendation for all user and creating a dataframe**

In [8]:
# Generate Recommendations for ALL USERS
all_recommendations = []

# Loop through every user in the dataset
for user_id in ratings['userId'].unique():
    # Get recommendations for each user
    recommendations = get_recommendations(user_id, top_n=10)
    recommendations['userId'] = user_id
    all_recommendations.append(recommendations)

# Combine all user recommendations
all_recommendations_df = pd.concat(all_recommendations, ignore_index=True)

# Save to CSV
all_recommendations_df.to_csv('/content/recommendations_all_users.csv', index=False)

print("All user recommendations CSV generated successfully!")


All user recommendations CSV generated successfully!


For presenting the recommendation system use visualizing tools like power Bi or tableau by downloading the csv file. Visualizing in these tools make the recommendation system much more attractive easier.
