In [1]:
import pandas as pd
df=pd.read_csv("movie.csv")
print(df.head())


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [2]:
# Check for missing values
print(df.isnull().sum())


movieId    0
title      0
genres     0
dtype: int64


In [3]:
df_cleaned = df.dropna()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))

tfidf_matrix = vectorizer.fit_transform(df['genres'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

print(tfidf_df)

       (no genres listed)  action  adventure  animation  children    comedy  \
0                     0.0     0.0   0.419151   0.518269  0.505742  0.264175   
1                     0.0     0.0   0.515311   0.000000  0.621768  0.000000   
2                     0.0     0.0   0.000000   0.000000  0.000000  0.602583   
3                     0.0     0.0   0.000000   0.000000  0.000000  0.544541   
4                     0.0     0.0   0.000000   0.000000  0.000000  1.000000   
...                   ...     ...        ...        ...       ...       ...   
27273                 0.0     0.0   0.000000   0.000000  0.000000  1.000000   
27274                 0.0     0.0   0.000000   0.000000  0.000000  1.000000   
27275                 0.0     0.0   1.000000   0.000000  0.000000  0.000000   
27276                 1.0     0.0   0.000000   0.000000  0.000000  0.000000   
27277                 0.0     0.0   0.555125   0.000000  0.000000  0.000000   

       crime  documentary     drama   fantasy  film



In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarity based on TF-IDF features
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [7]:
def recommend_films(movie_id, df, cosine_sim):
    if movie_id not in df['movieId'].values:
        print(f"movie '{movie_id}' not found in the dataset.")
        return

    # Get the index of the m
    index = df[df['movieId'] == movie_id].index[0]

    # Get similarity scores for that film
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort the films based on similarity score (excluding self)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    similar_films = [df.iloc[i[0]]['movieId'] for i in sim_scores[1:6]]

    print(f"\nTop similar films to '{movie_id}':")
    for movie_id in similar_films:
        movie_title = df[df['movieId'] == movie_id]['title'].values[0]  
        print(f"✅ {movie_id} - {movie_title}")


recommend_films(19, df_cleaned, cosine_sim)


Top similar films to '19':
✅ 18 - Four Rooms (1995)
✅ 19 - Ace Ventura: When Nature Calls (1995)
✅ 65 - Bio-Dome (1996)
✅ 69 - Friday (1995)
✅ 88 - Black Sheep (1996)
