## Recommder Model 
### Recommending movies based on genres and cast - members

In [49]:
import pandas as pd;
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
def load_movie_data():
    response = requests.get('http://localhost:3001/api/movies')
    movies_data = response.json()
    df = pd.DataFrame(movies_data)
    
    
    
    # this for the both the features , the actors and genres 
    # df['combined_features'] = df.apply(lambda x: ' '.join(x['genres'] + x['actors']),axis=1)
    return df

In [28]:

movies_d = load_movie_data()
movies_d.head()
# movies_d.describe()
movies_d.shape

(2415, 7)

### Cleaning the data 

In [29]:
df_cleaned = movies_d[movies_d['genres'].apply(len) > 0 & (movies_d['actors'].apply(len) > 0)]
df_cleaned.shape

(2368, 7)

### Verify Cleaning

In [30]:
print("Movies with empty genres:", sum(df_cleaned['genres'].apply(len) == 0))
print("Movies with empty actors:", sum(df_cleaned['actors'].apply(len) == 0))

print("\nGenres count distribution:")
print(df_cleaned['genres'].apply(len).describe())
print("\nActors count distribution:")
print(df_cleaned['actors'].apply(len).describe())

Movies with empty genres: 0
Movies with empty actors: 20

Genres count distribution:
count    2368.000000
mean        2.037162
std         0.894694
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         6.000000
Name: genres, dtype: float64

Actors count distribution:
count    2368.000000
mean       10.857686
std         9.027701
min         0.000000
25%         3.000000
50%         8.000000
75%        16.000000
max        59.000000
Name: actors, dtype: float64


In [33]:
print(df_cleaned.columns)
# df_cleaned.drop(columns='href')


Index(['title', 'year', 'thumbnail', 'href', 'extract', 'genres', 'actors'], dtype='object')


In [32]:
print(df_cleaned.columns)

Index(['title', 'year', 'thumbnail', 'href', 'extract', 'genres', 'actors'], dtype='object')


### Creating the combined feature column 

In [34]:
df_cleaned['combined_features'] = df_cleaned.apply(lambda x: ''.join(x['genres']+ x['actors']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['combined_features'] = df_cleaned.apply(lambda x: ''.join(x['genres']+ x['actors']),axis=1)


In [37]:
print(df_cleaned.columns.to_list)
print(df_cleaned.head())

<bound method IndexOpsMixin.tolist of Index(['title', 'year', 'thumbnail', 'href', 'extract', 'genres', 'actors',
       'combined_features'],
      dtype='object')>
  title  year                                          thumbnail  \
0     9  2009  https://upload.wikimedia.org/wikipedia/en/c/c9...   
1    12  2009  https://upload.wikimedia.org/wikipedia/en/0/0e...   
2    21  2008  https://upload.wikimedia.org/wikipedia/en/a/a8...   
3   300  2007  https://upload.wikimedia.org/wikipedia/en/5/5c...   
4  1408  2007  https://upload.wikimedia.org/wikipedia/en/6/63...   

                     href                                            extract  \
0  9_(2009_animated_film)  9 is a 2009 computer-animated post-apocalyptic...   
1          12_(2007_film)  12 is a 2007 legal drama film by Russian direc...   
2          21_(2008_film)  21 is a 2008 American heist drama film directe...   
3              300_(film)  300 is a 2006 American epic historical action ...   
4             1408_(film)

In [47]:
# removed the column href column 
# df_cleaned = df_cleaned.drop(columns='href')
print(df_cleaned.columns)


Index(['title', 'year', 'thumbnail', 'extract', 'genres', 'actors',
       'combined_features'],
      dtype='object')


#### tf - idf matrix 

In [60]:
tdidf = TfidfVectorizer(stop_words='english')
tdidf_matrix = tdidf.fit_transform(df_cleaned['combined_features'])
simlarity_matrix = cosine_similarity(tdidf_matrix)
# tdidf_matrix
simlarity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

#### recommendation functionn

In [54]:
def get_recommendations(title,num_recommendations=7):
    index = df_cleaned[df_cleaned['title'] == title].index[0]
    similarity_scores = list(enumerate(simlarity_matrix[index]))
    sorted(similarity_scores,key=lambda x : x[1] , reverse= True)
    
    similar_movies = similarity_scores[1:num_recommendations+1]
    movie_indicies = [i[0] for i in similar_movies]
    similarity_values = [i[1] for i in similar_movies]
    
    
    recommendations = pd.DataFrame({
            'Title': df_cleaned['title'].iloc[movie_indicies],
            'Year': df_cleaned['year'].iloc[movie_indicies],
            'Genres': df_cleaned['genres'].iloc[movie_indicies],
            'Similarity Score': similarity_values
        })
        
    return recommendations
    

In [56]:
# Cell 4: Test the recommender
# Example: Get recommendations for a movie
movie_title = df_cleaned['title'].iloc[1]  # Using first movie as example
print(f"Recommendations for: {movie_title}\n")
recommendations = get_recommendations(movie_title)
print(recommendations)

Recommendations for: 12

            Title  Year                       Genres  Similarity Score
1              12  2009               [Drama, Legal]               1.0
2              21  2008              [Drama, Action]               0.0
3             300  2007         [Action, Historical]               0.0
4            1408  2007                     [Horror]               0.0
5            2012  2009  [Disaster, Science Fiction]               0.0
6  102 Dalmatians  2000      [Comedy, Family, Crime]               0.0
7         28 Days  2000              [Drama, Comedy]               0.0


### Model for genre recommendatiosn 

### recommendations based on genre

In [71]:
def genre_based_recommendations(title,s_genre):
    num_recommendations = 5
    index = df_cleaned[df_cleaned['title'] == title].index[0]
    similarity_score = list(enumerate(simlarity_matrix[index]))
    sorted(similarity_score,key=lambda x: x[1],reverse=True)
    
       # Filter for movies containing the specific genre
    filtered_scores = []
    for i, score in similarity_score:
        if s_genre in df_cleaned['genres'].iloc[i]:
            filtered_scores.append((i, score))
    
    # Get top N recommendations (excluding the movie itself)
    similar_movies = filtered_scores[1:num_recommendations+1]
    
    movie_indices = [i[0] for i in similar_movies]
    similarity_values = [i[1] for i in similar_movies]
    
    recommendations = pd.DataFrame({
        'Title': df_cleaned['title'].iloc[movie_indices],
        'Year': df_cleaned['year'].iloc[movie_indices],
        'Genres': df_cleaned['genres'].iloc[movie_indices],
        'Similarity Score': similarity_values
    })
    
    return recommendations

### recommendations based on actors

In [72]:
# Cell 6: Create a function for actor-based recommendations
def get_actor_based_recommendations(title, specific_actor, n_recommendations=5):
    # Get initial recommendations
    idx = df_cleaned[df_cleaned['title'] == title].index[0]
    similarity_scores = list(enumerate(simlarity_matrix[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Filter for movies containing the specific actor
    filtered_scores = []
    for i, score in similarity_scores:
        if specific_actor in df_cleaned['actors'].iloc[i]:
            filtered_scores.append((i, score))
    
    # Get top N recommendations (excluding the movie itself)
    similar_movies = filtered_scores[1:n_recommendations+1]
    
    movie_indices = [i[0] for i in similar_movies]
    similarity_values = [i[1] for i in similar_movies]
    
    recommendations = pd.DataFrame({
        'Title': df_cleaned['title'].iloc[movie_indices],
        'Year': df_cleaned['year'].iloc[movie_indices],
        'Actors': df_cleaned['actors'].iloc[movie_indices],
        'Similarity Score': similarity_values
    })
    
    return recommendations

In [None]:
# Cell 7: Test genre and actor-based recommendations
# Test genre-based recommendations
movie_title = df_cleaned['title'].iloc[0]
genre = df_cleaned['genres'].iloc[0][2]  # First genre of the first movie
print(f"Genre-based recommendations for {movie_title} in genre {genre}:\n")
print(genre_based_recommendations(movie_title, genre))

# Test actor-based recommendations
actor = df_cleaned['actors'].iloc[0][0]  # First actor of the first movie
print(f"\nActor-based recommendations for {movie_title} with actor {actor}:\n")
print(get_actor_based_recommendations(movie_title, actor))

Genre-based recommendations for 9 in genre comedy:

Empty DataFrame
Columns: [Title, Year, Genres, Similarity Score]
Index: []

Actor-based recommendations for 9 with actor Elijah Wood:

                                                  Title  Year  \
357   The Lord of the Rings: The Fellowship of the Ring  2001   
580               The Lord of the Rings: The Two Towers  2002   
812       The Lord of the Rings: The Return of the King  2003   
940               Eternal Sunshine of the Spotless Mind  2004   
1148                                           Sin City  2005   

                                                 Actors  Similarity Score  
357   [Elijah Wood, Ian McKellen, Liv Tyler, Sean As...               0.0  
580   [Elijah Wood, Ian McKellen, Liv Tyler, Viggo M...               0.0  
812   [Elijah Wood, Ian McKellen, Liv Tyler, Sean As...               0.0  
940   [Jim Carrey, Kate Winslet, Kirsten Dunst, Tom ...               0.0  
1148  [Jessica Alba, Benicio del Toro, Bri