In [9]:
import pandas as pd
import ast

movies_rs = pd.read_csv('movies_eda.csv')

In [10]:
# Selecting only the useful features
movies_rs = movies_rs[[ 'title_x',
                       'genres', 'cast', 'keywords', 'crew']]

In [11]:
movies_rs.head()


Unnamed: 0,title_x,genres,cast,keywords,crew
0,Avatar,"['action', 'adventure', 'fantasy', 'science fi...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,Pirates of the Caribbean: At World's End,"['adventure', 'fantasy', 'action']","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,Spectre,"['action', 'adventure', 'crime']","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,The Dark Knight Rises,"['action', 'crime', 'drama', 'thriller']","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,John Carter,"['action', 'adventure', 'science fiction']","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [12]:

# Checking null values
movies_rs.isnull().sum()

title_x     0
genres      0
cast        0
keywords    0
crew        0
dtype: int64

In [13]:
def func(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    return List

In [16]:
movies_rs['keywords'] = movies_rs['keywords'].apply(func)

In [17]:
def func1(obj):
    List = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3:
            List.append(i['name'])
            counter+=1
        else:
            break
    return List

In [18]:
movies_rs['cast'] = movies_rs['cast'].apply(func1)


In [20]:
def func2(obj):
    List = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            List.append(i['name'])
            break
    return List

In [21]:
movies_rs['crew'] = movies_rs['crew'].apply(func2)


In [23]:
movies_rs.head()

Unnamed: 0,title_x,genres,cast,keywords,crew
0,Avatar,"['action', 'adventure', 'fantasy', 'science fi...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[culture clash, future, space war, space colon...",[James Cameron]
1,Pirates of the Caribbean: At World's End,"['adventure', 'fantasy', 'action']","[Johnny Depp, Orlando Bloom, Keira Knightley]","[ocean, drug abuse, exotic island, east india ...",[Gore Verbinski]
2,Spectre,"['action', 'adventure', 'crime']","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[spy, based on novel, secret agent, sequel, mi...",[Sam Mendes]
3,The Dark Knight Rises,"['action', 'crime', 'drama', 'thriller']","[Christian Bale, Michael Caine, Gary Oldman]","[dc comics, crime fighter, terrorist, secret i...",[Christopher Nolan]
4,John Carter,"['action', 'adventure', 'science fiction']","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[based on novel, mars, medallion, space travel...",[Andrew Stanton]


In [24]:
# Applying a transformation to remove spaces between words 

movies_rs['genres'] = movies_rs['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_rs['keywords'] = movies_rs['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_rs['cast'] = movies_rs['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_rs['crew'] = movies_rs['crew'].apply(lambda x:[i.replace(" ","") for i in x])


In [26]:
# Preprocess and Combine Tags
movies_rs['combined_tags'] = ( 
    movies_rs['genres'].apply(lambda x: " ".join(x)) + " " +
    movies_rs['cast'].apply(lambda x: " ".join(x)) + " " +
    movies_rs['keywords'].apply(lambda x: " ".join(x)) + " " +
    movies_rs['crew'].apply(lambda x: " ".join(x))
)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_rs['combined_tags'])


In [28]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [30]:
# Mapping from movie title to index
indices = pd.Series(movies_rs.index, index=movies_rs['title_x']).drop_duplicates()

def recommend_movies(title, cosine_sim=cosine_sim, num_movies=5):
    if title not in indices:
        return "Movie not found in the dataset."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_movies + 1]  # Exclude the input movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_rs['title_x'].iloc[movie_indices]

# Example usage
print(recommend_movies('Avatar'))


47      Star Trek Into Darkness
2403                     Aliens
838                      Alien³
1201                  Predators
1287         A Monster in Paris
Name: title_x, dtype: object
