# K-nearest neighbors: Movie recommendation system

## 1. Data loading
### 1.1. Load

In [14]:
# Handle imports up-front
import ast
import json

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Source CSVs: one table of movie metadata, one table of cast/crew credits
MOVIES_CSV_URL = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv"
CREDITS_CSV_URL = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv"

movies = pd.read_csv(MOVIES_CSV_URL)
credits = pd.read_csv(CREDITS_CSV_URL)

### 1.2. Inspect

In [2]:
# Summarise the movies table: column names, non-null counts and dtypes
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [3]:
# Summarise the credits table: column names, non-null counts and dtypes
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


### 1.3. Join

In [None]:
# Left-join the credits onto the movies: `id` in the movies table matches
# `movie_id` in the credits table, and every movie row is kept.
movies_credits = movies.merge(credits, left_on='id', right_on='movie_id', how='left')

# Preview the first rows of the combined table
movies_credits.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title_x,vote_average,vote_count,movie_id,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## 2. EDA

### 2.1. Feature encoding

In [None]:
# Take a closer look at the features - some contain JSON and/or string data. See the project tutorial page for some hints on how to handle them.
# Function to safely convert the genre/keywords columns to lists
def parse_json_column(column):
    """Decode one cell of a JSON-encoded column (e.g. ``genres`` or ``keywords``).

    Parameters
    ----------
    column : object
        Raw cell value, normally a JSON string such as
        ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    object
        The decoded value (a list of dicts for the columns above).
        Values that are already lists are returned unchanged, so applying the
        function again to an already-parsed column does not wipe it.
        Any other non-string input (e.g. ``NaN``) and any string that cannot
        be parsed yield ``[]``.
    """
    if isinstance(column, list):
        return column
    if not isinstance(column, str):
        return []

    try:
        # The data is JSON, so try the JSON parser first: unlike
        # ast.literal_eval it understands null / true / false.
        return json.loads(column)
    except ValueError:
        # json.JSONDecodeError is a ValueError; fall through to the
        # Python-literal parser below.
        pass

    try:
        # Fallback for Python-literal text (e.g. single-quoted reprs).
        return ast.literal_eval(column)
    except (ValueError, SyntaxError):
        return []

# Decode each JSON-encoded column in place, then store the `name` value of
# every entry as a plain list in the matching `*_names` column.
for json_col, names_col in {'genres': 'genres_names', 'keywords': 'keywords_names'}.items():
    movies_credits[json_col] = movies_credits[json_col].apply(parse_json_column)
    movies_credits[names_col] = movies_credits[json_col].apply(
        lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
    )

# Preview the titles next to the extracted names
movies_credits[['original_title', 'genres_names', 'keywords_names']].head()

Unnamed: 0,original_title,genres_names,keywords_names
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


In the cell below, I rewrote the `.apply()` lambda function in a more verbose - but possibly more familiar - style using loops. The lambda apply method is better here - not only is it more succinct, but there is also a performance benefit to using apply vs looping on a Pandas dataframe. I added the loop version for comparison in case anyone is struggling to make sense of the lambda function.

### 2.2. Missing and/or extreme values

In [None]:
# Look for and clean up any junk data, if it exists
# Count missing values per column. Printed explicitly because a notebook only
# renders the last expression of a cell; a bare expression here is discarded.
print(movies_credits.isnull().sum())

# Drop rows that lack an overview or release date, then renumber rows 0..n-1.
# The TF-IDF matrix and the KNN neighbour indices built from this frame are
# positional; without the reset the index labels keep gaps where rows were
# dropped, and label-based lookups (e.g. `.index[0]`) stop matching matrix rows.
movies_credits = (
    movies_credits
    .dropna(subset=['overview', 'release_date'])
    .reset_index(drop=True)
)

# Check for extreme values in numeric columns
movies_credits[['budget', 'revenue', 'runtime']].describe()

Unnamed: 0,budget,revenue,runtime
count,4799.0,4799.0,4799.0
mean,29065930.0,82329200.0,106.903105
std,40732510.0,162907600.0,22.561305
min,0.0,0.0,0.0
25%,800000.0,0.0,94.0
50%,15000000.0,19184020.0,103.0
75%,40000000.0,92956520.0,118.0
max,380000000.0,2787965000.0,338.0


### 2.3. Feature selection

In [7]:
# Columns that are not used further on in this notebook
unused_columns = ['homepage', 'production_companies', 'production_countries', 'tagline']

# Build a trimmed copy; `movies_credits` itself is left untouched
movies_credits_clean = movies_credits.drop(columns=unused_columns)

# Preview the trimmed table
movies_credits_clean.head()

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,release_date,revenue,...,status,title_x,vote_average,vote_count,movie_id,title_y,cast,crew,genres_names,keywords_names
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,2787965087,...,Released,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",285,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,2007-05-19,961000000,...,Released,Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",206647,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,2015-10-26,880674609,...,Released,Spectre,6.3,4466,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",49026,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,2012-07-16,1084939099,...,Released,The Dark Knight Rises,7.6,9106,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,260000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",49529,"[{'id': 818, 'name': 'based on novel'}, {'id':...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,2012-03-07,284139100,...,Released,John Carter,6.1,2124,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


## 3. Model training

In [None]:
# Vectorize text features
# Flatten each keyword list into one space-separated string
keywords_text = movies_credits_clean['keywords_names'].apply(" ".join)

# One text field per movie: the overview followed by its keywords
movies_credits_clean['text_features'] = movies_credits_clean['overview'] + " " + keywords_text

# TF-IDF over that text, ignoring English stop words and capped at 5000 terms
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_credits_clean['text_features'])

# (number of movies, number of terms)
tfidf_matrix.shape

(4799, 5000)

In [None]:
# K Nearest Neighbors Model
# Build a cosine-distance neighbour index over the TF-IDF rows
# (fit() returns the estimator, so construction and fitting are chained)
knn = NearestNeighbors(metric='cosine', n_neighbors=10).fit(tfidf_matrix)

# Query the 10 closest rows to an example movie (row position 1000)
movie_idx = 1000
query_vector = tfidf_matrix[movie_idx]
distances, indices = knn.kneighbors(query_vector, n_neighbors=10)

# Look the neighbours up by row position and show a few descriptive columns
recommended_movies = movies_credits_clean.iloc[indices[0]]
recommended_movies[['original_title', 'genres_names', 'overview']]

Unnamed: 0,original_title,genres_names,overview
1000,Drive Angry,"[Fantasy, Thriller, Action, Crime]",Milton is a hardened felon who has broken out ...
4756,The Call of Cthulhu,"[Horror, Thriller, Fantasy]",A dying professor leaves his great-nephew a co...
4360,Martha Marcy May Marlene,"[Drama, Thriller]","After several years of living with a cult, Mar..."
4715,Sound of My Voice,"[Science Fiction, Drama, Mystery]",A journalist and his girlfriend get pulled in ...
1186,The Final Destination,"[Horror, Mystery]",After a young man's premonition of a deadly ra...
405,The Fast and the Furious: Tokyo Drift,"[Action, Crime, Drama, Thriller]","In order to avoid a jail sentence, Sean Boswel..."
935,Herbie Fully Loaded,"[Comedy, Family, Adventure, Fantasy, Romance]","Maggie Peyton, the new owner of Number 53 - th..."
2220,Superbabies: Baby Geniuses 2,"[Comedy, Family]","Following on from the plot of the last movie, ..."
44,Furious 7,[Action],Deckard Shaw seeks revenge against Dominic Tor...
1697,Indiana Jones and the Temple of Doom,"[Adventure, Action]","After arriving in India, Indiana Jones is aske..."


## 4. Recommender

In [13]:
def get_movie_recommendations(movie_title, movies_df, knn_model, tfidf_matrix, n_recommendations=10):
    """Return the movies whose feature vectors are closest to the given title.

    Parameters
    ----------
    movie_title : str
        Value looked up in ``movies_df['original_title']`` (exact match).
        If several rows share the title, the first one is used.
    movies_df : pandas.DataFrame
        Movie table whose rows are in the same order as ``tfidf_matrix``.
        Must contain ``original_title``, ``genres_names`` and ``overview``.
    knn_model : object
        Fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
    tfidf_matrix : matrix-like
        Feature matrix with one row per row of ``movies_df``.
    n_recommendations : int, default 10
        Number of neighbours requested from ``knn_model``.

    Returns
    -------
    pandas.DataFrame
        ``original_title``, ``genres_names`` and ``overview`` of the
        neighbours, in the order returned by ``knn_model``.

    Raises
    ------
    IndexError
        If no row of ``movies_df`` has the requested title.
    """
    # Locate the movie by row *position*, not by index label: the matrix and
    # `.iloc` are positional, while labels can have gaps (e.g. after dropna).
    matches = np.flatnonzero((movies_df['original_title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    row_pos = int(matches[0])

    # Slice instead of indexing so the query stays 2-D (one row) for both
    # dense arrays and sparse matrices.
    query = tfidf_matrix[row_pos:row_pos + 1]
    _, indices = knn_model.kneighbors(query, n_neighbors=n_recommendations)

    # Neighbour indices are row positions in the fitted matrix
    recommended_movies = movies_df.iloc[indices[0]]
    return recommended_movies[['original_title', 'genres_names', 'overview']]

# Example: look up the nearest neighbours of "Avatar" with the fitted model
get_movie_recommendations(
    movie_title="Avatar",
    movies_df=movies_credits_clean,
    knn_model=knn,
    tfidf_matrix=tfidf_matrix,
)

Unnamed: 0,original_title,genres_names,overview
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di..."
2403,Aliens,"[Horror, Action, Thriller, Science Fiction]",When Ripley's lifepod is found by a salvage cr...
838,Alien³,"[Science Fiction, Action, Horror]",After escaping with Newt and Hicks from the al...
4332,Silent Running,"[Adventure, Drama, Science Fiction]",In a future Earth barren of all flora and faun...
2015,Spaceballs,"[Comedy, Science Fiction]",When the nefarious Dark Helmet hatches a plan ...
1531,Moonraker,"[Action, Adventure, Thriller, Science Fiction]",During the transportation of a Space Shuttle a...
373,Mission to Mars,[Science Fiction],When contact is lost with the crew of the firs...
3158,Alien,"[Horror, Action, Thriller, Science Fiction]","During its return to the earth, commercial spa..."
2198,Lockout,"[Action, Thriller, Science Fiction]","Set in the near future, Lockout follows a fals..."
461,Lost in Space,"[Adventure, Family, Science Fiction]",The prospects for continuing life on Earth in ...
