# KNN Project

Install all requirements and imports 

In [1]:
!pip install -r "/workspaces/KNN_project/requirements.txt"

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [89]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json


from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors


from pickle import dump

from flask import Flask
from flask_sqlalchemy import SQLAlchemy

import sqlite3



In [3]:
url_movies = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv"
url_credits = "https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv"

In [4]:
df_movies = pd.read_csv(url_movies)
df_credits = pd.read_csv(url_credits)

# Movies Dataframe 

In [5]:
df_movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [6]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [7]:
df_movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [8]:
df_movies.shape

(4803, 20)

# Credits Dataframe

In [9]:
df_credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [11]:
df_credits.isnull().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

# SQL
-Create a database to store the two DataFrames in separate tables

-Then join the two tables with SQL (and integrate it with Python) to generate a third table containing information from both tables unified

-The key through which the join can be done is the title of the movie (title).


In [12]:
conn = sqlite3.connect("../data/movies_database.db")
engine = create_engine('sqlite:///:memory:') #a different way to connect to the DB


df_movies.to_sql("df_movies", conn, if_exists = "replace", index = False)
df_credits.to_sql("df_credits", conn, if_exists = "replace", index = False)

4803

In [13]:
# join tables on 'title'
query = """
    SELECT *
    FROM df_movies
    INNER JOIN df_credits
    ON df_movies.title = df_credits.title;
"""

In [14]:
total_data = pd.read_sql_query(query, conn)
conn.close()

total_data = total_data.loc[:, ~total_data.columns.duplicated()]
total_data.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [15]:
total_data.isnull().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

movie_id
title
overview
genres
keywords
cast
crew

In [16]:
columns = ["movie_id","title","overview","genres","keywords","cast","crew"]
columns_to_drop = [col for col in total_data.columns if col not in columns]

df_clean = total_data.drop(columns_to_drop, axis = 1)
df_clean.head()

Unnamed: 0,genres,keywords,overview,title,movie_id,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,Spectre,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,The Dark Knight Rises,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...",John Carter,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [17]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   genres    4809 non-null   object
 1   keywords  4809 non-null   object
 2   overview  4806 non-null   object
 3   title     4809 non-null   object
 4   movie_id  4809 non-null   int64 
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


From each of the JSONs, select the name attribute and replace the genres and keywords columns. For the cast column, select the first three names.

In [18]:
print(df_clean.genres[0])

[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]


In [50]:
genres_list = []
for i in range(len(df_clean)):
    temp_list = []
    for l in json.loads(df_clean.genres[i]):
        
        temp_list.append(l['name'])
         
    genres_list.append(','.join(temp_list))



In [48]:
keywords_list = []
for i in range(len(df_clean)):
    temp_list = []
    for l in json.loads(df_clean.keywords[i]):
        
        temp_list.append(l['name'])
         
    keywords_list.append(','.join(temp_list))

In [51]:
len(keywords_list)

4809

In [55]:
#in order to combine 2 lists into a df, use zip


In [56]:
alt_df.head()

Unnamed: 0,genres,keywords
0,"Action,Adventure,Fantasy,Science Fiction","culture clash,future,space war,space colony,so..."
1,"Adventure,Fantasy,Action","ocean,drug abuse,exotic island,east india trad..."
2,"Action,Adventure,Crime","spy,based on novel,secret agent,sequel,mi6,bri..."
3,"Action,Crime,Drama,Thriller","dc comics,crime fighter,terrorist,secret ident..."
4,"Action,Adventure,Science Fiction","based on novel,mars,medallion,space travel,pri..."


In [57]:
cast_list = []
for i in range(len(df_clean)):
    temp_list = []
    for l in json.loads(df_clean.cast[i])[0:3]:
        
        temp_list.append(l['name'])
         
    cast_list.append(','.join(temp_list))

In [62]:
json.loads(df_clean.crew[0])

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'},
 {'credit_id': '54491cb70e0a267480001bd0',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Supervising Sound Editor',
  'name': 'Christopher Boyes'},
 {'credit_id': '539c4a4cc3a36810c9002101',
  'department': 'Production',
  'gender': 1,
  'id': 1262,
  'job': 'Casting',
  'name': 'Mali Finn'},
 {'credit_id': '5544ee3b925141499f0008fc',
  'department': 'Sound',
  'gender': 2,
  'id': 1729,
  'job': 'Original Music Composer',
  'name': 'James Horner'},
 {'credit_id': '52fe48009251416c750ac9c3',
  'department': 'Directing',
  

In [70]:
crew_list = []
for i in range(len(df_clean)):
    temp_list = []
    for l in json.loads(df_clean.crew[i]):
        if 'director' == l['job'].lower():
            temp_list.append(l['name'])
         
    crew_list.append(','.join(temp_list))

In [65]:
len(crew_list)

4809

In [71]:
alt_df = pd.DataFrame(list(zip(genres_list, keywords_list, cast_list, crew_list, df_clean.overview)), columns=['genres', 'keywords','cast','crew', 'overview'])

In [72]:
alt_df.head()

Unnamed: 0,genres,keywords,cast,crew,overview
0,"Action,Adventure,Fantasy,Science Fiction","culture clash,future,space war,space colony,so...","Sam Worthington,Zoe Saldana,Sigourney Weaver",James Cameron,"In the 22nd century, a paraplegic Marine is di..."
1,"Adventure,Fantasy,Action","ocean,drug abuse,exotic island,east india trad...","Johnny Depp,Orlando Bloom,Keira Knightley",Gore Verbinski,"Captain Barbossa, long believed to be dead, ha..."
2,"Action,Adventure,Crime","spy,based on novel,secret agent,sequel,mi6,bri...","Daniel Craig,Christoph Waltz,Léa Seydoux",Sam Mendes,A cryptic message from Bond’s past sends him o...
3,"Action,Crime,Drama,Thriller","dc comics,crime fighter,terrorist,secret ident...","Christian Bale,Michael Caine,Gary Oldman",Christopher Nolan,Following the death of District Attorney Harve...
4,"Action,Adventure,Science Fiction","based on novel,mars,medallion,space travel,pri...","Taylor Kitsch,Lynn Collins,Samantha Morton",Andrew Stanton,"John Carter is a war-weary, former military ca..."


In [73]:
alt_df['tags'] = alt_df.genres + alt_df.keywords + alt_df.cast + alt_df.crew + alt_df.overview

In [79]:
alt_df.tags = alt_df.tags.str.replace(',',' ')
alt_df.head()

Unnamed: 0,genres,keywords,cast,crew,overview,tags
0,"Action,Adventure,Fantasy,Science Fiction","culture clash,future,space war,space colony,so...","Sam Worthington,Zoe Saldana,Sigourney Weaver",James Cameron,"In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fictioncultur...
1,"Adventure,Fantasy,Action","ocean,drug abuse,exotic island,east india trad...","Johnny Depp,Orlando Bloom,Keira Knightley",Gore Verbinski,"Captain Barbossa, long believed to be dead, ha...",Adventure Fantasy Actionocean drug abuse exoti...
2,"Action,Adventure,Crime","spy,based on novel,secret agent,sequel,mi6,bri...","Daniel Craig,Christoph Waltz,Léa Seydoux",Sam Mendes,A cryptic message from Bond’s past sends him o...,Action Adventure Crimespy based on novel secre...
3,"Action,Crime,Drama,Thriller","dc comics,crime fighter,terrorist,secret ident...","Christian Bale,Michael Caine,Gary Oldman",Christopher Nolan,Following the death of District Attorney Harve...,Action Crime Drama Thrillerdc comics crime fig...
4,"Action,Adventure,Science Fiction","based on novel,mars,medallion,space travel,pri...","Taylor Kitsch,Lynn Collins,Samantha Morton",Andrew Stanton,"John Carter is a war-weary, former military ca...",Action Adventure Science Fictionbased on novel...


In [86]:
alt_df.fillna("", inplace=True)

In [80]:
alt_df.tags[0]

'Action Adventure Fantasy Science Fictionculture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3dSam Worthington Zoe Saldana Sigourney WeaverJames CameronIn the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization.'

In [87]:
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(alt_df.tags)

In [100]:
model = NearestNeighbors(n_neighbors=5, metric = 'cosine')
model.fit(tdm)


In [101]:
def get_movie_recommendations(movie_title):
    movie_index = df_clean[df_clean["title"] == movie_title].index[0]
    distances, indices = model.kneighbors(tdm[movie_index])
    similar_movies = [(df_clean["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[0:]

input_movie = "Toy Story"
recommendations = get_movie_recommendations(input_movie)
print("Film recommendations '{}'".format(input_movie))
for movie, distance in recommendations:
    print("- Film: {}".format(movie))

Film recommendations 'Toy Story'
- Film: Toy Story
- Film: Toy Story 2
- Film: Toy Story 3
- Film: Small Soldiers
- Film: The 40 Year Old Virgin
