- ### Planteamiento del problema:

Crear un sistema de recomendación de películas.

In [32]:
# Se importan los dos datasets
import pandas as pd

url1 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv'
dataset1= pd.read_csv(url1)

url2 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv'
dataset2= pd.read_csv(url2)

- ### Exploración y limpieza de datos:

In [33]:
# Primera apreciacion del dataset1
filas1 = dataset1.shape[0]
columnas1 = dataset1.shape[1]
print(f'Filas: {filas1} | Columnas: {columnas1}')
print()
print(dataset1.info())

Filas: 4803 | Columnas: 20

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object

In [34]:
# Primera apreciacion del dataset2
filas2 = dataset2.shape[0]
columnas2 = dataset2.shape[1]
print(f'Filas: {filas2} | Columnas: {columnas2}')
print()
print(dataset2.info())

Filas: 4803 | Columnas: 4

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB
None


Creando la base de datos

In [35]:
import sqlite3

connection = sqlite3.connect("movies.db")

In [36]:
# Guardando ambos datasets en la base de datos

dataset1.to_sql('movies', connection, index = False, if_exists = 'replace')
dataset2.to_sql('credits', connection, index = False, if_exists = 'replace')

4803

In [37]:
#Uniendo ambas tablas

query = '''
    SELECT m.id AS movie_id, 
           m.title, 
           m.overview, 
           m.genres, 
           m.keywords, 
           c.cast, 
           c.crew
    FROM movies m
    JOIN credits c ON m.title = c.title
'''

In [38]:
import pandas as pd

dataframe = pd.read_sql_query(sql=query, con = connection)

In [39]:
# Cerrar la conexión a la base de datos
connection.close()

In [40]:
dataframe.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [41]:
dataframe.duplicated().sum()

0

In [42]:
import json

dataframe['genres'] = dataframe['genres'].apply(lambda x: [item['name'] for item in json.loads(x)])
dataframe['keywords'] = dataframe['keywords'].apply(lambda x: [item['name'] for item in json.loads(x)])
dataframe['cast'] = dataframe['cast'].apply(lambda x: [item['name'] for item in json.loads(x)][:3])

In [43]:
dataframe.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [44]:
import json

def load_json_safe(json_str, default_value = None):
    try:
        return json.loads(json_str)
    except (TypeError, json.JSONDecodeError):
        return default_value
    
dataframe["crew"] = dataframe["crew"].apply(lambda x: " ".join([crew_member['name'] for crew_member in load_json_safe(x) if crew_member['job'] == 'Director']))


In [45]:
dataframe.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski


In [46]:
dataframe['overview'] = dataframe['overview'].apply(lambda x: [x] if pd.notna(x) else None)

In [47]:
dataframe.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In the 22nd century, a paraplegic Marine is d...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron
1,285,Pirates of the Caribbean: At World's End,"[Captain Barbossa, long believed to be dead, h...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski


Convertir en string todos los registros de las columnas antes de unirlas.

In [48]:
dataframe["genres"] = dataframe["genres"].apply(lambda x: [str(genre) for genre in x])
dataframe["keywords"] = dataframe["keywords"].apply(lambda x: [str(keyword) for keyword in x])
dataframe["cast"] = dataframe["cast"].apply(lambda x: [str(actor) for actor in x])
dataframe["crew"] = dataframe["crew"].apply(lambda x: [str(x)])
dataframe["overview"] = dataframe["overview"].apply(lambda x: [str(x)])

In [49]:
dataframe.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[['In the 22nd century, a paraplegic Marine is...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[['Captain Barbossa, long believed to be dead,...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [50]:
dataframe['tags'] =  dataframe['overview'] + dataframe['genres'] + dataframe['keywords'] + dataframe['cast'] + dataframe['crew']

In [51]:
dataframe = dataframe[['movie_id','title','tags']]

In [52]:
dataframe["tags"] = dataframe["tags"].apply(lambda x: ",".join(x).replace(",", " "))

In [53]:
dataframe['tags'] = dataframe['tags'].str.lstrip("['")

In [54]:
dataframe['tags'] = dataframe['tags'].str.replace(".']", '')

In [55]:
dataframe['tags'][0]

'In the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver James Cameron'

In [56]:
dataframe['tags'] = dataframe['tags'].str.replace(',',' ')

In [57]:
dataframe['tags'][0]

'In the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver James Cameron'

In [58]:
dataframe.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,In the 22nd century a paraplegic Marine is di...
1,285,Pirates of the Caribbean: At World's End,Captain Barbossa long believed to be dead ha...


In [61]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

vectors = vectorizer.fit_transform(dataframe['tags'])

k = 6 
knn_model = NearestNeighbors(n_neighbors=k, algorithm = "brute", metric='cosine')
knn_model.fit(vectors)

def recommend_knn(movie):
    movie_index = dataframe[dataframe["title"] == movie].index[0]
    distances, indices = knn_model.kneighbors(vectors[movie_index])
    
    # Obtener los índices de las películas más similares
    similar_movies_indices = indices[0][1:]  # Excluyendo la propia película
    
    # Imprimir los títulos de las películas más similares
    for idx in similar_movies_indices:
        print(dataframe.iloc[idx].title)

# Ejemplo de uso
recommend_knn("How to Train Your Dragon")

How to Train Your Dragon 2
Dragon Nest: Warriors' Dawn
Pete's Dragon
George and the Dragon
Eragon


In [None]:
from pickle import dump

dump(knn_model, open("../models/KNN-6_algorithm-brute_metric-cosine.sav", "wb"))