In [102]:
import pandas as pd
import numpy as np
import openai
import ast
import sys
import json
pd.set_option('display.max_columns', 200)

from dotenv import load_dotenv
import os
load_dotenv("../config.env")

True

## **Carga del dataframe:**

In [103]:
df = pd.read_csv('df.csv')

## **Transformaciones:**

### Belong to collection

In [104]:
#transformar a diccionarios:
df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {'data': None})

La mayor cantidad de belong to collection esta vacia, probaré dejandola a ver como quedan los vectores, solo voy a usar 'name':'

In [105]:
df['belongs_to_collection'][0]

{'id': 10194,
 'name': 'Toy Story Collection',
 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}

Defino una nueva columna útil para el modelo

In [106]:
df['collection'] = df['belongs_to_collection'].apply(lambda x: x['name'] if isinstance(x, dict) and 'name' in x else '')

### Genres

In [107]:
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {'data': None})

In [108]:
df['genres'][0]

{'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'}]}

In [109]:
df['joined_genres'] = df['genres'].apply(lambda x: ' '.join([genre['name'] for genre in x['genres']]) if isinstance(x, dict) and 'genres' in x else '')

#Por ahora no voy a trabajar con production company ni country

### Cast


Sólo tomaré los nombres de los actores

In [110]:
df['cast_names'] = df['cast'].apply(lambda x: ' '.join([actor['name'] for actor in ast.literal_eval(x)]) if isinstance(x, str) else '')

In [111]:
df['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

### Crew


Solo tomaré el director

In [112]:
import math

def convert_to_json(texto):
    if pd.isna(texto) or (isinstance(texto, float) and math.isnan(texto)):
        return ''  # Reemplazar NaN por cadena vacía
    
    if isinstance(texto, str):
        try:
            estructura_datos = ast.literal_eval(texto)
            if isinstance(estructura_datos, (dict, list)):
                texto_json = json.dumps(estructura_datos)
                objeto_json = json.loads(texto_json)
                return objeto_json
            else:
                raise ValueError("El contenido no es una estructura de datos válida")
        except (SyntaxError, ValueError) as e:
            raise ValueError("El contenido no se puede evaluar correctamente") from e
    else:
        raise ValueError("El contenido no es una cadena de texto")

In [113]:
df['crew'] = df['crew'].apply(convert_to_json)

In [114]:
df['crew'][0]

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [115]:
# Crear la nueva columna "director"
df['director'] = ''

# Recorrer cada fila del DataFrame
for index, row in df.iterrows():
    crew = row['crew']
    director_name = ''
    if crew != '':
        for entry in crew:
            if entry['job'] == 'Director':
                director_name = entry['name']
                break
    df.at[index, 'director'] = director_name

In [123]:
df.head(1)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return,cast,crew,collection,joined_genres,cast_names,director
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"{'genres': [{'id': 16, 'name': 'Animation'}, {...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,{'production_companies': [{'name': 'Pixar Anim...,"{'production_countries': [{'iso_3166_1': 'US',...",1995-10-30,373554050.0,81.0,"{'spoken_languages': [{'iso_639_1': 'en', 'nam...",Released,,Toy Story,7.7,5415.0,1995,12.451801,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",Toy Story Collection,Animation Comedy Family,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,John Lasseter


## Redimensionar a las features que serán usadas:

In [124]:

data = df[['id','title','overview','release_year','collection','joined_genres','cast_names','director']]

In [125]:
data[data==''].count()

id                   0
title                0
overview             0
release_year         0
collection       40861
joined_genres     2384
cast_names        2349
director           836
dtype: int64

#### Por limitaciones tecnicas será necesario hacer un muestro de los datos

In [126]:
# Establecer la semilla (seed)
seed = 33
np.random.seed(seed)

# Realizar el muestreo aleatorio
sample_df = data.sample(frac=0.12) 

sample_df = sample_df.reset_index(drop=True)

In [127]:
sample_df.head()

Unnamed: 0,id,title,overview,release_year,collection,joined_genres,cast_names,director
26579,55531,The Radio Burglary,Eager young radio reporter Teräsvuori stages a...,1951,,Comedy Crime Thriller,Hannes Häyrinen Ritva Arvelo Kullervo Kalske K...,Matti Kassila
40897,104044,Roller Blade Warriors: Taken by Force,"In the future, a warrior nun on roller skates ...",1989,,Fantasy Science Fiction,Kathleen Kinmont Rory Calhoun Cleve Hall Jack ...,Donald G. Jackson
4530,30666,Stepfather II: Make Room For Daddy,The Stepfather escapes an insane asylum and wi...,1989,The Stepfather Collection,Horror Thriller,Terry O'Quinn Meg Foster Caroline Williams Jon...,Jeff Burr
14283,50936,Moon Warriors,"A kind-hearted fisherman, content with simple ...",1992,,Action Adventure Drama Fantasy Foreign Romance...,Andy Lau Anita Mui Kenny Bee Maggie Cheung Kel...,Sammo Hung
31805,108632,The Show,Cock Robin (John Gilbert) is the swaggering ba...,1927,,Crime Drama,John Gilbert Renée Adorée Lionel Barrymore Edw...,Tod Browning


## **Feature engineering** (esto no va)

In [None]:
#partir el dataset en varios para poder pasarlo a openai

In [89]:
sample_df.shape

(5442, 26)

In [87]:
token = os.getenv("TOKEN")
openai.api_key  = token

In [92]:
#Tomado del curso de # Prompt Engineer de deeplearning.ia-Inferring by AndrewNg / Isa Fulford) 

'''def get_completion(x, model="gpt-3.5-turbo"):
    prompt = f""" 
    Analyze the following movie overview: '''{x}'''. Please generate a list of up to 6 keywords that accurately describe the content and main themes of the movie. Exclude common character names and focus on more general and relevant aspects, the return must be like this: 'keyword1 keyword2...keyword6'. If no keywords are identified, please return ''.
    """ 
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message["content"]

In [None]:
'''import time

# Dividir el DataFrame original en 10 partes
num_parts = 40
df_parts = np.array_split(sample_df, num_parts)

# Realizar operaciones en cada parte del DataFrame
for i, df_part in enumerate(df_parts):

    df_part["Nueva_Columna"] = df_part['overview'].apply(lambda x: get_completion(x))
    print('procesado')
    # Agregar un tiempo de espera de 30 segundos
    time.sleep(120)
# Unir los DataFrames
merged_df = pd.concat(df_parts)

# Imprimir el número de registros en el DataFrame original y el DataFrame unido
print(f"Original DataFrame: {len(sample_df)} registros")
print(f"Merged DataFrame: {len(merged_df)} registros")'''

#continuamos acá:

In [137]:
sample_df.head()

Unnamed: 0,id,title,overview,release_year,collection,joined_genres,cast_names,director
26579,55531,The Radio Burglary,Eager young radio reporter Teräsvuori stages a...,1951,,Comedy Crime Thriller,Hannes Häyrinen Ritva Arvelo Kullervo Kalske K...,Matti Kassila
40897,104044,Roller Blade Warriors: Taken by Force,"In the future, a warrior nun on roller skates ...",1989,,Fantasy Science Fiction,Kathleen Kinmont Rory Calhoun Cleve Hall Jack ...,Donald G. Jackson
4530,30666,Stepfather II: Make Room For Daddy,The Stepfather escapes an insane asylum and wi...,1989,The Stepfather Collection,Horror Thriller,Terry O'Quinn Meg Foster Caroline Williams Jon...,Jeff Burr
14283,50936,Moon Warriors,"A kind-hearted fisherman, content with simple ...",1992,,Action Adventure Drama Fantasy Foreign Romance...,Andy Lau Anita Mui Kenny Bee Maggie Cheung Kel...,Sammo Hung
31805,108632,The Show,Cock Robin (John Gilbert) is the swaggering ba...,1927,,Crime Drama,John Gilbert Renée Adorée Lionel Barrymore Edw...,Tod Browning


In [145]:
combined_features = pd.DataFrame()

In [146]:
combined_features['id'] = sample_df['id']
combined_features['text'] = sample_df['title'] + ' ' + sample_df['overview'] + ' ' + sample_df['release_year'].astype(str) + ' ' + sample_df['collection'] + ' ' + sample_df['joined_genres'] + ' ' + sample_df['cast_names'] + ' ' + sample_df['director']

In [147]:
combined_features

Unnamed: 0,id,text
26579,55531,The Radio Burglary Eager young radio reporter ...
40897,104044,Roller Blade Warriors: Taken by Force In the f...
4530,30666,Stepfather II: Make Room For Daddy The Stepfat...
14283,50936,"Moon Warriors A kind-hearted fisherman, conten..."
31805,108632,The Show Cock Robin (John Gilbert) is the swag...
...,...,...
485,2246,Malice A tale about a happily married couple w...
569,10897,"The Little Rascals Spanky, Alfalfa, Buckwheat,..."
26560,178607,Right Cross A sportswriter (Dick Powell) forms...
24549,71114,Cleopatra A retired teacher and a soap star le...


## Limpieza

In [135]:
import re
from unidecode import unidecode

def limpiar_texto(texto):
    texto_sin_saltos = texto.replace("\n", " ")
    texto_minusculas = texto_sin_saltos.lower()
    texto_sin_tildes = unidecode(texto_minusculas)
    patron = r'@[\w]+|#\w+|[!,".]|(\b[^\w\s]\b)|\bhttps?\S+\b'
    texto_limpio = re.sub(patron, "", texto_sin_tildes)
    return texto_limpio

In [150]:
combined_features['cleaned_text'] = combined_features['text'].astype(str).apply(limpiar_texto)

In [151]:
combined_features

Unnamed: 0,id,text,cleaned_text
26579,55531,The Radio Burglary Eager young radio reporter ...,the radio burglary eager young radio reporter ...
40897,104044,Roller Blade Warriors: Taken by Force In the f...,roller blade warriors: taken by force in the f...
4530,30666,Stepfather II: Make Room For Daddy The Stepfat...,stepfather ii: make room for daddy the stepfat...
14283,50936,"Moon Warriors A kind-hearted fisherman, conten...",moon warriors a kindhearted fisherman content ...
31805,108632,The Show Cock Robin (John Gilbert) is the swag...,the show cock robin (john gilbert) is the swag...
...,...,...,...
485,2246,Malice A tale about a happily married couple w...,malice a tale about a happily married couple w...
569,10897,"The Little Rascals Spanky, Alfalfa, Buckwheat,...",the little rascals spanky alfalfa buckwheat an...
26560,178607,Right Cross A sportswriter (Dick Powell) forms...,right cross a sportswriter (dick powell) forms...
24549,71114,Cleopatra A retired teacher and a soap star le...,cleopatra a retired teacher and a soap star le...


In [152]:
### Tokenizar y quitar stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abrahan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [153]:
from nltk.tokenize import RegexpTokenizer
regexp = RegexpTokenizer('\w+')
combined_features['token'] = combined_features['cleaned_text'].apply(regexp.tokenize)
combined_features.head()

Unnamed: 0,id,text,cleaned_text,token
26579,55531,The Radio Burglary Eager young radio reporter ...,the radio burglary eager young radio reporter ...,"[the, radio, burglary, eager, young, radio, re..."
40897,104044,Roller Blade Warriors: Taken by Force In the f...,roller blade warriors: taken by force in the f...,"[roller, blade, warriors, taken, by, force, in..."
4530,30666,Stepfather II: Make Room For Daddy The Stepfat...,stepfather ii: make room for daddy the stepfat...,"[stepfather, ii, make, room, for, daddy, the, ..."
14283,50936,"Moon Warriors A kind-hearted fisherman, conten...",moon warriors a kindhearted fisherman content ...,"[moon, warriors, a, kindhearted, fisherman, co..."
31805,108632,The Show Cock Robin (John Gilbert) is the swag...,the show cock robin (john gilbert) is the swag...,"[the, show, cock, robin, john, gilbert, is, th..."


In [154]:
### Quitar stopwords

combined_features['token_no_stopwords'] = combined_features['token'].apply(lambda x: [item for item in x if item not in stopwords])

In [156]:
### Lematización y stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


In [157]:
combined_features['stemming'] = combined_features['token_no_stopwords'].apply(lambda x: [stemmer.stem(item) for item in x ])

In [167]:
combined_features['stemming_unique'] = combined_features['stemming'].apply(lambda x: list(set(x)))

In [169]:
combined_features['stemming_unique']

26579    [comedi, laakso, heist, escap, gang, ritva, he...
40897    [futur, donald, jackson, michael, taken, eliza...
4530     [town, futur, daddi, howev, escap, wind, wife,...
14283    [reluctantli, chin, bee, michael, law, biuchue...
31805    [ballyhoo, sinist, rene, gertrud, perform, aff...
                               ...                        
485      [malic, baldwin, georg, josef, harold, c, coll...
569      [nostalg, famili, trump, made, marsico, nineye...
26560    [june, celli, daughter, marilyn, sportswrit, p...
24549    [sbaraglia, comedi, weekend, leonardo, trip, 2...
1361     [duart, actress, wife, 1996, littman, woman, p...
Name: stemming_unique, Length: 5442, dtype: object

In [178]:
combined_features['string_ready'] = combined_features['stemming_unique'].apply(lambda words: ' '.join(set(words)))
df_ready = combined_features['string_ready']

In [179]:
df_ready

26579    comedi laakso escap heist gang ritva helsinki ...
40897    futur donald jackson michael taken elizabeth g...
4530     town futur daddi howev escap wind wife oleari ...
14283    content reluctantli chin bee michael law biuch...
31805    ballyhoo sinist rene gertrud perform affect ve...
                               ...                        
485      malic baldwin georg josef harold c colleg neve...
569      nostalg famili trump made marsico heather nine...
26560    june celli daughter marilyn sportswrit powel 1...
24549    sbaraglia comedi weekend leonardo trip 2003 so...
1361     duart actress littman wife 1996 woman peron su...
Name: string_ready, Length: 5442, dtype: object

In [180]:
type(df_ready)

pandas.core.series.Series

In [181]:
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [182]:
vectorizer = TfidfVectorizer()

In [183]:
feature_vectors = vectorizer.fit_transform(df_ready)

In [185]:
print(feature_vectors)

  (0, 9219)	0.14638229989440488
  (0, 13188)	0.1404654101614472
  (0, 45446)	0.08989989873161446
  (0, 45546)	0.05651385269983939
  (0, 10474)	0.148171902975239
  (0, 10163)	0.06640751527110655
  (0, 43181)	0.11260584541078504
  (0, 25937)	0.09644968334634099
  (0, 33987)	0.1643884687857352
  (0, 50299)	0.060898266281317205
  (0, 13482)	0.18323424244700212
  (0, 35760)	0.09198152941686544
  (0, 31553)	0.13487242414227188
  (0, 6918)	0.1643884687857352
  (0, 45194)	0.18323424244700212
  (0, 37901)	0.1129402830355519
  (0, 23457)	0.18323424244700212
  (0, 15082)	0.1143352216235538
  (0, 33535)	0.1643884687857352
  (0, 37420)	0.09818436660616478
  (0, 9278)	0.13212602246588656
  (0, 23589)	0.1748948547514415
  (0, 204)	0.12895553440931423
  (0, 10176)	0.10216591333406257
  (0, 44814)	0.1392185156318959
  :	:
  (5441, 4340)	0.1310799753949926
  (5441, 23014)	0.11338730678163358
  (5441, 48081)	0.12141130101288959
  (5441, 34460)	0.11003655128107058
  (5441, 254)	0.11213115385859612
  (5441

In [186]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [187]:
print(similarity.shape)

(5442, 5442)


In [188]:
import pickle

with open('similarity.pickle','wb') as f:
    pickle.dump(similarity,f)