# Creacion de DataFrames de Prime

## Lecturas archivos

In [1]:
import numpy as np
import pandas as pd

### api_prime
Informacion de las peliculas entregada por la API. Las columnas nos entregan los siguientes datos:
- `id`: Identificador de la pelicula en la API
- `title`: Nombre de la pelicula
- `year`: Año de estreno de la pelicula
- `imdb_id`: Identificador de la pelicula en IMDB
- `tmdb_id`: Identificador de la pelicula en TMDB
- `tmdb_type`: Tipo del titulo en TMDB (no aparece en el `api_prime.csv` cargado abajo)
- `type`: Tipo del titulo en la API (no aparece en el `api_prime.csv` cargado abajo)

In [3]:
# Load the movie listing obtained from the API (comma-separated) and preview the first rows.
dfprime = pd.read_csv('data/api_prime.csv')
dfprime.head()

Unnamed: 0,id,title,year,imdb_id,tmdb_id
0,11001921,John Candy: I Like Me,2025,tt26683420,1492608
1,1882904,Maintenance Required,2025,tt33335602,1352755
2,1535445,Gladiator II,2024,tt9218128,558449
3,1532981,The Batman,2022,tt1877830,414906
4,1498670,Nosferatu,2024,tt5040012,426063


### imdb_basics
Informacion basica de cada titulo en IMDB. Sus columnas son:
- `tconst`: Id del titulo en IMDB
- `titleType`: Tipo del titulo
- `primaryTitle`: Nombre mas comun del titulo
- `originalTitle`: Nombre original del titulo
- `isAdult`: Bool que indica si es para adultos o no
- `startYear`: Año de salida, en series es el año de comienzo de la serie
- `endYear`: Año de fin de la serie (No muy util ya que solo trabajaremos con peliculas)
- `runtimeMinutes`: Duracion del titulo en minutos
- `genres`: Lista de generos del titulo

In [4]:
# title.basics is tab-separated; read_table uses a tab delimiter by default.
imdb_basics = pd.read_table('data/title.basics.tsv')
imdb_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


### imdb_ratings
Rating de cada pelicula en IMDB. Sus columnas son:
- `tconst`: Id de IMDB
- `averageRating`: Puntaje promedio dado por los votos
- `numVotes`: Cantidad de votos

In [5]:
# Ratings file is tab-separated; read_table uses a tab delimiter by default.
imdb_ratings = pd.read_table('data/title.ratings.tsv')
imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2178
1,tt0000002,5.5,299
2,tt0000003,6.4,2243
3,tt0000004,5.2,193
4,tt0000005,6.2,2986


### imdb_principals
Trabajadores involucrados en cada titulo de IMDB (directores, productores, actores, etc.). Sus columnas son:
- `tconst`: Id del titulo en IMDB
- `ordering`: Id para enumerar a los trabajadores por titulo
- `nconst`: Id de persona en IMDB
- `category`: Categoria del rol que cumplio en el titulo
- `job`: Trabajo que tenia en el titulo
- `characters`: En caso de ser actor, muestra los nombres de los personajes que interpreta

In [6]:
# Principals file (one row per person per title) is tab-separated.
imdb_principals = pd.read_table('data/title.principals.tsv')
imdb_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


### imdb_crew
Directores y escritores de cada titulo en IMDB. Sus columnas son:
- `tconst`: Id del titulo en IMDB
- `directors`: Id de persona del director en IMDB
- `writers`: Id de persona de los escritores en IMDB

In [7]:
# Crew file (directors / writers per title) is tab-separated.
imdb_crew = pd.read_table('data/title.crew.tsv')
imdb_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


### imdb_name
Este Dataframe contiene informacion de cada persona relacionada a titulos dentro de IMDB. Sus columnas son:
- `nconst`: Id de la persona en IMDB
- `primaryName`: Nombre por el que es mas conocida la persona
- `birthYear`: Año de nacimiento de la persona
- `deathYear`: Año de fallecimiento de la persona
- `primaryProfession`: Los tres roles que mas suele cumplir en los titulos
- `knownForTitles`: Titulos por los que es conocido

In [8]:
# People file (one row per nconst) is tab-separated.
imdb_name = pd.read_table('data/name.basics.tsv')
imdb_name.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0050419,tt0072308,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


## Creacion de Dataframe principal

### Join con `imdb_basics`

In [9]:
# Left join: every API row is kept, and IMDb basics are attached where imdb_id == tconst.
df_main1 = pd.merge(
    dfprime,
    imdb_basics,
    how='left',
    left_on='imdb_id',
    right_on='tconst',
)
df_main1.head()

Unnamed: 0,id,title,year,imdb_id,tmdb_id,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,11001921,John Candy: I Like Me,2025,tt26683420,1492608,tt26683420,movie,John Candy: I Like Me,John Candy: I Like Me,0.0,2025,\N,113,"Biography,Comedy,Documentary"
1,1882904,Maintenance Required,2025,tt33335602,1352755,tt33335602,movie,Maintenance Required,Maintenance Required,0.0,2025,\N,\N,"Comedy,Romance"
2,1535445,Gladiator II,2024,tt9218128,558449,tt9218128,movie,Gladiator II,Gladiator II,0.0,2024,\N,148,"Action,Adventure,Drama"
3,1532981,The Batman,2022,tt1877830,414906,tt1877830,movie,The Batman,The Batman,0.0,2022,\N,176,"Action,Crime,Drama"
4,1498670,Nosferatu,2024,tt5040012,426063,tt5040012,movie,Nosferatu,Nosferatu,0.0,2024,\N,132,"Fantasy,Horror,Mystery"


### Join con `imdb_ratings`

In [10]:
# Left join with the ratings table; titles without a rating keep missing values.
# Both sides carry a tconst column here, so pandas suffixes them (_x / _y).
df_main2 = pd.merge(
    df_main1,
    imdb_ratings,
    how='left',
    left_on='imdb_id',
    right_on='tconst',
)
df_main2.head()

Unnamed: 0,id,title,year,imdb_id,tmdb_id,tconst_x,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,tconst_y,averageRating,numVotes
0,11001921,John Candy: I Like Me,2025,tt26683420,1492608,tt26683420,movie,John Candy: I Like Me,John Candy: I Like Me,0.0,2025,\N,113,"Biography,Comedy,Documentary",tt26683420,7.5,78.0
1,1882904,Maintenance Required,2025,tt33335602,1352755,tt33335602,movie,Maintenance Required,Maintenance Required,0.0,2025,\N,\N,"Comedy,Romance",,,
2,1535445,Gladiator II,2024,tt9218128,558449,tt9218128,movie,Gladiator II,Gladiator II,0.0,2024,\N,148,"Action,Adventure,Drama",tt9218128,6.5,272412.0
3,1532981,The Batman,2022,tt1877830,414906,tt1877830,movie,The Batman,The Batman,0.0,2022,\N,176,"Action,Crime,Drama",tt1877830,7.8,893476.0
4,1498670,Nosferatu,2024,tt5040012,426063,tt5040012,movie,Nosferatu,Nosferatu,0.0,2024,\N,132,"Fantasy,Horror,Mystery",tt5040012,7.2,232086.0


### Join con `imdb_crew`

In [11]:
# Left join with the crew table to bring in the directors / writers id strings.
df_main3 = pd.merge(
    df_main2,
    imdb_crew,
    how='left',
    left_on='imdb_id',
    right_on='tconst',
)
df_main3.head()

Unnamed: 0,id,title,year,imdb_id,tmdb_id,tconst_x,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,tconst_y,averageRating,numVotes,tconst,directors,writers
0,11001921,John Candy: I Like Me,2025,tt26683420,1492608,tt26683420,movie,John Candy: I Like Me,John Candy: I Like Me,0.0,2025,\N,113,"Biography,Comedy,Documentary",tt26683420,7.5,78.0,tt26683420,nm0004988,\N
1,1882904,Maintenance Required,2025,tt33335602,1352755,tt33335602,movie,Maintenance Required,Maintenance Required,0.0,2025,\N,\N,"Comedy,Romance",,,,tt33335602,nm8919237,"nm12913021,nm2874428,nm8919237"
2,1535445,Gladiator II,2024,tt9218128,558449,tt9218128,movie,Gladiator II,Gladiator II,0.0,2024,\N,148,"Action,Adventure,Drama",tt9218128,6.5,272412.0,tt9218128,nm0000631,"nm0291905,nm0185976,nm0769227"
3,1532981,The Batman,2022,tt1877830,414906,tt1877830,movie,The Batman,The Batman,0.0,2022,\N,176,"Action,Crime,Drama",tt1877830,7.8,893476.0,tt1877830,nm0716257,"nm0716257,nm0185976,nm0004170,nm0277730"
4,1498670,Nosferatu,2024,tt5040012,426063,tt5040012,movie,Nosferatu,Nosferatu,0.0,2024,\N,132,"Fantasy,Horror,Mystery",tt5040012,7.2,232086.0,tt5040012,nm3211470,"nm3211470,nm0301961,nm0831290"


### Limpieza
#### Eliminar columnas
Eliminamos columnas con informacion repetida o irrelevante

In [12]:
# Columns to keep, in display order. Everything else (tmdb_id, endYear and the
# tconst* join keys) is left out.
columnas_main = [
    'id', 'imdb_id', 'title', 'primaryTitle', 'originalTitle', 'titleType',
    'year', 'startYear', 'isAdult', 'runtimeMinutes', 'genres',
    'averageRating', 'numVotes', 'directors', 'writers',
]
df_main4 = df_main3.loc[:, columnas_main]
df_main4.head()

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0.0,113,"Biography,Comedy,Documentary",7.5,78.0,nm0004988,\N
1,1882904,tt33335602,Maintenance Required,Maintenance Required,Maintenance Required,movie,2025,2025,0.0,\N,"Comedy,Romance",,,nm8919237,"nm12913021,nm2874428,nm8919237"
2,1535445,tt9218128,Gladiator II,Gladiator II,Gladiator II,movie,2024,2024,0.0,148,"Action,Adventure,Drama",6.5,272412.0,nm0000631,"nm0291905,nm0185976,nm0769227"
3,1532981,tt1877830,The Batman,The Batman,The Batman,movie,2022,2022,0.0,176,"Action,Crime,Drama",7.8,893476.0,nm0716257,"nm0716257,nm0185976,nm0004170,nm0277730"
4,1498670,tt5040012,Nosferatu,Nosferatu,Nosferatu,movie,2024,2024,0.0,132,"Fantasy,Horror,Mystery",7.2,232086.0,nm3211470,"nm3211470,nm0301961,nm0831290"


#### Correccion valores nulos y tipos de columnas

In [13]:
# Map every missing-value marker (the literal \N string, NaN and None) to pd.NA.
marcadores_nulos = ['\\N', np.nan, None]
df_main5 = df_main4.replace(to_replace=marcadores_nulos, value=pd.NA)

In [14]:
# Inspect the column dtypes to see which columns still need an explicit cast.
df_main5.dtypes

id                 int64
imdb_id           object
title             object
primaryTitle      object
originalTitle     object
titleType         object
year               int64
startYear         object
isAdult           object
runtimeMinutes    object
genres            object
averageRating     object
numVotes          object
directors         object
writers           object
dtype: object

In [15]:
# Target dtype per column: nullable Int64 / Float64 keep pd.NA for missing values,
# and titleType becomes a categorical.
tipos_main = {
    'titleType': 'category',
    'startYear': 'Int64',
    'isAdult': 'Int64',
    'runtimeMinutes': 'Int64',
    'averageRating': 'Float64',
    'numVotes': 'Int64',
}
for columna, tipo in tipos_main.items():
    df_main5[columna] = df_main5[columna].astype(tipo)

In [16]:
# Preview the frame after the dtype casts.
df_main5.head()

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113.0,"Biography,Comedy,Documentary",7.5,78.0,nm0004988,
1,1882904,tt33335602,Maintenance Required,Maintenance Required,Maintenance Required,movie,2025,2025,0,,"Comedy,Romance",,,nm8919237,"nm12913021,nm2874428,nm8919237"
2,1535445,tt9218128,Gladiator II,Gladiator II,Gladiator II,movie,2024,2024,0,148.0,"Action,Adventure,Drama",6.5,272412.0,nm0000631,"nm0291905,nm0185976,nm0769227"
3,1532981,tt1877830,The Batman,The Batman,The Batman,movie,2022,2022,0,176.0,"Action,Crime,Drama",7.8,893476.0,nm0716257,"nm0716257,nm0185976,nm0004170,nm0277730"
4,1498670,tt5040012,Nosferatu,Nosferatu,Nosferatu,movie,2024,2024,0,132.0,"Fantasy,Horror,Mystery",7.2,232086.0,nm3211470,"nm3211470,nm0301961,nm0831290"


In [17]:
def _split_csv(value):
    """Turn a comma-separated string into a list of its parts.

    Missing values become an empty list. A value that is already a list is
    returned unchanged, so re-running this cell does not raise or corrupt the
    column (pd.notna on a list returns an array, which cannot be used in `if`).
    """
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return str(value).split(',')


# genres / directors / writers hold comma-separated values; store them as lists.
for col in ['genres', 'directors', 'writers']:
    df_main5[col] = df_main5[col].apply(_split_csv)

df_main5.head()

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113.0,"[Biography, Comedy, Documentary]",7.5,78.0,[nm0004988],[]
1,1882904,tt33335602,Maintenance Required,Maintenance Required,Maintenance Required,movie,2025,2025,0,,"[Comedy, Romance]",,,[nm8919237],"[nm12913021, nm2874428, nm8919237]"
2,1535445,tt9218128,Gladiator II,Gladiator II,Gladiator II,movie,2024,2024,0,148.0,"[Action, Adventure, Drama]",6.5,272412.0,[nm0000631],"[nm0291905, nm0185976, nm0769227]"
3,1532981,tt1877830,The Batman,The Batman,The Batman,movie,2022,2022,0,176.0,"[Action, Crime, Drama]",7.8,893476.0,[nm0716257],"[nm0716257, nm0185976, nm0004170, nm0277730]"
4,1498670,tt5040012,Nosferatu,Nosferatu,Nosferatu,movie,2024,2024,0,132.0,"[Fantasy, Horror, Mystery]",7.2,232086.0,[nm3211470],"[nm3211470, nm0301961, nm0831290]"


#### Valores Na

In [18]:
# Missing-value count per column, keyed by column name.
resultados = {
    columna: df_main5[columna].isna().sum()
    for columna in df_main5.columns
}
resultados

{'id': np.int64(0),
 'imdb_id': np.int64(0),
 'title': np.int64(0),
 'primaryTitle': np.int64(4),
 'originalTitle': np.int64(4),
 'titleType': np.int64(4),
 'year': np.int64(0),
 'startYear': np.int64(5),
 'isAdult': np.int64(4),
 'runtimeMinutes': np.int64(44),
 'genres': np.int64(0),
 'averageRating': np.int64(9),
 'numVotes': np.int64(9),
 'directors': np.int64(0),
 'writers': np.int64(0)}

In [19]:
# Show the rows whose runtimeMinutes is missing.
sin_duracion = df_main5['runtimeMinutes'].isna()
df_main5[sin_duracion]

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
1,1882904,tt33335602,Maintenance Required,Maintenance Required,Maintenance Required,movie,2025,2025.0,0.0,,"[Comedy, Romance]",,,[nm8919237],"[nm12913021, nm2874428, nm8919237]"
18,11035495,tt38119836,SAQUON,Saquon,Saquon,movie,2025,2025.0,0.0,,[Documentary],,,[nm11941708],[]
537,1623475,tt11422728,"Summer of Soul (...Or, When the Revolution Cou...",,,,2021,,,,[],,,[],[]
2526,1897521,tt33311244,Our Fault,Our Fault,Culpa nuestra,movie,2025,2025.0,0.0,,[Romance],,,[nm0328126],"[nm3399103, nm0328126, nm13982803]"
2820,1169647,tt6793470,Forced Escape,Huroob Idhterari,Huroob Idhterari,movie,2017,2017.0,0.0,,"[Action, Comedy, Drama]",6.0,1850.0,[nm5394449],[nm10080143]
2864,1872401,tt34754710,Baban Baban Ban Vampire,Baban Baban Ban Vampire,Baban Baban Ban Vampire,movie,2025,2025.0,0.0,,"[Comedy, Fantasy, Romance]",5.6,18.0,[nm11008729],"[nm1883488, nm16768805]"
3793,1700877,tt14832738,Solid Rock Trust,Solid Rock Trust,Solid Rock Trust,movie,2022,2022.0,0.0,,"[Drama, Thriller]",6.2,110.0,[nm3546744],[nm3546744]
4256,1691266,tt13515772,Trust,Trust,Trust,movie,2022,2022.0,0.0,,[Drama],7.0,111.0,[nm4509732],"[nm4509732, nm4578941]"
4264,1653436,tt14176892,Untangled,Untangled,Untangled,movie,2022,2022.0,0.0,,"[Drama, Romance]",4.4,58.0,[nm6963836],"[nm10035732, nm11834397, nm8355768]"
4316,1634173,tt6239078,100m Criminal Conviction,100M Criminal Conviction,100M Criminal Conviction,movie,2021,2021.0,0.0,,[Crime],5.4,86.0,[nm8576884],[nm8148131]


In [20]:
# Show the rows whose averageRating is missing.
sin_rating = df_main5['averageRating'].isna()
df_main5[sin_rating]

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
1,1882904,tt33335602,Maintenance Required,Maintenance Required,Maintenance Required,movie,2025,2025.0,0.0,,"[Comedy, Romance]",,,[nm8919237],"[nm12913021, nm2874428, nm8919237]"
18,11035495,tt38119836,SAQUON,Saquon,Saquon,movie,2025,2025.0,0.0,,[Documentary],,,[nm11941708],[]
36,1684871,tt18392014,Play Dirty,Play Dirty,Play Dirty,movie,2025,2025.0,0.0,125.0,"[Crime, Drama]",,,[nm0000948],"[nm0922799, nm0000948, nm3143168, nm0046524]"
537,1623475,tt11422728,"Summer of Soul (...Or, When the Revolution Cou...",,,,2021,,,,[],,,[],[]
2526,1897521,tt33311244,Our Fault,Our Fault,Culpa nuestra,movie,2025,2025.0,0.0,,[Romance],,,[nm0328126],"[nm3399103, nm0328126, nm13982803]"
6207,4148099,tt11015752,"PAW Patrol: Ready, Race, Rescue!",,,,2019,,,,[],,,[],[]
6582,1625502,tt13836494,Pinkfong & Baby Shark's Space Adventure,,,,2019,,,,[],,,[],[]
7353,1583606,tt10274176,Most Dangerous Game,Most Dangerous Game,Most Dangerous Game,movie,2020,,0.0,,[Comedy],,,[],[nm4199289]
7358,1796155,tt27599817,A Hipster in Rural Spain,,,,2024,,,,[],,,[],[]


#### Duplicados

In [21]:
# Number of rows whose imdb_id repeats an earlier row.
df_main5['imdb_id'].duplicated().sum()

np.int64(0)

#### Analisis de columnas

In [22]:
# Summary statistics for the numeric columns.
df_main5.describe()

Unnamed: 0,id,year,startYear,isAdult,runtimeMinutes,averageRating,numVotes
count,7387.0,7387.0,7382.0,7383.0,7343.0,7378.0,7378.0
mean,1286783.0,2008.658996,2008.55527,0.0,102.200327,5.821618,17435.83505
std,581003.4,18.162548,18.136997,0.0,21.347217,1.26408,68580.522345
min,124.0,1916.0,1916.0,0.0,40.0,1.4,6.0
25%,1189871.0,2005.0,2005.0,0.0,89.0,5.1,929.25
50%,1399781.0,2016.0,2016.0,0.0,97.0,6.0,2911.0
75%,1608651.0,2020.0,2020.0,0.0,110.0,6.7,9149.25
max,11035500.0,2025.0,2025.0,0.0,357.0,9.6,2367400.0


In [23]:
# Final name for the title-level frame. This binds a second name to the same
# object (no copy is made), so changes through either name affect both.
dfmain = df_main5

In [24]:
# Persist the title-level frame to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is written as
# an unnamed first column, and the list-valued columns (genres, directors, writers)
# are stored as their string representation -- confirm downstream readers expect this.
dfmain.to_csv('data/imdb_prime.csv')

## Creacion Dataframe personas

### Join con `imdb_principals`

In [25]:
# Left join with principals: each title row is repeated once per credited person.
df_personas1 = pd.merge(
    dfmain,
    imdb_principals,
    how='left',
    left_on='imdb_id',
    right_on='tconst',
)
df_personas1.head()

Unnamed: 0,id,imdb_id,title,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,...,averageRating,numVotes,directors,writers,tconst,ordering,nconst,category,job,characters
0,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,...,7.5,78,[nm0004988],[],tt26683420,1.0,nm0000158,actor,\N,\N
1,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,...,7.5,78,[nm0004988],[],tt26683420,2.0,nm0001006,archive_footage,\N,"[""Self""]"
2,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,...,7.5,78,[nm0004988],[],tt26683420,3.0,nm0000195,self,\N,"[""Self""]"
3,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,...,7.5,78,[nm0004988],[],tt26683420,4.0,nm0000188,actor,\N,\N
4,11001921,tt26683420,John Candy: I Like Me,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,...,7.5,78,[nm0004988],[],tt26683420,5.0,nm0000316,self,\N,"[""Self""]"


### Eliminar columnas

In [26]:
# Column order for the people frame: title identifiers, then the person-level
# fields, then the remaining title attributes. The tconst join key is left out.
columnas_personas = [
    'id', 'imdb_id', 'title',
    'nconst', 'ordering', 'category', 'job', 'characters',
    'primaryTitle', 'originalTitle', 'titleType', 'year', 'startYear',
    'isAdult', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes',
    'directors', 'writers',
]
df_personas2 = df_personas1.loc[:, columnas_personas]
df_personas2.head()

Unnamed: 0,id,imdb_id,title,nconst,ordering,category,job,characters,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,11001921,tt26683420,John Candy: I Like Me,nm0000158,1.0,actor,\N,\N,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
1,11001921,tt26683420,John Candy: I Like Me,nm0001006,2.0,archive_footage,\N,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
2,11001921,tt26683420,John Candy: I Like Me,nm0000195,3.0,self,\N,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
3,11001921,tt26683420,John Candy: I Like Me,nm0000188,4.0,actor,\N,\N,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
4,11001921,tt26683420,John Candy: I Like Me,nm0000316,5.0,self,\N,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]


### Arreglar valores nulos y columnas

In [27]:
# Map every missing-value marker (the literal \N string, NaN and None) to pd.NA.
marcadores_nulos = ['\\N', np.nan, None]
df_personas3 = df_personas2.replace(to_replace=marcadores_nulos, value=pd.NA)
df_personas3.head()

Unnamed: 0,id,imdb_id,title,nconst,ordering,category,job,characters,primaryTitle,originalTitle,titleType,year,startYear,isAdult,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,11001921,tt26683420,John Candy: I Like Me,nm0000158,1.0,actor,,,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
1,11001921,tt26683420,John Candy: I Like Me,nm0001006,2.0,archive_footage,,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
2,11001921,tt26683420,John Candy: I Like Me,nm0000195,3.0,self,,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
3,11001921,tt26683420,John Candy: I Like Me,nm0000188,4.0,actor,,,John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]
4,11001921,tt26683420,John Candy: I Like Me,nm0000316,5.0,self,,"[""Self""]",John Candy: I Like Me,John Candy: I Like Me,movie,2025,2025,0,113,"[Biography, Comedy, Documentary]",7.5,78,[nm0004988],[]


In [28]:
# Inspect the column dtypes to see which person-level columns still need a cast.
df_personas3.dtypes

id                   int64
imdb_id             object
title               object
nconst              object
ordering            object
category            object
job                 object
characters          object
primaryTitle        object
originalTitle       object
titleType         category
year                 int64
startYear            Int64
isAdult              Int64
runtimeMinutes       Int64
genres              object
averageRating      Float64
numVotes             Int64
directors           object
writers             object
dtype: object

In [29]:
# ordering becomes a nullable integer and category a categorical.
tipos_personas = {
    'ordering': 'Int64',
    'category': 'category',
}
for columna, tipo in tipos_personas.items():
    df_personas3[columna] = df_personas3[columna].astype(tipo)

### Valores nulos

In [30]:
# Missing-value count per column of the people frame, keyed by column name.
resultados = {
    columna: df_personas3[columna].isna().sum()
    for columna in df_personas3.columns
}
resultados

{'id': np.int64(0),
 'imdb_id': np.int64(0),
 'title': np.int64(0),
 'nconst': np.int64(4),
 'ordering': np.int64(4),
 'category': np.int64(4),
 'job': np.int64(110882),
 'characters': np.int64(75591),
 'primaryTitle': np.int64(4),
 'originalTitle': np.int64(4),
 'titleType': np.int64(4),
 'year': np.int64(0),
 'startYear': np.int64(10),
 'isAdult': np.int64(4),
 'runtimeMinutes': np.int64(607),
 'genres': np.int64(0),
 'averageRating': np.int64(81),
 'numVotes': np.int64(81),
 'directors': np.int64(0),
 'writers': np.int64(0)}

In [31]:
# Final name for the people frame. This binds a second name to the same object
# (no copy is made).
dfpersonas = df_personas3

In [33]:
# Persist the people frame to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is written as
# an unnamed first column, and the list-valued columns (genres, directors, writers)
# are stored as their string representation -- confirm downstream readers expect this.
dfpersonas.to_csv('data/personas_prime.csv')