# Extraccion y transformacion de los datasets

### 1. Extraccion de los datos en dataframes

In [7]:
# Importar las librerias correspondientes
import pandas as pd
import ast
import json
import numpy as np

# Extraer los datos del dataset
movies_df = pd.read_csv('Datasets/movies_dataset.csv', low_memory=False)
credits_df = pd.read_csv('Datasets/credits.csv', low_memory=False)

# Realizar el cambio de csv a parquet
movies_df.to_parquet('Datasets/movies_dataset_parquet') 
credits_df.to_parquet('Datasets/credits_dataset_parquet')

# Extraer los datos del dataset en parquet
movies_df_parquet = pd.read_parquet('Datasets/movies_dataset_parquet')
credits_df_parquet = pd.read_parquet('Datasets/credits_dataset_parquet')


In [8]:
# observacion de datos del dataframe movies_df_parquet
print(movies_df_parquet.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [9]:
# observacion de datos del dataframe credits_df_parquet
print(credits_df_parquet.head())        


                                                cast  \
0  [{'cast_id': 14, 'character': 'Woody (voice)',...   
1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   
2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   
3  [{'cast_id': 1, 'character': "Savannah 'Vannah...   
4  [{'cast_id': 1, 'character': 'George Banks', '...   

                                                crew     id  
0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862  
1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844  
2  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602  
3  [{'credit_id': '52fe44779251416c91011acb', 'de...  31357  
4  [{'credit_id': '52fe44959251416c75039ed7', 'de...  11862  


### 2. Eliminacion de columnas no utiles

In [10]:
# Eliminar las columnas
movies_df_parquet = movies_df_parquet.drop(columns=['adult','homepage','imdb_id','poster_path','original_title','video'])

# Guardar el DataFrame modificado en un nuevo archivo
movies_df_parquet.to_parquet('Datasets/movies_dataset_parquet', index=False)  

# Comprobar la eliminacion exitosa de las columnas
movies_df_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4494 non-null   object 
 1   budget                 45466 non-null  object 
 2   genres                 45466 non-null  object 
 3   id                     45466 non-null  object 
 4   original_language      45455 non-null  object 
 5   overview               44512 non-null  object 
 6   popularity             45461 non-null  object 
 7   production_companies   45463 non-null  object 
 8   production_countries   45463 non-null  object 
 9   release_date           45379 non-null  object 
 10  revenue                45460 non-null  float64
 11  runtime                45203 non-null  float64
 12  spoken_languages       45460 non-null  object 
 13  status                 45379 non-null  object 
 14  tagline                20412 non-null  object 
 15  ti

### 3. Eliminacion o modificacion de datos nulos

In [11]:
# Rellenar los valores nulos de las columnas Revenue y budget por el numero cero (0)

movies_df_parquet['revenue'] = movies_df_parquet['revenue'].fillna(0)
movies_df_parquet['budget'] = movies_df_parquet['budget'].fillna(0)

# Eliminar las filas con valores nulos de la columna release_date
movies_df_parquet = movies_df_parquet.dropna(subset=['release_date'])


In [12]:
# comprobacion de la limpieza de datos nulos

print(movies_df_parquet['release_date'].isnull().sum())
print(movies_df_parquet['revenue'].isnull().sum())
print(movies_df_parquet['budget'].isnull().sum())

0
0
0


### 4. Limpieza de columnas con valores tipo Fecha

In [13]:
# Ordenamiento de las fechas de la columna release_date en un formato AAAA-mm-dd
movies_df_parquet['release_date'] = pd.to_datetime(movies_df_parquet['release_date'], format ='%Y-%m-%d', errors= 'coerce')

# Creacion de una columna llamada release_year tomando el año de la fecha de estreno 

movies_df_parquet['release_year'] = movies_df_parquet['release_date'].dt.year
movies_df_parquet.to_parquet('Datasets/movies_dataset_parquet', index=False)


#comprobar los cambios y agregaciones correspondientes

print(movies_df_parquet[['release_date', 'release_year']].head())
print(movies_df_parquet.info())

  release_date  release_year
0   1995-10-30        1995.0
1   1995-12-15        1995.0
2   1995-12-22        1995.0
3   1995-12-22        1995.0
4   1995-02-10        1995.0
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4491 non-null   object        
 1   budget                 45379 non-null  object        
 2   genres                 45379 non-null  object        
 3   id                     45379 non-null  object        
 4   original_language      45368 non-null  object        
 5   overview               44438 non-null  object        
 6   popularity             45377 non-null  object        
 7   production_companies   45379 non-null  object        
 8   production_countries   45379 non-null  object        
 9   release_date           45376 non-null  datetime64[ns]
 10  revenue  

### 5. Creacion de columnas calculadas

In [14]:
# Asegurar que los datos de las columnas revenue y budget sean numericos

movies_df_parquet['revenue'] = pd.to_numeric(movies_df_parquet['revenue'], errors='coerce')
movies_df_parquet['budget'] = pd.to_numeric(movies_df_parquet['budget'], errors='coerce')

# Creacion de la columna 'return' la cual va a calcular el retorno de la inversion entre las columnas 'revenue' y 'budget' usando numpy

movies_df_parquet['return'] = np.where(
    # condicion
    movies_df_parquet['budget'] > 0,
    # valor si la condicion es True
    movies_df_parquet['revenue']/movies_df_parquet['budget'],
    # valor si la condicion es False
    0
)

# guardar los datos en el dataset y comprobar los cambios y agregaciones correspondientes

movies_df_parquet.to_parquet('Datasets/movies_dataset_parquet', index=False)
print(movies_df_parquet [['revenue', 'budget', 'return']].head())
print(movies_df_parquet .info())

       revenue      budget     return
0  373554033.0  30000000.0  12.451801
1  262797249.0  65000000.0   4.043035
2          0.0         0.0   0.000000
3   81452156.0  16000000.0   5.090760
4   76578911.0         0.0   0.000000
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   belongs_to_collection  4491 non-null   object        
 1   budget                 45376 non-null  float64       
 2   genres                 45379 non-null  object        
 3   id                     45379 non-null  object        
 4   original_language      45368 non-null  object        
 5   overview               44438 non-null  object        
 6   popularity             45377 non-null  object        
 7   production_companies   45379 non-null  object        
 8   production_countries   45379 non-null  object        
 9   release_date  

### 6. Desanidado de Columnas del dataset movies

In [15]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna belongs_to_collection
movies_df_parquet ['belongs_to_collection'].unique()

array(["{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",
       None,
       "{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}",
       ...,
       "{'id': 148603, 'name': 'Ducobu Collection', 'poster_path': '/rd7AWZUy2QFPIblNWToVmdfXQcA.jpg', 'backdrop_path': '/7mzKmoIrvGapvsSbAVlX4HtCnFj.jpg'}",
       "{'id': 152918, 'name': 'Mister Blot Collection', 'poster_path': '/44PYEwwjGts8pAob59RHd6zlkKc.jpg', 'backdrop_path': '/5uoPsNiFpUYNamSGqE8okN27VRK.jpg'}",
       "{'id': 200641, 'name': 'Red Lotus Collection', 'poster_path': '/yf9Eod9ANXyHTzDpAxz9ezgvlL4.jpg', 'backdrop_path': '/3fhHbLeO3DqdHvgHg5szs399eBb.jpg'}"],
      dtype=object)

In [16]:
# Realizar una función para desanidar los diccionarios de la columna belongs_to_collection

def desanidar_belongs_to_collection(columna):
    """
    Desanida la columna 'belongs_to_collection' del DataFrame movies_df_parquet_modificado, extrayendo los valores
    de 'id' y 'name' de la columna 'belongs_to_collection' y creando nuevas columnas 'id_belongs_to_collection' y 
    'name_belongs_to_collection'.

    Parametros:
        columna: la columna 'belongs_to_collection' del DataFrame que contiene datos en formato diccionario.

    Retorna:
        Un DataFrame modificado con dos nuevas columnas:
            * 'id_belongs_to_collection': El id de la colección a la que pertenece la película.
            * 'name_belongs_to_collection': El nombre de la colección a la que pertenece la película.
    """

    if pd.isna(columna) or not isinstance(columna, str): 
        return None, None
    try:
        # Intentar convertir el string en un diccionario

        columna_dict = ast.literal_eval(columna)

        # Asegurarse de que sea un diccionario antes de extraer los valores

        if isinstance(columna_dict, dict):
            return columna_dict.get('id', None), columna_dict.get('name', None)
        else:
            return None, None
        
    # tener en cuenta los errores que pueden ocurrir 

    except (ValueError, SyntaxError):
        return None, None

# Aplicar la función para extraer 'id' y 'name'
movies_df_parquet ['id_belongs_to_collection'], movies_df_parquet ['name_belongs_to_collection'] = zip(*movies_df_parquet ['belongs_to_collection'].apply(desanidar_belongs_to_collection))


In [17]:
# Verificar la existencia y correcto guardado de las nuevas columnas

print(movies_df_parquet [['id_belongs_to_collection', 'name_belongs_to_collection']].head())
print(movies_df_parquet .info())

   id_belongs_to_collection      name_belongs_to_collection
0                   10194.0            Toy Story Collection
1                       NaN                            None
2                  119050.0       Grumpy Old Men Collection
3                       NaN                            None
4                   96871.0  Father of the Bride Collection
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   belongs_to_collection       4491 non-null   object        
 1   budget                      45376 non-null  float64       
 2   genres                      45379 non-null  object        
 3   id                          45379 non-null  object        
 4   original_language           45368 non-null  object        
 5   overview                    44438 non-null  object        
 6   popularity         

In [18]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna production_companies
print(movies_df_parquet ['production_companies'].unique())



["[{'name': 'Pixar Animation Studios', 'id': 3}]"
 "[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communications', 'id': 10201}]"
 "[{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]"
 ...
 "[{'name': 'Westdeutscher Rundfunk (WDR)', 'id': 7025}, {'name': 'Working Title Films', 'id': 10163}, {'name': '20th Century Fox Television', 'id': 16323}, {'name': 'CanWest Global Communications', 'id': 38978}]"
 "[{'name': 'Sine Olivia', 'id': 19653}]"
 "[{'name': 'Yermoliev', 'id': 88753}]"]


In [19]:
# Realizar una función para desanidar los diccionarios de la columna production_companies

def desanidar_production_companies(columna):
    """
    Desanida la columna 'production_companies', extrayendo los valores de 'id' y 'name' y creando
    nuevas columnas 'id_production_companies' y 'name_production_companies'.

    Parametros:
        columna: La columna 'production_companies' del DataFrame que contiene datos en formato lista de diccionarios.

    Retorna:
        Tuplas con 'id_production_companies' y 'name_production_companies'.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None
    
    try:
        columna_list = ast.literal_eval(columna)
        if isinstance(columna_list, list) and len(columna_list) > 0:
            return columna_list[0].get('id', None), columna_list[0].get('name', None)
        else:
            return None, None
    except (ValueError, SyntaxError):
        return None, None

# Aplicar la función para extraer las tuplas
movies_df_parquet ['id_production_companies'], movies_df_parquet ['name_production_companies'] = zip(*movies_df_parquet ['production_companies'].apply(desanidar_production_companies))


In [20]:
# Verificar la existencia y correcto guardado de las nuevas columnas

print(movies_df_parquet [['id_production_companies', 'name_production_companies']].head())
print(movies_df_parquet .info())

   id_production_companies               name_production_companies
0                      3.0                 Pixar Animation Studios
1                    559.0                        TriStar Pictures
2                   6194.0                            Warner Bros.
3                    306.0  Twentieth Century Fox Film Corporation
4                   5842.0                   Sandollar Productions
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   belongs_to_collection       4491 non-null   object        
 1   budget                      45376 non-null  float64       
 2   genres                      45379 non-null  object        
 3   id                          45379 non-null  object        
 4   original_language           45368 non-null  object        
 5   overview                    44438 non-nul

In [21]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna spoken_languages
print(movies_df_parquet ['spoken_languages'].unique())

["[{'iso_639_1': 'en', 'name': 'English'}]"
 "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]"
 "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'es', 'name': 'Español'}]"
 ...
 "[{'iso_639_1': 'sv', 'name': 'svenska'}, {'iso_639_1': 'de', 'name': 'Deutsch'}]"
 "[{'iso_639_1': 'ar', 'name': 'العربية'}, {'iso_639_1': 'pl', 'name': 'Polski'}]"
 "[{'iso_639_1': 'ff', 'name': 'Fulfulde'}, {'iso_639_1': 'en', 'name': 'English'}]"]


In [22]:
# Realizar una función para desanidar los diccionarios de la columna spoken_languages

def desanidar_spoken_languages(columna):
    """
    Desanida la columna 'spoken_languages', extrayendo los valores de 'iso_639_1' y 'name' y creando
    nuevas columnas 'iso_poken_language' y 'name_spoken_language'.

    Parametros:
        columna (pd.Series): Columna del DataFrame que contiene datos en formato lista de diccionarios.

    Retorna:
        Tuplas con 'iso_poken_language' y 'name_spoken_language'.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None
    
    try:
        columna_list = ast.literal_eval(columna)
        if isinstance(columna_list, list) and len(columna_list) > 0:
            return columna_list[0].get('iso_639_1', None), columna_list[0].get('name', None)
        else:
            return None, None
    except (ValueError, SyntaxError):
        return None, None

# Aplicar la función
movies_df_parquet ['iso_poken_language'], movies_df_parquet ['name_spoken_language'] = zip(*movies_df_parquet ['spoken_languages'].apply(desanidar_spoken_languages))

In [23]:
# Verificar la existencia y correcto guardado de las nuevas columnas
print(movies_df_parquet [['iso_poken_language', 'name_spoken_language']].head())
print(movies_df_parquet .info())

  iso_poken_language name_spoken_language
0                 en              English
1                 en              English
2                 en              English
3                 en              English
4                 en              English
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   belongs_to_collection       4491 non-null   object        
 1   budget                      45376 non-null  float64       
 2   genres                      45379 non-null  object        
 3   id                          45379 non-null  object        
 4   original_language           45368 non-null  object        
 5   overview                    44438 non-null  object        
 6   popularity                  45377 non-null  object        
 7   production_companies        45379 non-null  object        
 8  

In [24]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna production_countries
print(movies_df_parquet ['production_countries'].unique())

["[{'iso_3166_1': 'US', 'name': 'United States of America'}]"
 "[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]"
 "[{'iso_3166_1': 'GB', 'name': 'United Kingdom'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]"
 ...
 "[{'iso_3166_1': 'PL', 'name': 'Poland'}, {'iso_3166_1': 'CZ', 'name': 'Czech Republic'}, {'iso_3166_1': 'SK', 'name': 'Slovakia'}]"
 "[{'iso_3166_1': 'CU', 'name': 'Cuba'}, {'iso_3166_1': 'DE', 'name': 'Germany'}, {'iso_3166_1': 'ES', 'name': 'Spain'}]"
 "[{'iso_3166_1': 'EG', 'name': 'Egypt'}, {'iso_3166_1': 'IT', 'name': 'Italy'}, {'iso_3166_1': 'US', 'name': 'United States of America'}]"]


In [25]:
# Crear una función para desanidar los diccionarios de la columna production_countries

def desanidar_production_countries(columna):
    """
    Desanida la columna 'production_countries', extrayendo los valores de 'iso_3166_1' y 'name' y creando
    nuevas columnas 'iso_production_countries' y 'name_production_countries'.

    Parametros:
        columna (pd.Series): Columna del DataFrame que contiene datos en formato lista de diccionarios.

    Retorna:
        Tuplas con 'iso_country' y 'name_country'.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None
    
    try:
        columna_list = ast.literal_eval(columna)
        if isinstance(columna_list, list) and len(columna_list) > 0:
            return columna_list[0].get('iso_3166_1', None), columna_list[0].get('name', None)
        else:
            return None, None
    except (ValueError, SyntaxError):
        return None, None

# Aplicar la función
movies_df_parquet ['iso_production_countries'], movies_df_parquet ['name_production_countries'] = zip(*movies_df_parquet ['production_countries'].apply(desanidar_production_countries))

In [26]:
# Verificar la existencia y correcto guardado de las nuevas columnas
print(movies_df_parquet [['iso_production_countries', 'name_production_countries']].head())
print(movies_df_parquet .info())



  iso_production_countries name_production_countries
0                       US  United States of America
1                       US  United States of America
2                       US  United States of America
3                       US  United States of America
4                       US  United States of America
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   belongs_to_collection       4491 non-null   object        
 1   budget                      45376 non-null  float64       
 2   genres                      45379 non-null  object        
 3   id                          45379 non-null  object        
 4   original_language           45368 non-null  object        
 5   overview                    44438 non-null  object        
 6   popularity                  45377 non-null  object        
 7

In [27]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna genres
print(movies_df_parquet ['genres'].unique())

["[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]" ...
 "[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name': 'Comedy'}, {'id': 28, 'name': 'Action'}, {'id': 10751, 'name': 'Family'}]"
 "[{'id': 28, 'name': 'Action'}, {'id': 9648, 'name': 'Mystery'}, {'id': 53, 'name': 'Thriller'}, {'id': 27, 'name': 'Horror'}]"
 "[{'id': 10751, 'name': 'Family'}, {'id': 16, 'name': 'Animation'}, {'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]"]


In [28]:
# Crear una función para desanidar los diccionarios de la columna genres
def desanidar_genres(columna):
    """
    Desanida la columna 'genres', extrayendo el primer valor de 'id' y 'name' y creando
    nuevas columnas 'id_genre' y 'name_genre'.

    Parametros:
        columna (pd.Series): Columna del DataFrame que contiene datos en formato lista de diccionarios.

    Retorna:
        Tuplas con 'id_genre' y 'name_genre'.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None
    
    try:
        columna_list = ast.literal_eval(columna)
        if isinstance(columna_list, list) and len(columna_list) > 0:
            return columna_list[0].get('id', None), columna_list[0].get('name', None)
        else:
            return None, None
    except (ValueError, SyntaxError):
        return None, None

# Aplicar la función
movies_df_parquet ['id_genre'], movies_df_parquet ['name_genre'] = zip(*movies_df_parquet ['genres'].apply(desanidar_genres))

In [29]:
# Verificar la existencia de las nuevas columnas
print(movies_df_parquet [['iso_production_countries', 'name_production_countries']].head())
print(movies_df_parquet .info())


  iso_production_countries name_production_countries
0                       US  United States of America
1                       US  United States of America
2                       US  United States of America
3                       US  United States of America
4                       US  United States of America
<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   belongs_to_collection       4491 non-null   object        
 1   budget                      45376 non-null  float64       
 2   genres                      45379 non-null  object        
 3   id                          45379 non-null  object        
 4   original_language           45368 non-null  object        
 5   overview                    44438 non-null  object        
 6   popularity                  45377 non-null  object        
 7

### 7. Eliminacion de las columnas desanidadas

In [30]:

# # Eliminar las columnas desanidadas
movies_df_parquet = movies_df_parquet.drop(columns=['belongs_to_collection','production_companies','production_countries','spoken_languages','genres'])

# Guardar el dataframe en su respectivo dataset
movies_df_parquet .to_parquet('Datasets/movies_dataset_parquet', index=False) 

# Verificar la eliminacion exitosa
print(movies_df_parquet .info())

<class 'pandas.core.frame.DataFrame'>
Index: 45379 entries, 0 to 45465
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   budget                      45376 non-null  float64       
 1   id                          45379 non-null  object        
 2   original_language           45368 non-null  object        
 3   overview                    44438 non-null  object        
 4   popularity                  45377 non-null  object        
 5   release_date                45376 non-null  datetime64[ns]
 6   revenue                     45379 non-null  float64       
 7   runtime                     45130 non-null  float64       
 8   status                      45296 non-null  object        
 9   tagline                     20398 non-null  object        
 10  title                       45376 non-null  object        
 11  vote_average                45376 non-null  float64       


### 8. Desanidado de Columnas del dataset credits

In [31]:
# Revisar las columnas y tipo de datos
print(credits_df_parquet.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB
None


In [32]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna cast
print(credits_df_parquet['cast'].unique())

["[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4

In [33]:
# Realizar una funcion para desanidar la columna cast

def desanidar_columna_cast(columna):
    """
    Desanida la columna 'cast' del DataFrame 'credits_df_parquet', que contiene información en listas de diccionarios.
    La función extrae los valores de 'id', 'name', 'character', 'gender', y 'profile_path' de la primera entrada en la lista 
    de cada fila y crea nuevas columnas en el DataFrame con estos valores.

    Parámetros:
        columna: Una columna del DataFrame que tiene una lista de diccionarios.

    Retorna:
        Una tupla con los valores extraídos: cast_id, character, credit_id, gender, id, name y order.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None, None, None, None, None, None

    try:
        lista_dicts = ast.literal_eval(columna)

        if isinstance(lista_dicts, list) and lista_dicts and isinstance(lista_dicts[0], dict):
            # Extraer los primeros valores de la lista para simplificar
            first_entry = lista_dicts[0]
            return (
                first_entry.get('cast_id', None),
                first_entry.get('character', None),
                first_entry.get('credit_id', None),
                first_entry.get('gender', None),
                first_entry.get('id', None),
                first_entry.get('name', None),
                first_entry.get('order', None),
            )
        else:
            return None, None, None, None, None, None, None
    except (ValueError, SyntaxError):
        return None, None, None, None, None, None, None

# Aplicar la función para extraer 'cast_id', 'character', 'credit_id', 'gender', 'id', 'name' y 'order' de 'cast'
credits_df_parquet[['cast_id', 'cast_character', 'cast_credit_id', 'cast_gender', 'cast_id_actor', 'cast_name_actor', 'cast_order_credit']] = credits_df_parquet['cast'].apply(desanidar_columna_cast).apply(pd.Series)




In [34]:
# Verificar la existencia de las nuevas columnas
print(credits_df_parquet[['cast_id', 'cast_character', 'cast_credit_id', 'cast_gender', 'cast_id_actor', 'cast_name_actor', 'cast_order_credit']].head())
print(credits_df_parquet.info())

   cast_id             cast_character            cast_credit_id  cast_gender  \
0     14.0              Woody (voice)  52fe4284c3a36847f8024f95          2.0   
1      1.0               Alan Parrish  52fe44bfc3a36847f80a7c73          2.0   
2      2.0                Max Goldman  52fe466a9251416c75077a8d          2.0   
3      1.0  Savannah 'Vannah' Jackson  52fe44779251416c91011aad          1.0   
4      1.0               George Banks  52fe44959251416c75039eb9          2.0   

   cast_id_actor  cast_name_actor  cast_order_credit  
0           31.0        Tom Hanks                0.0  
1         2157.0   Robin Williams                0.0  
2         6837.0   Walter Matthau                0.0  
3         8851.0  Whitney Houston                0.0  
4        67773.0     Steve Martin                0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------

In [35]:
# Confirmar los tipos de datos que puedo encontrar dentro de la columna cast
print(credits_df_parquet['crew'].unique())

['[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'

In [36]:
# Realizar una funcion para desanidar la columna crew

def desanidar_columna_crew(columna):
    """
    Desanida la columna 'crew' del DataFrame 'credits_df_parquet', que contiene información en listas de diccionarios.
    La función extrae los valores de 'credit_id', 'department', 'gender', 'id', 'job', 'name', y 'profile_path' 
    de la primera entrada en la lista de cada fila y crea nuevas columnas en el DataFrame con estos valores.

    Parámetros:
        columna: Una columna del DataFrame que tiene una lista de diccionarios.

    Retorna:
        Una tupla con los valores extraídos: credit_id, department, gender, id, job, name y profile_path.
    """
    if pd.isna(columna) or not isinstance(columna, str):
        return None, None, None, None, None, None, None

    try:
        lista_dicts = ast.literal_eval(columna)

        if isinstance(lista_dicts, list) and lista_dicts and isinstance(lista_dicts[0], dict):
            # Extraer los primeros valores de la lista para simplificar
            first_entry = lista_dicts[0]
            return (
                first_entry.get('credit_id', None),
                first_entry.get('department', None),
                first_entry.get('gender', None),
                first_entry.get('id', None),
                first_entry.get('job', None),
                first_entry.get('name', None),
                first_entry.get('profile_path', None),
            )
        else:
            return None, None, None, None, None, None, None
    except (ValueError, SyntaxError):
        return None, None, None, None, None, None, None

# Aplicar la función para extraer 'credit_id', 'department', 'gender', 'id', 'job', 'name' y 'profile_path' de 'crew'
credits_df_parquet[['crew_credit_id', 'crew_department', 'crew_gender', 'crew_id_member', 'crew_job', 'crew_name_member', 'crew_profile_path_member']] = credits_df_parquet['crew'].apply(desanidar_columna_crew).apply(pd.Series)



In [37]:
# Verificar las primeras filas con las nuevas columnas

print(credits_df_parquet[['crew_credit_id', 'crew_department', 'crew_gender', 'crew_id_member', 'crew_job', 'crew_name_member', 'crew_profile_path_member']].head())
print(credits_df_parquet.info())

             crew_credit_id crew_department  crew_gender  crew_id_member  \
0  52fe4284c3a36847f8024f49       Directing          2.0          7879.0   
1  52fe44bfc3a36847f80a7cd1      Production          2.0           511.0   
2  52fe466a9251416c75077a89       Directing          2.0         26502.0   
3  52fe44779251416c91011acb       Directing          2.0          2178.0   
4  52fe44959251416c75039ed7           Sound          2.0            37.0   

                  crew_job crew_name_member          crew_profile_path_member  
0                 Director    John Lasseter  /7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg  
1       Executive Producer  Larry J. Franco                              None  
2                 Director    Howard Deutch  /68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg  
3                 Director  Forest Whitaker  /4pMQkelS5lK661m9Kz3oIxLYiyS.jpg  
4  Original Music Composer   Alan Silvestri  /chEsfnDEtRmv1bfOaNAoVEzhCc6.jpg  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries

In [38]:

# Eliminar las columnas desanidadas
credits_df_parquet = credits_df_parquet.drop(columns=['cast','crew'])

# Guardar el dataframe en su respectivo dataset
credits_df_parquet.to_parquet('Datasets/credits_dataset_parquet', index=False)

# Verificar la eliminacion exitosa
print(credits_df_parquet.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        45476 non-null  int64  
 1   cast_id                   43058 non-null  float64
 2   cast_character            43058 non-null  object 
 3   cast_credit_id            43058 non-null  object 
 4   cast_gender               43058 non-null  float64
 5   cast_id_actor             43058 non-null  float64
 6   cast_name_actor           43058 non-null  object 
 7   cast_order_credit         43058 non-null  float64
 8   crew_credit_id            44705 non-null  object 
 9   crew_department           44705 non-null  object 
 10  crew_gender               44705 non-null  float64
 11  crew_id_member            44705 non-null  float64
 12  crew_job                  44705 non-null  object 
 13  crew_name_member          44705 non-null  object 
 14  crew_p

In [39]:
# Cambiar el nombre de la columna
credits_df_parquet.rename(columns={'id': 'id_credits'}, inplace=True)
# Guardar el dataframe en su respectivo dataset
credits_df_parquet.to_parquet('Datasets/credits_dataset_parquet', index=False)
# Cambiar el nombre de la columna
movies_df_parquet.rename(columns={'id': 'id_movies'}, inplace=True)
# Guardar el dataframe en su respectivo dataset
movies_df_parquet.to_parquet('Datasets/movies_dataset_parquet', index=False) 

