In [67]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # elimina los errres de que van a cambiar

# importamos las librerías que necesitamos
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import re

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames


In [49]:
df = pd.read_csv('DisneyMoviesDataset.csv', index_col=0)

### LIMPIEZA

In [50]:
# Borramos columnas innecesarias para el análisis
columnas_a_eliminar = ['Running time', 'metascore', 'rotten_tomatoes', 'Written by', 'Starring', 'Music by', 'Budget', 'Box office', 'Story by', 'Narrated by', 'Cinematography', 'Edited by', 'Screenplay by', 'Production companies', 'Adaptation by', 'Traditional', 'Simplified']
def eliminar_columnas(df, lista):
    """Drop the given columns from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, nothing is returned.
    lista : list-like of column labels
        Columns to remove. A label that is not present raises ``KeyError``.
    """
    df.drop(labels=lista, axis="columns", inplace=True)
eliminar_columnas(df,columnas_a_eliminar) 



In [51]:
# Take the first parenthesised fragment of 'Release date' as the raw release
# location; rows without parentheses are left as NaN by str.extract.
df['Release location'] = df['Release date'].str.extract(r'\(([^)]+)\)', expand=False)

In [52]:
# Función para limpiar cada elemento de la columna release_location
def clean_release_location(location):
    """Normalise one raw release-location string.

    Removes the words "Premiere" / "World Premiere" (case-insensitive),
    drops commas, removes leading hyphens and trims surrounding whitespace.

    Parameters
    ----------
    location : str or NaN
        Raw text extracted from the release-date field.

    Returns
    -------
    str or float
        The cleaned location, or ``np.nan`` when the input is missing or
        nothing is left after cleaning.
    """
    if pd.isna(location):
        return np.nan

    # With IGNORECASE and the trailing word boundary, these two alternatives
    # cover every variant previously listed (the "Premiere-" / "Premiere:"
    # forms could never match on their own: the plain word matched first).
    location = re.sub(r'\b(?:World Premiere|Premiere)\b', '', location, flags=re.IGNORECASE)

    # Drop commas
    location = location.replace(',', '')
    # Trim whitespace BEFORE removing leading hyphens: removing "World Premiere"
    # from "World Premiere - X" leaves " - X", where the hyphen is not the
    # first character until the leading space is gone.
    location = location.strip().lstrip('-').strip()
    # Nothing left after cleaning counts as missing
    if not location:
        return np.nan
    return location
# Aplicar la función clean_release_location a cada fila de la columna release_location
df['release_location'] = df['Release location'].apply(clean_release_location)

In [53]:
# Drop the raw 'Release date' text and the intermediate 'Release location'
# column; only the cleaned 'release_location' is kept.
df = df.drop(columns='Release date')
df = df.drop(columns='Release location')

In [54]:
def cambiar_cabeceras(df):
    """Rename the columns of ``df`` in place to snake_case.

    Each header is lower-cased, spaces become underscores, and the type
    suffixes ``_(int)``, ``_(float)`` and ``_(datetime)`` are removed.
    Nothing is returned.
    """
    sufijos_de_tipo = ('_(int)', '_(float)', '_(datetime)')
    nuevas_cabeceras = []
    for cabecera in df.columns:
        nombre = cabecera.lower().replace(' ', '_')
        for sufijo in sufijos_de_tipo:
            nombre = nombre.replace(sufijo, '')
        nuevas_cabeceras.append(nombre)
    df.columns = nuevas_cabeceras

cambiar_cabeceras(df)



In [55]:
df.head(1)

Unnamed: 0,title,production_company,country,language,running_time,budget,box_office,release_date,imdb,directed_by,produced_by,based_on,distributed_by,release_location
0,Academy Award Review of,Walt Disney Productions,United States,English,41.0,,,1937-05-19,7.2,,,,,


In [56]:
# Columns whose cells hold a stringified list (e.g. "['A', 'B']") and must be
# expanded to one row per element.
columns_to_explode = ['production_company', 'country', 'language']

# Each explode multiplies the rows of a film by the number of elements in that
# column, so the index ends up with repeated labels.
for column in columns_to_explode:
    if column not in df.columns:
        continue
    raw = df[column]
    # Work on the textual form so non-string cells do not break the .str
    # accessors, but put NaN back where the source value was missing: a bare
    # astype(str) would turn NaN into the literal text 'nan' and hide those
    # rows from any later null check.
    cleaned = raw.astype(str).str.strip("[]").str.replace("'", "").where(raw.notna())
    # One list per cell (missing cells stay NaN)
    df[column] = cleaned.str.split(", ")
    # One row per list element
    df = df.explode(column)
    # Trim whitespace around each element
    df[column] = df[column].str.strip()

# Report the shape of the resulting DataFrame
print(f"Forma después de aplicar explode a todas las columnas: {df.shape}")

# Show the first rows of the resulting DataFrame
display(df.head())



Forma después de aplicar explode a todas las columnas: (974, 14)


Unnamed: 0,title,production_company,country,language,running_time,budget,box_office,release_date,imdb,directed_by,produced_by,based_on,distributed_by,release_location
0,Academy Award Review of,Walt Disney Productions,United States,English,41.0,,,1937-05-19,7.2,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,United States,English,83.0,1490000.0,418000000.0,1937-12-21,7.6,"['David Hand (supervising)', 'William Cottrell...",Walt Disney,"['Snow White', 'by The', 'Brothers Grimm']",RKO Radio Pictures,Carthay Circle Theatre Los Angeles CA
2,Pinocchio,Walt Disney Productions,United States,English,88.0,2600000.0,164000000.0,1940-02-07,7.4,"['Ben Sharpsteen', 'Hamilton Luske', 'Bill Rob...",Walt Disney,"['The Adventures of Pinocchio', 'by', 'Carlo C...",RKO Radio Pictures,Center Theatre
3,Fantasia,Walt Disney Productions,United States,English,126.0,2280000.0,83300000.0,1940-11-13,7.8,"['Samuel Armstrong', 'James Algar', 'Bill Robe...","['Walt Disney', 'Ben Sharpsteen']",,"['Walt Disney Productions', 'RKO Radio Pictures']",
4,The Reluctant Dragon,Walt Disney Productions,United States,English,74.0,600000.0,960000.0,1941-06-20,6.9,"['Alfred Werker', '(live action)', 'Hamilton L...",Walt Disney,,RKO Radio Pictures,


In [57]:
def _primer_elemento(serie):
    """Return the first comma-separated item of each value, without list punctuation."""
    primero = serie.str.split(',').str[0]
    # Literal (non-regex) removal: '[' alone is not a valid regular expression.
    for caracter in ('"', '[', ']', "'"):
        primero = primero.str.replace(caracter, '', regex=False)
    return primero.str.strip()


def elegir_primer(df):
    """Reduce the people/company/source columns of ``df`` to a single clean value.

    Works in place on ``directed_by``, ``produced_by``, ``distributed_by`` and
    ``based_on`` and returns nothing.

    * ``directed_by``, ``produced_by``, ``distributed_by``: keep only the first
      element of a stringified list, strip quotes and brackets, and fill
      missing values with ``'Unknown'``. For ``directed_by`` the section
      labels ("Animated sequences:", "Supervising director") and the role
      notes "(supervising)", "(supervising director)", "(animation)" are
      removed as well.
    * ``based_on``: flatten the stringified list, remove Disney possessives,
      keep the text before the word "by", capitalise it, and fill missing
      values with ``'No based'``.
    """
    # --- directed_by -------------------------------------------------------
    directores = df['directed_by']
    # A leading section label would otherwise be taken as the first "name".
    for etiqueta in ("['Animated sequences:',", "['Supervising director:',", "['Supervising director',"):
        directores = directores.str.replace(etiqueta, '', regex=False)
    directores = _primer_elemento(directores)
    for rol in ('(supervising)', '(supervising director)', '(animation)'):
        directores = directores.str.replace(rol, '', regex=False)
    df['directed_by'] = directores.str.strip().fillna('Unknown')

    # --- produced_by / distributed_by --------------------------------------
    df['produced_by'] = _primer_elemento(df['produced_by']).fillna('Unknown')
    df['distributed_by'] = _primer_elemento(df['distributed_by']).fillna('Unknown')

    # --- based_on ----------------------------------------------------------
    basado = df['based_on']
    # Flatten "['A', 'by', 'B']" into "A by B": the "'," pairs go first so
    # that the items end up separated by a single space.
    for fragmento in ('"', '[', ']', "',", "'"):
        basado = basado.str.replace(fragmento, '', regex=False)
    basado = basado.str.replace("by,", 'by', regex=False).str.replace('\\', '', regex=False)
    # Longest possessive first; otherwise 'Disney s' is consumed and a stray
    # 'Walt' is left behind.
    for posesivo in ('Walt Disney s', 'Disney s', 'Disneys'):
        basado = basado.str.replace(posesivo, '', regex=False)
    basado = basado.str.strip().str.capitalize().fillna('No based')
    # Keep only the title: cut at the standalone word "by". A plain substring
    # split would also cut inside words such as "Goodbye".
    basado = basado.str.replace(r'(?s)\bby\b.*', '', regex=True)
    df['based_on'] = basado.str.strip().str.capitalize()
    

elegir_primer(df)

In [58]:
df.head()

Unnamed: 0,title,production_company,country,language,running_time,budget,box_office,release_date,imdb,directed_by,produced_by,based_on,distributed_by,release_location
0,Academy Award Review of,Walt Disney Productions,United States,English,41.0,,,1937-05-19,7.2,Unknown,Unknown,No based,Unknown,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,United States,English,83.0,1490000.0,418000000.0,1937-12-21,7.6,David Hand,Walt Disney,Snow white,RKO Radio Pictures,Carthay Circle Theatre Los Angeles CA
2,Pinocchio,Walt Disney Productions,United States,English,88.0,2600000.0,164000000.0,1940-02-07,7.4,Ben Sharpsteen,Walt Disney,The adventures of pinocchio,RKO Radio Pictures,Center Theatre
3,Fantasia,Walt Disney Productions,United States,English,126.0,2280000.0,83300000.0,1940-11-13,7.8,Samuel Armstrong,Walt Disney,No based,Walt Disney Productions,
4,The Reluctant Dragon,Walt Disney Productions,United States,English,74.0,600000.0,960000.0,1941-06-20,6.9,Alfred Werker,Walt Disney,No based,RKO Radio Pictures,


In [59]:
# Función para convertir columnas a millones
def convertir_a_millones(df, columnas):
    """Divide every column named in ``columnas`` by one million, in place.

    ``df`` is mutated and nothing is returned. Calling it twice on the same
    frame divides twice.
    """
    un_millon = 1e6
    for nombre in columnas:
        df[nombre] = df[nombre].div(un_millon)
# Llamada a la función
columnas_a_convertir = ['budget', 'box_office']
convertir_a_millones(df, columnas_a_convertir)

In [60]:
df.head()

Unnamed: 0,title,production_company,country,language,running_time,budget,box_office,release_date,imdb,directed_by,produced_by,based_on,distributed_by,release_location
0,Academy Award Review of,Walt Disney Productions,United States,English,41.0,,,1937-05-19,7.2,Unknown,Unknown,No based,Unknown,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,United States,English,83.0,1.49,418.0,1937-12-21,7.6,David Hand,Walt Disney,Snow white,RKO Radio Pictures,Carthay Circle Theatre Los Angeles CA
2,Pinocchio,Walt Disney Productions,United States,English,88.0,2.6,164.0,1940-02-07,7.4,Ben Sharpsteen,Walt Disney,The adventures of pinocchio,RKO Radio Pictures,Center Theatre
3,Fantasia,Walt Disney Productions,United States,English,126.0,2.28,83.3,1940-11-13,7.8,Samuel Armstrong,Walt Disney,No based,Walt Disney Productions,
4,The Reluctant Dragon,Walt Disney Productions,United States,English,74.0,0.6,0.96,1941-06-20,6.9,Alfred Werker,Walt Disney,No based,RKO Radio Pictures,


In [61]:
df.shape

(974, 14)

In [62]:
# Share of missing values per column, as a percentage rounded to two decimals
print("Porcentaje de valores nulos por columna:")
display(df.isnull().sum().div(len(df)).mul(100).round(2))
print("-" * 40)

Porcentaje de valores nulos por columna:


title                  0.00
production_company     0.00
country                0.00
language               0.00
running_time           2.77
budget                25.87
box_office            14.37
release_date           2.46
imdb                   4.00
directed_by            0.00
produced_by            0.00
based_on               0.00
distributed_by         0.00
release_location      48.46
dtype: float64

----------------------------------------


In [65]:
#Gestion nulos variables categoricas
def variables_categoricas(df):
    """Fill missing values in ``release_date`` and ``release_location`` in place.

    * ``release_date`` is parsed to datetime (unparseable values become NaT)
      and missing dates are replaced by the sentinel ``1900-01-01``.
    * ``release_location`` missing values are replaced by ``'Unknown'``.

    Several checks are printed/displayed along the way. ``display`` is the
    IPython/Jupyter built-in, so the function is meant to run in a notebook.
    Nothing is returned.
    """
    # Report the object-typed columns that contain nulls (before any filling)
    nulos_categoricas = df[df.columns[df.isnull().any()]].select_dtypes(include="O").columns
    print("Las columnas categóricas que tienen nulos son : \n")
    print(nulos_categoricas)

    # Make sure 'release_date' is datetime; invalid values become NaT
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    # Replace missing dates with the sentinel 1900-01-01. Assign the result
    # back instead of calling fillna(inplace=True) on df['release_date']: that
    # chained in-place call works on a temporary and leaves df untouched
    # under pandas Copy-on-Write.
    df['release_date'] = df['release_date'].fillna(pd.to_datetime('1900-01-01'))

    # Replace missing release locations with 'Unknown'
    df['release_location'] = df['release_location'].fillna('Unknown')

    # Show how often each date occurs (the sentinel count is the number of filled rows)
    value_counts = df['release_date'].value_counts()
    display("Conteo de fechas en 'release_date':")
    display(value_counts)

    # Confirm that no nulls remain
    nulos_restantes = df['release_date'].isnull().sum()
    display(f"Valores nulos restantes en release_date: {nulos_restantes}")

    # First rows of the column, to inspect the result
    display("Primeras filas de 'release_date':")
    display(df['release_date'].head(10))

    # Random sample of the column; capped at the frame length because
    # sample(10) raises on frames with fewer than 10 rows
    display("Muestra aleatoria de 'release_date':")
    display(df['release_date'].sample(min(10, len(df))))

#Llamada a la funcion
variables_categoricas(df)


Las columnas categóricas que tienen nulos son : 

Index([], dtype='object')


"Conteo de fechas en 'release_date':"

release_date
2016-08-12    36
1900-01-01    24
2011-04-22    20
2004-06-13    16
2007-10-10    16
              ..
1973-11-08     1
1973-06-20     1
1973-03-23     1
1973-02-01     1
1937-05-19     1
Name: count, Length: 422, dtype: int64

'Valores nulos restantes en release_date: 0'

"Primeras filas de 'release_date':"

0   1937-05-19
1   1937-12-21
2   1940-02-07
3   1940-11-13
4   1941-06-20
5   1941-10-23
6   1942-08-09
7   1942-08-24
7   1942-08-24
7   1942-08-24
Name: release_date, dtype: datetime64[ns]

"Muestra aleatoria de 'release_date':"

308   2008-03-07
37    1958-07-09
64    1963-06-01
336   2010-10-08
399   2018-02-26
330   2009-10-17
209   1996-02-16
156   1983-03-11
237   1999-12-17
315   2008-12-25
Name: release_date, dtype: datetime64[ns]

In [66]:
def variables_numericas(dg, ruta_csv="DisneyMoviesDataset_nonulls.csv"):
    """Impute nulls in the numeric columns of ``dg`` and save the result as CSV.

    The frame passed as ``dg`` is the one that is processed; the function no
    longer reads a global ``df``. The parameter keeps its original name so
    existing calls continue to work.

    Parameters
    ----------
    dg : pandas.DataFrame
        Frame with the columns ``running_time``, ``imdb``, ``budget`` and
        ``box_office``. It is modified in place.
    ruta_csv : str, optional
        Path of the CSV file to write (without the index). Defaults to the
        file name that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of the imputed frame.
    """
    # Report the numeric columns that contain nulls (before any filling)
    nulos_numericas = dg[dg.columns[dg.isnull().any()]].select_dtypes(include=np.number).columns
    print("Las columnas numéricas que tienen nulos son : \n")
    print(nulos_numericas)

    # Percentage of nulls in those columns
    print(dg[nulos_numericas].isnull().sum() / dg.shape[0] * 100)

    # Fill running_time and imdb with each column's own median
    columnas_mediana = ['running_time', 'imdb']
    mediana = dg[columnas_mediana].median()
    dg[columnas_mediana] = dg[columnas_mediana].fillna(mediana)

    # Check that the median-filled columns have no nulls left
    print("\nDespués de reemplazar nulos con la mediana:")
    print(dg[columnas_mediana].isnull().sum())

    # Fill budget and box_office with 0.
    # NOTE(review): after this, 0 means "unknown", not "zero"; keep that in
    # mind when averaging these columns.
    columnas_fillna_0 = ['budget', 'box_office']
    dg[columnas_fillna_0] = dg[columnas_fillna_0].fillna(0)

    # Check that the zero-filled columns have no nulls left
    print("\nDespués de reemplazar nulos con 0:")
    print(dg[columnas_fillna_0].isnull().sum())

    # Return a copy so later changes to the result do not touch the input
    df_copia = dg.copy()

    # Save the imputed frame to CSV without the index
    df_copia.to_csv(ruta_csv, index=False)

    return df_copia

# Imputar valores nulos en variables numéricas
df_final = variables_numericas(df)
df_final.isnull().sum()

Las columnas numéricas que tienen nulos son : 

Index([], dtype='object')
Series([], dtype: float64)

Después de reemplazar nulos con la mediana:
running_time    0
imdb            0
dtype: int64

Después de reemplazar nulos con 0:
budget        0
box_office    0
dtype: int64


title                 0
production_company    0
country               0
language              0
running_time          0
budget                0
box_office            0
release_date          0
imdb                  0
directed_by           0
produced_by           0
based_on              0
distributed_by        0
release_location      0
dtype: int64

In [None]:
df = pd.read_csv('DisneyMoviesDataset_nonulls.csv')

In [69]:
df.head()

Unnamed: 0,title,production_company,country,language,running_time,budget,box_office,release_date,imdb,directed_by,produced_by,based_on,distributed_by,release_location
0,Academy Award Review of,Walt Disney Productions,United States,English,41.0,0.0,0.0,1937-05-19,7.2,Unknown,Unknown,No based,Unknown,Unknown
1,Snow White and the Seven Dwarfs,Walt Disney Productions,United States,English,83.0,1.49,418.0,1937-12-21,7.6,David Hand,Walt Disney,Snow white,RKO Radio Pictures,Carthay Circle Theatre Los Angeles CA
2,Pinocchio,Walt Disney Productions,United States,English,88.0,2.6,164.0,1940-02-07,7.4,Ben Sharpsteen,Walt Disney,The adventures of pinocchio,RKO Radio Pictures,Center Theatre
3,Fantasia,Walt Disney Productions,United States,English,126.0,2.28,83.3,1940-11-13,7.8,Samuel Armstrong,Walt Disney,No based,Walt Disney Productions,Unknown
4,The Reluctant Dragon,Walt Disney Productions,United States,English,74.0,0.6,0.96,1941-06-20,6.9,Alfred Werker,Walt Disney,No based,RKO Radio Pictures,Unknown
