Transformaciones

In [1]:
import ast
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Cargamos el archivo CSV original
df = pd.read_csv('Datos\movies_dataset.csv', low_memory=False)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",11000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,11,tt0076759,en,Star Wars,Princess Leia is captured and held hostage by ...,...,1977-05-25,775398007.0,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"A long time ago in a galaxy far, far away...",Star Wars,False,8.1,6778.0
1,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",18000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,1891,tt0080684,en,The Empire Strikes Back,"The epic saga continues as Luke Skywalker, in ...",...,1980-05-17,538400000.0,124.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Adventure Continues...,The Empire Strikes Back,False,8.2,5998.0
2,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",32350000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,1892,tt0086190,en,Return of the Jedi,As Rebel leaders map their strategy for an all...,...,1983-05-23,572700000.0,135.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Empire Falls...,Return of the Jedi,False,7.9,4763.0
3,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",115000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,1893,tt0120915,en,Star Wars: Episode I - The Phantom Menace,"Anakin Skywalker, a young slave strong with th...",...,1999-05-19,924317558.0,136.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Every generation has a legend. Every journey h...,Star Wars: Episode I - The Phantom Menace,False,6.4,4526.0
4,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",120000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,1894,tt0121765,en,Star Wars: Episode II - Attack of the Clones,"Ten years after the invasion of Naboo, the gal...",...,2002-05-15,649398328.0,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Jedi Shall Not Know Anger. Nor Hatred. Nor L...,Star Wars: Episode II - Attack of the Clones,False,6.4,4074.0


In [68]:
df.info()
df.isna().sum().sort_values()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45463 entries, 0 to 45462
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  bool   
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  int64  
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  int64  
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  float64
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

adult                        0
budget                       0
genres                       0
id                           0
original_title               0
video                        3
title                        3
spoken_languages             3
revenue                      3
production_countries         3
production_companies         3
vote_count                   3
popularity                   3
vote_average                 3
original_language           11
imdb_id                     17
status                      84
release_date                87
runtime                    260
poster_path                386
overview                   954
tagline                  25051
homepage                 37684
belongs_to_collection    40972
dtype: int64

In [69]:
# Empezamos limpieza
df.dropna(subset=['title'], inplace=True)
df.dropna(subset=['release_date'], inplace=True)
df.isna().sum().sort_values()


adult                        0
video                        0
title                        0
spoken_languages             0
revenue                      0
release_date                 0
production_countries         0
production_companies         0
vote_average                 0
popularity                   0
vote_count                   0
original_title               0
id                           0
genres                       0
budget                       0
original_language           11
imdb_id                     14
status                      80
runtime                    246
poster_path                339
overview                   941
tagline                  24978
homepage                 37610
belongs_to_collection    40888
dtype: int64

In [70]:
# Desanidado de las 5 columnas

for i, row in df.iterrows():
    belongs_to_collection = row['belongs_to_collection']
    if isinstance(belongs_to_collection, str): 
        try:
            dict_data = ast.literal_eval(belongs_to_collection)
            if isinstance(dict_data, dict):  
                df.loc[i, 'collection_id'] = dict_data.get('id', None)
                df.loc[i, 'collection_name'] = dict_data.get('name', None)
                df.loc[i, 'collection_poster_path'] = dict_data.get(
                    'poster_path', None)
        except (ValueError, SyntaxError): 
            pass

print(df)



       adult                              belongs_to_collection     budget   
0      False  {'id': 10, 'name': 'Star Wars Collection', 'po...   11000000  \
1      False  {'id': 10, 'name': 'Star Wars Collection', 'po...   18000000   
2      False  {'id': 10, 'name': 'Star Wars Collection', 'po...   32350000   
3      False  {'id': 10, 'name': 'Star Wars Collection', 'po...  115000000   
4      False  {'id': 10, 'name': 'Star Wars Collection', 'po...  120000000   
...      ...                                                ...        ...   
45457  False                                                NaN          0   
45459  False                                                NaN          0   
45460  False                                                NaN          0   
45461  False                                                NaN          0   
45462  False                                                NaN          0   

                                                  genres   
0  

In [71]:
# Desanidar las 4 columnas restantes

df['production_companies'] = df['production_companies'].apply(
    lambda x: ', '.join([str(i['name']) for i in eval(x.replace("False", "FALSE,"))] if isinstance(x, str) and x != "[]" and not isinstance(eval(x), bool) and len(eval(x)) > 0 else ''))

df['production_countries'] = df['production_countries'].apply(
    lambda x: ', '.join([str(i['name']) for i in eval(x.replace("False", "FALSE,"))] if isinstance(x, str) and x != "[]" and len(eval(x)) > 0 else ''))
df['genres'] = df['genres'].apply(
    lambda x: ', '.join([str(i['name']) for i in eval(x.replace("False", "FALSE,"))] if isinstance(x, str) and x != "[]" and len(eval(x)) > 0 else ''))

df['spoken_languages'] = df['spoken_languages'].apply(
    lambda x: ', '.join([str(i['name']) for i in eval(x.replace("False", "FALSE,"))] if isinstance(x, str) and x != "[]" and len(eval(x)) > 0 else ''))




In [72]:
# Transformacion y eliminacion de los datos de las restantes columnas

df[['revenue', 'budget']] = df[['revenue', 'budget']].fillna(0)

df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = df['release_date'].dt.strftime('%Y-%m-%d')
df['release_year'] = pd.DatetimeIndex(df['release_date']).year

df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['return'] = df['revenue'] / df['budget']
df['return'] = df['return'].fillna(0)
df['return'] = df['return'].replace([float('inf'), float('-inf')], 0)

df['belongs_to_collection'] = df['belongs_to_collection'].fillna('')
df['production_countries'] = df['production_countries'].fillna('')

df = df.drop(['video', 'imdb_id', 'adult', 'original_title',
             'vote_count', 'poster_path', 'homepage', "collection_id", "collection_poster_path", "belongs_to_collection"], axis=1)

df.to_csv('Datos\movies_data.csv', index=False)


In [86]:
# Ya tengo los datos para empezar con la Api 

datos = pd.read_csv("Datos\movies_data.csv", low_memory=False)
datos.info()
datos.isna().sum().sort_values()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45376 entries, 0 to 45375
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                45376 non-null  int64  
 1   genres                42992 non-null  object 
 2   id                    45376 non-null  int64  
 3   original_language     45365 non-null  object 
 4   overview              44435 non-null  object 
 5   popularity            45376 non-null  float64
 6   production_companies  33580 non-null  object 
 7   production_countries  39165 non-null  object 
 8   release_date          45376 non-null  object 
 9   revenue               45376 non-null  float64
 10  runtime               45130 non-null  float64
 11  spoken_languages      41485 non-null  object 
 12  status                45296 non-null  object 
 13  tagline               20398 non-null  object 
 14  title                 45376 non-null  object 
 15  vote_average       

budget                      0
vote_average                0
title                       0
release_year                0
release_date                0
revenue                     0
id                          0
popularity                  0
return                      0
original_language          11
status                     80
runtime                   246
overview                  941
genres                   2384
spoken_languages         3891
production_countries     6211
production_companies    11796
tagline                 24978
collection_name         40888
dtype: int64

In [87]:
datos.head()

Unnamed: 0,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,collection_name,release_year,return
0,11000000,"Adventure, Action, Science Fiction",11,en,Princess Leia is captured and held hostage by ...,42.149697,"Lucasfilm, Twentieth Century Fox Film Corporation",United States of America,1977-05-25,775398007.0,121.0,English,Released,"A long time ago in a galaxy far, far away...",Star Wars,8.1,Star Wars Collection,1977,70.490728
1,18000000,"Adventure, Action, Science Fiction",1891,en,"The epic saga continues as Luke Skywalker, in ...",19.470959,"Lucasfilm, Twentieth Century Fox Film Corporation",United States of America,1980-05-17,538400000.0,124.0,English,Released,The Adventure Continues...,The Empire Strikes Back,8.2,Star Wars Collection,1980,29.911111
2,32350000,"Adventure, Action, Science Fiction",1892,en,As Rebel leaders map their strategy for an all...,14.586087,"Lucasfilm, Twentieth Century Fox Film Corporation",United States of America,1983-05-23,572700000.0,135.0,English,Released,The Empire Falls...,Return of the Jedi,7.9,Star Wars Collection,1983,17.703246
3,115000000,"Adventure, Action, Science Fiction",1893,en,"Anakin Skywalker, a young slave strong with th...",15.649091,Lucasfilm,United States of America,1999-05-19,924317558.0,136.0,English,Released,Every generation has a legend. Every journey h...,Star Wars: Episode I - The Phantom Menace,6.4,Star Wars Collection,1999,8.037544
4,120000000,"Adventure, Action, Science Fiction",1894,en,"Ten years after the invasion of Naboo, the gal...",14.072511,Lucasfilm,United States of America,2002-05-15,649398328.0,142.0,English,Released,A Jedi Shall Not Know Anger. Nor Hatred. Nor L...,Star Wars: Episode II - Attack of the Clones,6.4,Star Wars Collection,2002,5.411653


In [89]:
# Ahora voy a hacer otra copia para ML 

datos.dropna(subset=['genres'], inplace=True)
datos.dropna(subset=['original_language'], inplace=True)
datos.to_csv('Datos\movies_dataML.csv', index=False)



In [107]:
datosml = pd.read_csv('Datos\movies_dataML.csv', low_memory=False)
datosml.isna().sum().sort_values()


budget                      0
vote_average                0
title                       0
release_year                0
release_date                0
revenue                     0
original_language           0
id                          0
genres                      0
popularity                  0
return                      0
status                     50
runtime                   176
overview                  693
spoken_languages         2761
production_countries     4671
production_companies     9753
tagline                 22844
collection_name         38561
dtype: int64