In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import pylab as plt
import json
import time
import requests
import warnings
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
import plotly.express as px
from tqdm.autonotebook import tqdm
warnings.filterwarnings('ignore')


                Extracción, transformación y limpieza de datos del dataset de películas, series y documentales de Netflix

In [3]:
# Load the raw Netflix titles and credits tables; bytes that cannot be decoded
# as UTF-8 are dropped instead of raising.
film, actor = (
    pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')
    for csv_path in ('../data/Net_titles.csv', '../data/Net_credits.csv')
)

In [4]:
def fill_age_null(row):
    """Return the age certification for one title row, inferring 'R' when possible.

    When ``age_certification`` is missing and the row's ``genres`` mentions
    'thriller', 'horror' or 'black comedy', the title is assumed to be rated
    R (17+). In every other case the original value (possibly still missing)
    is returned unchanged.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) exposing the
            'age_certification' and 'genres' keys.

    Returns:
        'R' for the inferred case, otherwise ``row['age_certification']``.
    """
    if pd.isna(row['age_certification']):
        genres = row['genres']
        # A missing genres cell is a float NaN, and `in` on it raises TypeError;
        # only run the membership test on values that support it.
        if isinstance(genres, (str, list, tuple, set, frozenset)):
            if any(tag in genres for tag in ('thriller', 'horror', 'black comedy')):
                return 'R'
    return row['age_certification']

# Row-wise pass: each row is handed straight to fill_age_null.
film['age_certification'] = film.apply(fill_age_null, axis=1)

In [5]:
# Drop the rows with a null in any IMDb/TMDB column: without the imdb_id they
# cannot be analysed.
# Fill the remaining nulls with sentinels:
#   - age_certification -> 'unknown'
#   - seasons           -> -1 (numeric, so the column is not turned into a mix
#                          of numbers and the string '-1')
#   - description       -> 'unknown'
#   - character (in the `actor` table) -> 'unknown'

film.dropna(subset=['imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'], inplace=True)
film['age_certification'] = film['age_certification'].fillna('unknown')
film['seasons'] = film['seasons'].fillna(-1)
film['description'] = film['description'].fillna('unknown')
actor['character'] = actor['character'].fillna('unknown')



In [8]:
from joblib import Parallel, delayed
from tqdm import tqdm

def extract_reviews_and_ratings(imdb_id):
    """Scrape the review titles and user ratings from a title's IMDb reviews page.

    Args:
        imdb_id: IMDb title identifier, interpolated into the reviews URL.

    Returns:
        A list of ``(imdb_id, review_title, rating)`` tuples, one per review
        text block found in the page. ``review_title`` is 'No title' when the
        review has no title link and ``rating`` is 'Sin calificación' when it
        has no rating. An empty list is returned (after printing a message)
        when the request fails or the response status is not 200.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/reviews?ref_=tt_ov_rt'
    error_message = f'Error al obtener comentarios de la película con IMDB ID {imdb_id}'
    try:
        # The timeout keeps one stalled connection from blocking its worker forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # A network failure is reported like a bad status so the batch keeps going.
        print(error_message)
        return []
    if response.status_code != 200:
        print(error_message)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for rev in soup.find_all('div', {'class': 'text show-more__control'}):
        # Look up the title and the rating inside this review's own container.
        # Zipping two page-wide lists would shift ratings onto the wrong reviews
        # (and drop trailing reviews) as soon as one review has no rating.
        rev_div = rev.find_parent('div', {'class': 'lister-item-content'})
        title_tag = rev_div.find('a', {'class': 'title'}) if rev_div is not None else None
        rating_tag = (
            rev_div.find('span', {'class': 'rating-other-user-rating'})
            if rev_div is not None else None
        )
        rev_title = title_tag.text.strip() if title_tag is not None else 'No title'
        rat_title = rating_tag.text.strip() if rating_tag is not None else 'Sin calificación'
        results.append((imdb_id, rev_title, rat_title))
    return results

if __name__ == '__main__':
    # IMDb ids of every remaining title, as a plain Python list.
    imdb_ids = film['imdb_id'].tolist()

    # One delayed scraping call per id, executed by joblib with n_jobs=-1;
    # tqdm wraps the id list to show progress while the calls are handed out.
    reviews_and_ratings = Parallel(n_jobs=-1)(
        delayed(extract_reviews_and_ratings)(imdb_id)
        for imdb_id in tqdm(imdb_ids)
    )

    # Each call returned its own list of tuples: flatten them into one list.
    reviews_and_ratings = [
        review
        for sublist in reviews_and_ratings
        for review in sublist
    ]

    com_rev = pd.DataFrame(
        reviews_and_ratings,
        columns=['id', 'review_title', 'rating_title'],
    )



100%|██████████| 5131/5131 [14:34<00:00,  5.87it/s]


In [20]:
# NOTE(review): these exports are commented out, yet later cells read
# '../data/Net_comments.csv', '../data/Net_titles_clean.csv' and
# '../data/Net_actors_clean.csv'. On a fresh run those reads only work if the
# files already exist on disk; re-enable these lines to regenerate them.
#com_rev.to_csv('../data/Net_comments.csv', index=False)
#film.to_csv('../data/Net_titles_clean.csv', index=False)
#actor.to_csv('../data/Net_actors_clean.csv', index=False)

                Realizamos el mismo proceso con las tablas de HBO

In [24]:
# Load the raw HBO titles and credits tables (this rebinds `film` and `actor`);
# bytes that cannot be decoded as UTF-8 are dropped instead of raising.
film, actor = (
    pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')
    for csv_path in ('../data/HBO_titles.csv', '../data/HBO_credits.csv')
)

In [26]:
def fill_age_null(row):
    """Return the age certification for one title row, inferring 'R' when possible.

    When ``age_certification`` is missing and the row's ``genres`` mentions
    'thriller', 'horror' or 'black comedy', the title is assumed to be rated
    R (17+). In every other case the original value (possibly still missing)
    is returned unchanged.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) exposing the
            'age_certification' and 'genres' keys.

    Returns:
        'R' for the inferred case, otherwise ``row['age_certification']``.
    """
    if pd.isna(row['age_certification']):
        genres = row['genres']
        # A missing genres cell is a float NaN, and `in` on it raises TypeError;
        # only run the membership test on values that support it.
        if isinstance(genres, (str, list, tuple, set, frozenset)):
            if any(tag in genres for tag in ('thriller', 'horror', 'black comedy')):
                return 'R'
    return row['age_certification']

# Row-wise pass: each row is handed straight to fill_age_null.
film['age_certification'] = film.apply(fill_age_null, axis=1)

In [28]:
# Drop the rows with a null in any IMDb/TMDB column, then fill the remaining
# nulls with sentinels. `seasons` gets the numeric -1 (not the string '-1') so
# the column is not turned into a mix of numbers and strings.
film.dropna(subset=['imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'], inplace=True)
film['age_certification'] = film['age_certification'].fillna('unknown')
film['seasons'] = film['seasons'].fillna(-1)
film['description'] = film['description'].fillna('unknown')
actor['character'] = actor['character'].fillna('unknown')

In [31]:
from joblib import Parallel, delayed
from tqdm import tqdm

def extract_reviews_and_ratings(imdb_id):
    """Scrape the review titles and user ratings from a title's IMDb reviews page.

    Args:
        imdb_id: IMDb title identifier, interpolated into the reviews URL.

    Returns:
        A list of ``(imdb_id, review_title, rating)`` tuples, one per review
        text block found in the page. ``review_title`` is 'No title' when the
        review has no title link and ``rating`` is 'Sin calificación' when it
        has no rating. An empty list is returned (after printing a message)
        when the request fails or the response status is not 200.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/reviews?ref_=tt_ov_rt'
    error_message = f'Error al obtener comentarios de la película con IMDB ID {imdb_id}'
    try:
        # The timeout keeps one stalled connection from blocking its worker forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # A network failure is reported like a bad status so the batch keeps going.
        print(error_message)
        return []
    if response.status_code != 200:
        print(error_message)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for rev in soup.find_all('div', {'class': 'text show-more__control'}):
        # Look up the title and the rating inside this review's own container.
        # Zipping two page-wide lists would shift ratings onto the wrong reviews
        # (and drop trailing reviews) as soon as one review has no rating.
        rev_div = rev.find_parent('div', {'class': 'lister-item-content'})
        title_tag = rev_div.find('a', {'class': 'title'}) if rev_div is not None else None
        rating_tag = (
            rev_div.find('span', {'class': 'rating-other-user-rating'})
            if rev_div is not None else None
        )
        rev_title = title_tag.text.strip() if title_tag is not None else 'No title'
        rat_title = rating_tag.text.strip() if rating_tag is not None else 'Sin calificación'
        results.append((imdb_id, rev_title, rat_title))
    return results

if __name__ == '__main__':
    # IMDb ids of every remaining title, as a plain Python list.
    imdb_ids = film['imdb_id'].tolist()

    # One delayed scraping call per id, executed by joblib with n_jobs=-1;
    # tqdm wraps the id list to show progress while the calls are handed out.
    reviews_and_ratings = Parallel(n_jobs=-1)(
        delayed(extract_reviews_and_ratings)(imdb_id)
        for imdb_id in tqdm(imdb_ids)
    )

    # Each call returned its own list of tuples: flatten them into one list.
    reviews_and_ratings = [
        review
        for sublist in reviews_and_ratings
        for review in sublist
    ]

    com_rev = pd.DataFrame(
        reviews_and_ratings,
        columns=['id', 'review_title', 'rating_title'],
    )

100%|██████████| 2606/2606 [12:14<00:00,  3.55it/s]


In [32]:
# NOTE(review): these exports are commented out, yet later cells read
# '../data/HBO_actors_clean.csv' and '../data/HBO_titles_clean.csv'. On a fresh
# run those reads only work if the files already exist on disk; re-enable these
# lines to regenerate them.
#com_rev.to_csv('../data/HBO_comments.csv', index=False)
#film.to_csv('../data/HBO_titles_clean.csv', index=False)
#actor.to_csv('../data/HBO_actors_clean.csv', index=False)

In [10]:
# Build the actors table, starting from the cleaned HBO credits file.
# NOTE(review): a later cell reads '../data/sql/Net_actors_sql.csv', which no
# visible cell writes; presumably this section was also run once against
# 'Net_actors_clean.csv' — confirm.
actor = pd.read_csv('../data/HBO_actors_clean.csv', encoding='utf-8', encoding_errors='ignore') 

In [11]:
# Drop the three named columns; they are not kept in the actors table.
actor = actor.drop(columns=['id', 'character', 'role'])

In [12]:
# Remove rows that are exact duplicates across the remaining columns.
actor = actor.drop_duplicates()

In [13]:
# Export the actors table for the SQL load, without the DataFrame index.
actor.to_csv('../data/sql/HBO_actors_sql.csv', index=False)

In [15]:
# Build the cast table, starting from the cleaned Netflix credits file.
cast = pd.read_csv('../data/Net_actors_clean.csv', encoding='utf-8', encoding_errors='ignore')

In [17]:
# The `name` column is not kept in the cast table.
cast = cast.drop(columns=['name'])

In [19]:
# The original `id` column becomes `film_id`, freeing `id` for a new row key.
cast = cast.rename(columns={'id': 'film_id'})


In [21]:
# Sequential 1-based identifier, one per cast row.
cast = cast.assign(id=range(1, len(cast) + 1))

In [23]:
# Put the columns in their export order, with the new `id` first.
cast_columns = ['id', 'person_id', 'film_id', 'character', 'role']
cast = cast.reindex(cast_columns, axis='columns')


In [25]:
# Export the cast table for the SQL load, without the DataFrame index.
cast.to_csv('../data/sql/Net_cast_sql.csv', index=False)

In [26]:
# Build the reviews table, starting from the scraped Netflix comments file.
com = pd.read_csv('../data/Net_comments.csv', encoding='utf-8', encoding_errors='ignore')

In [28]:
# The scraped `id` column holds the IMDb id, so it is renamed to `imdb_id`.
com = com.rename(columns={'id': 'imdb_id'})

In [30]:
# Sequential 1-based identifier, one per review row.
com = com.assign(id=range(1, len(com) + 1))

In [32]:
# Put the columns in their export order, with the new `id` first.
review_columns = ['id', 'imdb_id', 'review_title', 'rating_title']
com = com.reindex(review_columns, axis='columns')

In [34]:
# Export the reviews table for the SQL load, without the DataFrame index.
com.to_csv('../data/sql/Net_reviews_sql.csv', index=False)

In [63]:
# Build the films table: reload the cleaned Netflix titles, expose the key as
# `film_id`, and export it for the SQL load.
film = (
    pd.read_csv('../data/Net_titles_clean.csv', encoding='utf-8', encoding_errors='ignore')
    .rename(columns={'id': 'film_id'})
)
film.to_csv('../data/sql/Net_titles_sql.csv', index=False)


In [69]:
# Round trip on the exported reviews file: the `id` column is renamed to
# `review_id` and the file is overwritten in place.
rev = (
    pd.read_csv('../data/sql/Net_reviews_sql.csv', encoding='utf-8', encoding_errors='ignore')
    .rename(columns={'id': 'review_id'})
)
rev.to_csv('../data/sql/Net_reviews_sql.csv', index=False)

                Visualización de datos

In [22]:
# Reload the cleaned Netflix titles for the plots below (this rebinds `film`).
film = pd.read_csv('../data/Net_titles_clean.csv', encoding='utf-8', encoding_errors='ignore')

In [23]:
def filtrar_gen(df, col, thresh):
    """Keep only the rows whose value in ``col`` is frequent enough.

    Used to leave minority genres and sub-genres out of the chart.

    Args:
        df: DataFrame to filter.
        col: Name of the column whose values are counted.
        thresh: A value is kept only if it occurs strictly more than
            ``thresh`` times in ``df[col]``.

    Returns:
        The rows of ``df`` whose ``col`` value passes the threshold, with
        their original index labels.
    """
    occurrences = df[col].value_counts()
    frequent_values = {value for value, total in occurrences.items() if total > thresh}
    return df.loc[df[col].isin(frequent_values)]

In [24]:
# Build a new frame whose columns are each film's genre and sub-genres.
# The stored text (e.g. "['drama', 'crime']") is split into a list per film.
film['genres'] = (
    film['genres']
    .str.strip('[]')
    .str.replace("'", '')
    .str.split(', ')
)
# One column per list position: genre_0, genre_1, ...
film_gen = film['genres'].apply(pd.Series).add_prefix('genre_')
df_sunburst = film_gen[['genre_0', 'genre_1', 'genre_2']].rename(
    columns={'genre_0': 'Level 1', 'genre_1': 'Level 2', 'genre_2': 'Level 3'}
)


In [25]:
# Drop the rows with a null sub-genre, since a null there means genre and
# sub-genre coincide.
# NOTE(review): requiring 'Level 3' also drops films with exactly two genres,
# although the chart below only uses the first two levels — confirm intended.
df_sunburst = df_sunburst.dropna(subset=['Level 3', 'Level 2'])



In [26]:
# Threshold of 50 occurrences: sub-genres, then genres, that are not
# representative enough are left out because they clutter the chart.
df_sunburst = filtrar_gen(
    filtrar_gen(df_sunburst, 'Level 2', 50),
    'Level 1',
    50,
)

In [27]:
# Sunburst of genre (inner ring) and sub-genre (outer ring) for Netflix titles;
# shown inline and also saved as a standalone HTML file.
fig = px.sunburst(df_sunburst, path=['Level 1', 'Level 2'], width=600, height=600)
fig.show()
fig.write_html('../images/NET_gen_sunburst.html')


In [28]:
# Same genre/sub-genre sunburst as above, for the HBO catalogue (threshold 40).
film = pd.read_csv('../data/HBO_titles_clean.csv', encoding='utf-8', encoding_errors='ignore')

# Split the stored text (e.g. "['drama', 'crime']") into a list per film.
film['genres'] = (
    film['genres']
    .str.strip('[]')
    .str.replace("'", '')
    .str.split(', ')
)
# One column per list position: genre_0, genre_1, ...
film_gen = film['genres'].apply(pd.Series).add_prefix('genre_')

df_sunburst = film_gen[['genre_0', 'genre_1', 'genre_2']].rename(
    columns={'genre_0': 'Level 1', 'genre_1': 'Level 2', 'genre_2': 'Level 3'}
)

# Keep only the rows where both sub-genre levels are present.
df_sunburst = df_sunburst.dropna(subset=['Level 3', 'Level 2'])

# Leave out sub-genres, then genres, with 40 occurrences or fewer.
df_sunburst = filtrar_gen(
    filtrar_gen(df_sunburst, 'Level 2', 40),
    'Level 1',
    40,
)

fig = px.sunburst(df_sunburst, path=['Level 1', 'Level 2'], width=600, height=600)

fig.show()
fig.write_html('../images/HBO_gen_sunburst.html')

In [70]:
# Reload the four Netflix tables exported for SQL, to inspect them below.
actor, film, cast, rev = (
    pd.read_csv(csv_path, encoding='utf-8', encoding_errors='ignore')
    for csv_path in (
        '../data/sql/Net_actors_sql.csv',
        '../data/sql/Net_titles_sql.csv',
        '../data/sql/Net_cast_sql.csv',
        '../data/sql/Net_reviews_sql.csv',
    )
)


In [71]:
# Preview the first rows of the actors table.
actor.head()

Unnamed: 0,person_id,name
0,3748,Robert De Niro
1,14658,Jodie Foster
2,7064,Albert Brooks
3,3739,Harvey Keitel
4,48933,Cybill Shepherd


In [72]:
# Preview the first rows of the films table.
film.head()

Unnamed: 0,film_id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],-1.0,tt0075314,8.2,808582.0,40.965,8.179
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],-1.0,tt0068473,7.7,107673.0,10.01,7.3
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],-1.0,tt0071853,8.2,534486.0,15.461,7.811
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,unknown,150,"['war', 'action']","['GB', 'US']",-1.0,tt0061578,7.7,72662.0,20.398,7.6
4,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,8.8,73424.0,17.617,8.306


In [76]:
# Load the cleaned HBO titles to compare them with the Netflix films table.
h = pd.read_csv('../data/HBO_titles_clean.csv')

In [82]:
# Look up one title in the Netflix films table; a title is not a unique key,
# as the output shows two different films sharing this one.
film.query('title == "Taxi Driver"')

Unnamed: 0,film_id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],-1.0,tt0075314,8.2,808582.0,40.965,8.179
1255,tm248010,Taxi Driver,MOVIE,A small-town mechanic turned chauffeur for the...,2015,unknown,100,"['comedy', 'crime', 'drama']",['NG'],-1.0,tt5112438,6.0,69.0,1.191,5.3


In [81]:
# Same lookup in the HBO titles, to compare with the Netflix films table.
h.query('title == "Taxi Driver"')

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
133,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['drama', 'crime']",['US'],-1.0,tt0075314,8.2,848334.0,43.893,8.166


In [85]:
# Show only the id and title columns of the HBO titles.
h[['id', 'title']]

Unnamed: 0,id,title
0,tm77588,Casablanca
1,tm155702,The Wizard of Oz
2,tm83648,Citizen Kane
3,tm3175,Meet Me in St. Louis
4,ts225761,Tom and Jerry
...,...,...
2601,tm1310730,Marlon Wayans: God Loves Me
2602,ts171230,Poor Devil
2603,tm1306271,The Weeknd: Live at SoFi Stadium
2604,tm1305288,"Marcella Arguello: Bitch, Grow Up!"


In [73]:
# Preview the first rows of the cast table.
cast.head()

Unnamed: 0,id,person_id,film_id,character,role
0,1,3748,tm84618,Travis Bickle,ACTOR
1,2,14658,tm84618,Iris Steensma,ACTOR
2,3,7064,tm84618,Tom,ACTOR
3,4,3739,tm84618,Matthew 'Sport' Higgins,ACTOR
4,5,48933,tm84618,Betsy,ACTOR


In [75]:
# Display the reviews table (the notebook shows a truncated head/tail view).
rev

Unnamed: 0,review_id,imdb_id,review_title,rating_title
0,1,tt0075314,A classy character study of a disturbed indivi...,7/10
1,2,tt0075314,A wonderfully engaging and convincing slide in...,10/10
2,3,tt0075314,Ladies and gentlemen: Mr. Robert De Niro!,10/10
3,4,tt0075314,Still has the power to shock...,10/10
4,5,tt0075314,"Disturbing, powerful, relevant, important",9/10
...,...,...,...,...
88369,88370,tt14216488,What is it for real?,1/10
88370,88371,tt14216488,Cringe.,2/10
88371,88372,tt14216488,WTF,1/10
88372,88373,tt14216488,Bad story,1/10
