In [2]:
import pandas as pd
import seaborn as sns
import pylab as plt
import json
import time
import requests
import warnings
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
from joblib import Parallel, delayed
from tqdm_joblib import tqdm_joblib
warnings.filterwarnings('ignore')


Extracción, transformación y limpieza de datos del dataset de películas, series y documentales de Netflix

In [3]:
# Load the raw Netflix tables: one row per title in `film`, one row per credit in `actor`.
# encoding_errors='ignore' makes the decoder skip bytes that are not valid UTF-8
# instead of raising, so malformed characters are silently dropped from the text.
film = pd.read_csv(
    '../data/Net_titles.csv',
    encoding_errors='ignore',
    encoding='utf-8',
)
actor = pd.read_csv(
    '../data/Net_credits.csv',
    encoding_errors='ignore',
    encoding='utf-8',
)

In [4]:
def fill_age_null(row):
    """Return the age certification for ``row``, inferring 'R' for mature genres.

    Used to fill null values of the ``age_certification`` column: when the
    certification is missing and the title's genres mention 'thriller',
    'horror' or 'black comedy', the certification is assumed to be 'R' (17+).

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'age_certification' and 'genres'. 'genres' may be
        a string (checked by substring) or a list/tuple/set (checked by
        membership); any other value, such as NaN or None, is treated as
        "no genre information".

    Returns
    -------
    The existing certification when it is present, 'R' when it is missing and
    a mature genre is found, otherwise the original missing value unchanged.
    """
    certification = row['age_certification']
    if not pd.isna(certification):
        return certification

    genres = row['genres']
    # A missing genres cell arrives as float NaN (or None); applying `in` to it
    # would raise TypeError, so only search real text/collections.
    if not isinstance(genres, (str, list, tuple, set, frozenset)):
        return certification

    mature_genres = ('thriller', 'horror', 'black comedy')
    if any(genre in genres for genre in mature_genres):
        return 'R'
    return certification

# Row-wise pass (axis=1): each title row is handed to fill_age_null and the
# returned value replaces that row's age_certification.
film['age_certification'] = film.apply(fill_age_null, axis=1)

In [5]:
# Cleaning rules:
# - Drop rows with nulls in the IMDb/TMDB columns: without the imdb_id (and the
#   scores) those titles cannot be analysed.
# - Fill null `seasons` with the numeric sentinel -1 (kept numeric so the column
#   does not become a mix of floats and strings).
# - Fill null `description` and any still-missing `age_certification` with 'unknown'.
# - Fill null `character` in the `actor` table with 'unknown'.
#
# The frames are rebound rather than mutated in place, so re-running this cell
# yields the same result.
film = (
    film
    .dropna(subset=['imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'])
    .assign(
        age_certification=lambda df: df['age_certification'].fillna('unknown'),
        seasons=lambda df: df['seasons'].fillna(-1),
        description=lambda df: df['description'].fillna('unknown'),
    )
)
actor = actor.assign(character=actor['character'].fillna('unknown'))



In [8]:
from joblib import Parallel, delayed
from tqdm import tqdm

def extract_reviews_and_ratings(imdb_id, timeout=10):
    """Scrape review titles and user ratings for one title from its IMDb reviews page.

    Parameters
    ----------
    imdb_id : str
        IMDb identifier interpolated into the reviews URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    list of tuple
        One ``(imdb_id, review_title, rating)`` tuple per review block found.
        ``review_title`` is 'No title' when the review has no title link and
        ``rating`` is 'Sin calificación' when the review carries no rating.
        An empty list is returned when the request fails or the response
        status is not 200.

    Notes
    -----
    Assumes each review body sits inside a ``div.lister-item-content`` container
    that also holds the title link and the rating span (this is what the
    selectors below imply) -- TODO confirm against the live page markup.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/reviews?ref_=tt_ov_rt'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single network failure must not abort the whole parallel run.
        print(f'Error al obtener comentarios de la película con IMDB ID {imdb_id}: {exc}')
        return []

    if response.status_code != 200:
        print(f'Error al obtener comentarios de la película con IMDB ID {imdb_id}')
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for review in soup.find_all('div', {'class': 'text show-more__control'}):
        container = review.find_parent('div', {'class': 'lister-item-content'})
        if container is None:
            title_tag = rating_tag = None
        else:
            title_tag = container.find('a', {'class': 'title'})
            # Look the rating up inside this review's own container: pairing two
            # page-wide lists by position misattributes ratings as soon as one
            # review has no rating.
            rating_tag = container.find('span', {'class': 'rating-other-user-rating'})

        rev_title = title_tag.text.strip() if title_tag is not None else 'No title'
        rat_title = rating_tag.text.strip() if rating_tag is not None else 'Sin calificación'
        results.append((imdb_id, rev_title, rat_title))
    return results

if __name__ == '__main__':
    # Turn the imdb_id column of the cleaned title table into a plain list.
    imdb_ids = film['imdb_id'].tolist()
    # Scrape every id in parallel (n_jobs=-1) and flatten the per-title lists
    # into a single list of (id, review title, rating) tuples. tqdm wraps the
    # task generator, so the bar moves as joblib consumes tasks from it.
    reviews_and_ratings = [
        row
        for title_rows in Parallel(n_jobs=-1)(
            delayed(extract_reviews_and_ratings)(title_id) for title_id in tqdm(imdb_ids)
        )
        for row in title_rows
    ]
    com_rev = pd.DataFrame(
        reviews_and_ratings,
        columns=['id', 'review_title', 'rating_title'],
    )



100%|██████████| 5131/5131 [14:34<00:00,  5.87it/s]


In [20]:
# Persisting the Netflix results is currently disabled (lines commented out).
# NOTE: `film`, `actor` and `com_rev` are rebound by the HBO cells further down,
# so uncomment and run these lines before moving on if the Netflix tables and
# scraped reviews need to be kept.
#com_rev.to_csv('../data/Net_comments.csv', index=False)
#film.to_csv('../data/Net_titles_clean.csv', index=False)
#actor.to_csv('../data/Net_actors_clean.csv', index=False)

Realizamos el mismo proceso con las tablas de HBO

In [24]:
# Load the raw HBO tables; this rebinds `film` and `actor`, replacing whatever
# they held before. encoding_errors='ignore' makes the decoder skip bytes that
# are not valid UTF-8 instead of raising.
film = pd.read_csv(
    '../data/HBO_titles.csv',
    encoding_errors='ignore',
    encoding='utf-8',
)
actor = pd.read_csv(
    '../data/HBO_credits.csv',
    encoding_errors='ignore',
    encoding='utf-8',
)

In [26]:
def fill_age_null(row):
    """Return the age certification for ``row``, inferring 'R' for mature genres.

    When the certification is missing and the title's genres mention
    'thriller', 'horror' or 'black comedy', the certification is assumed to be
    'R'.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'age_certification' and 'genres'. 'genres' may be
        a string (checked by substring) or a list/tuple/set (checked by
        membership); any other value, such as NaN or None, is treated as
        "no genre information".

    Returns
    -------
    The existing certification when it is present, 'R' when it is missing and
    a mature genre is found, otherwise the original missing value unchanged.
    """
    certification = row['age_certification']
    if not pd.isna(certification):
        return certification

    genres = row['genres']
    # A missing genres cell arrives as float NaN (or None); applying `in` to it
    # would raise TypeError, so only search real text/collections.
    if not isinstance(genres, (str, list, tuple, set, frozenset)):
        return certification

    mature_genres = ('thriller', 'horror', 'black comedy')
    if any(genre in genres for genre in mature_genres):
        return 'R'
    return certification

# Row-wise pass (axis=1): each title row is handed to fill_age_null and the
# returned value replaces that row's age_certification.
film['age_certification'] = film.apply(fill_age_null, axis=1)

In [28]:
# Cleaning rules:
# - Drop rows with nulls in the IMDb/TMDB id and score columns.
# - Fill null `seasons` with the numeric sentinel -1 (kept numeric so the column
#   does not become a mix of floats and strings).
# - Fill null `description` and any still-missing `age_certification` with 'unknown'.
# - Fill null `character` in the `actor` table with 'unknown'.
#
# The frames are rebound rather than mutated in place, so re-running this cell
# yields the same result.
film = (
    film
    .dropna(subset=['imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'])
    .assign(
        age_certification=lambda df: df['age_certification'].fillna('unknown'),
        seasons=lambda df: df['seasons'].fillna(-1),
        description=lambda df: df['description'].fillna('unknown'),
    )
)
actor = actor.assign(character=actor['character'].fillna('unknown'))

In [31]:
from joblib import Parallel, delayed
from tqdm import tqdm

def extract_reviews_and_ratings(imdb_id, timeout=10):
    """Scrape review titles and user ratings for one title from its IMDb reviews page.

    Parameters
    ----------
    imdb_id : str
        IMDb identifier interpolated into the reviews URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    list of tuple
        One ``(imdb_id, review_title, rating)`` tuple per review block found.
        ``review_title`` is 'No title' when the review has no title link and
        ``rating`` is 'Sin calificación' when the review carries no rating.
        An empty list is returned when the request fails or the response
        status is not 200.

    Notes
    -----
    Assumes each review body sits inside a ``div.lister-item-content`` container
    that also holds the title link and the rating span (this is what the
    selectors below imply) -- TODO confirm against the live page markup.
    """
    url = f'https://www.imdb.com/title/{imdb_id}/reviews?ref_=tt_ov_rt'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single network failure must not abort the whole parallel run.
        print(f'Error al obtener comentarios de la película con IMDB ID {imdb_id}: {exc}')
        return []

    if response.status_code != 200:
        print(f'Error al obtener comentarios de la película con IMDB ID {imdb_id}')
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []
    for review in soup.find_all('div', {'class': 'text show-more__control'}):
        container = review.find_parent('div', {'class': 'lister-item-content'})
        if container is None:
            title_tag = rating_tag = None
        else:
            title_tag = container.find('a', {'class': 'title'})
            # Look the rating up inside this review's own container: pairing two
            # page-wide lists by position misattributes ratings as soon as one
            # review has no rating.
            rating_tag = container.find('span', {'class': 'rating-other-user-rating'})

        rev_title = title_tag.text.strip() if title_tag is not None else 'No title'
        rat_title = rating_tag.text.strip() if rating_tag is not None else 'Sin calificación'
        results.append((imdb_id, rev_title, rat_title))
    return results

if __name__ == '__main__':
    # Turn the imdb_id column of the cleaned title table into a plain list.
    imdb_ids = film['imdb_id'].tolist()
    # Scrape every id in parallel (n_jobs=-1) and flatten the per-title lists
    # into a single list of (id, review title, rating) tuples. tqdm wraps the
    # task generator, so the bar moves as joblib consumes tasks from it.
    reviews_and_ratings = [
        row
        for title_rows in Parallel(n_jobs=-1)(
            delayed(extract_reviews_and_ratings)(title_id) for title_id in tqdm(imdb_ids)
        )
        for row in title_rows
    ]
    com_rev = pd.DataFrame(
        reviews_and_ratings,
        columns=['id', 'review_title', 'rating_title'],
    )

100%|██████████| 2606/2606 [12:14<00:00,  3.55it/s]


In [32]:
# Persisting the HBO results is currently disabled (lines commented out);
# uncomment to write the scraped reviews and the cleaned title/credit tables
# to ../data.
#com_rev.to_csv('../data/HBO_comments.csv', index=False)
#film.to_csv('../data/HBO_titles_clean.csv', index=False)
#actor.to_csv('../data/HBO_actors_clean.csv', index=False)