# Cargar archivos fuente

In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Plot defaults shared by every figure in the notebook.
sns.set_theme(style='whitegrid', palette='viridis')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Output folder for generated files; created up front so later cells can
# save into it without checking for its existence.
ARTIFACTS_DIR = Path('artifacts')
ARTIFACTS_DIR.mkdir(exist_ok=True)

print('Entorno preparado. Directorio artifacts listo.')

Entorno preparado. Directorio artifacts listo.


In [None]:
# Locations of the two TMDB CSV exports, relative to the working directory.
MOVIES_PATH, CREDITS_PATH = (
    Path('data/tmdb_5000_movies.csv'),
    Path('data/tmdb_5000_credits.csv'),
)

# Keep the frames exactly as read; later cells derive copies from them.
movies_raw, credits_raw = map(pd.read_csv, (MOVIES_PATH, CREDITS_PATH))

print(
    f'Movies shape: {movies_raw.shape}',
    f'Credits shape: {credits_raw.shape}',
    sep='\n',
)


In [None]:
# Preview the first rows of the raw movies table.
movies_raw.head()


In [None]:
# Column dtypes and non-null counts for the raw movies table.
movies_raw.info()


In [None]:
# Summary statistics restricted to the numeric columns.
movies_raw.describe(include=[np.number])


Conclusiones rápidas

Dataset con ~5000 películas y columnas financieras, temporales y categóricas (JSON).

budget y revenue presentan ceros/nulos que debemos depurar antes de calcular el ROI.

release_date requiere conversión a fechas para análisis temporales.

# Limpieza y preparación

In [None]:
# Work on a copy so the raw frame stays untouched.
movies = movies_raw.copy()

# Missing values per column, largest first; only columns with gaps are shown.
missing_mask = movies.isna()
nulls = missing_mask.sum().sort_values(ascending=False)
nulls.loc[nulls.gt(0)]


In [None]:
# Unparseable dates become NaT instead of raising.
release_dates = pd.to_datetime(movies['release_date'], errors='coerce')
movies['release_date'] = release_dates
movies['release_year'] = release_dates.dt.year
# Floor each year to the first year of its decade (e.g. 1997 -> 1990).
movies['release_decade'] = movies['release_year'].floordiv(10).mul(10)

movies.loc[:, ['release_date', 'release_year', 'release_decade']].head()


In [None]:
# Keep only rows where both budget and revenue are strictly positive.
has_budget = movies['budget'].gt(0)
has_revenue = movies['revenue'].gt(0)
valid_movies = movies.loc[has_budget & has_revenue].copy()

# ROI here is the revenue-to-budget ratio.
valid_movies['roi'] = valid_movies['revenue'].div(valid_movies['budget'])

print(f'Películas originales: {len(movies)}')
print(f'Películas con datos válidos para ROI: {len(valid_movies)}')
valid_movies.loc[:, ['title', 'budget', 'revenue', 'roi']].head()


Notas de limpieza

Se descartaron registros sin datos financieros fiables.

Se generaron columnas temporales (release_year, release_decade).

Se calculó el ROI (revenue / budget), listo para los análisis posteriores.

# Análisis exploratorio (EDA)

## Distribución del ROI

In [None]:
# log10 is undefined for ROI <= 0, so keep only non-null, strictly positive values.
roi = valid_movies['roi'].dropna().loc[lambda values: values > 0]
log_roi = np.log10(roi)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))
hist_ax, box_ax = axes

# --- Histogram on the log10 scale ---
sns.histplot(log_roi, bins=60, ax=hist_ax, color='#4c72b0', edgecolor='white')
hist_ax.set(title='Distribución del ROI (log10)', xlabel='log10(roi)', ylabel='Count')

# Reference lines: log10 of the arithmetic mean and log10 of the median.
mean_log = np.log10(roi.mean())
median_log = np.log10(roi.median())
hist_ax.axvline(mean_log, color='red', linestyle='--', label='Media (log)')
hist_ax.axvline(median_log, color='green', linestyle='--', label='Mediana (log)')
hist_ax.legend()

# --- Box plot on the log10 scale ---
sns.boxplot(x=log_roi, ax=box_ax, color='#55a868')
box_ax.set(title='Boxplot ROI (log10)', xlabel='log10(roi)')

plt.tight_layout()
plt.show()


In [None]:
# Columns shown in both rankings.
roi_columns = ['title', 'budget', 'revenue', 'roi']

print('TOP 10 ROI positivos:')
display(valid_movies.nlargest(10, 'roi')[roi_columns])

# ROI is revenue / budget over rows where both are > 0, so it is never
# negative; this table lists the lowest ratios and is labelled accordingly.
print('TOP 10 ROI más bajos:')
display(valid_movies.nsmallest(10, 'roi')[roi_columns])


In [None]:
def parse_json_list(value):
    """Return the ``name`` of every object in a JSON-encoded list.

    Args:
        value: JSON text expected to decode to a list of objects that carry
            a ``name`` key. Non-string input (e.g. ``None`` or NaN) is
            tolerated.

    Returns:
        list: The names in their original order. An empty list when
        ``value`` cannot be parsed or does not decode to a list. Entries
        that are not objects, or that lack a ``name`` key, are skipped.
    """
    try:
        parsed = json.loads(value)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (None, NaN, ...).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return []

    if not isinstance(parsed, list):
        return []

    # Skip malformed entries rather than letting a KeyError abort a whole
    # DataFrame.apply() over the column.
    return [
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    ]


# One list of genre names per movie, decoded from the JSON text column.
valid_movies['genres_list'] = valid_movies['genres'].apply(parse_json_list)

# Long format: one row per (movie, genre). Movies with an empty genre list
# explode to NaN and are removed.
exploded = valid_movies.explode('genres_list')
exploded = exploded[exploded['genres_list'].notna()]
movies_by_genre = exploded.rename(columns={'genres_list': 'genre'})

# Mean / median ROI and number of movies per genre, best mean first.
genre_roi = (
    movies_by_genre.groupby('genre')['roi']
    .agg(roi_mean='mean', roi_median='median', movies='count')
    .sort_values('roi_mean', ascending=False)
)

genre_roi.head(10)


In [None]:
# genre_roi is already sorted by mean ROI, so head(15) is the top 15.
top_genres = genre_roi.head(15).reset_index().rename(columns={'index': 'genre'})

fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(top_genres['genre'], top_genres['roi_mean'], color='seagreen', alpha=0.6)

# Reference line: unweighted mean of the per-genre means across every genre
# in genre_roi (not only the 15 plotted).
ax.axvline(
    genre_roi['roi_mean'].mean(),
    color='red',
    linestyle='--',
    linewidth=2,
    label='Promedio global',
)

ax.set_xlabel("ROI medio")
ax.set_ylabel("Género")
ax.set_title("ROI medio por género (Top 15)")
ax.legend()

fig.tight_layout()
plt.show()


Hallazgos clave

La distribución del ROI es fuertemente sesgada: pocas películas logran multiplicar su inversión.

Géneros como Horror/Thriller destacan por ROI elevado gracias a presupuestos controlados.

Estos resultados alimentan el análisis de rentabilidad para la API (por género/país).

## Evolución de la duración de películas (últimos 50 años)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 50-year window ending at the most recent release year in the data.
recent_year = movies['release_year'].dropna().max()
cutoff_year = recent_year - 49

# NOTE(review): dropna() only removes missing runtimes; rows with
# runtime == 0, if any exist, still enter the averages - confirm.
in_window = movies['release_year'].between(cutoff_year, recent_year)
recent_movies = movies.loc[in_window].dropna(subset=['runtime']).copy()

# Floor each year to the first year of its decade. The first and last
# decades may be only partially covered by the window.
recent_movies['decade'] = recent_movies['release_year'].floordiv(10).mul(10)

runtime_by_decade = (
    recent_movies
    .groupby('decade')['runtime']
    .agg(movies='count', runtime_mean='mean', runtime_median='median')
    .sort_index()
)

display(runtime_by_decade)

# Flatten the index once so 'decade' is a regular column for seaborn.
decade_stats = runtime_by_decade.reset_index()

fig, ax = plt.subplots(figsize=(10, 5))
shared_kwargs = dict(data=decade_stats, x='decade', ax=ax)

# Mean and median runtime drawn on the same axes for comparison.
sns.lineplot(y='runtime_mean', marker='o', label='Promedio', **shared_kwargs)
sns.lineplot(y='runtime_median', marker='s', label='Mediana', **shared_kwargs)

ax.set(
    title='Evolución de la duración media/mediana por década',
    xlabel='Década',
    ylabel='Runtime (min)',
)
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()
plt.show()

## Actores que aparecen juntos más frecuentemente.

In [None]:
import json
from itertools import combinations
from pathlib import Path

import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt


def extract_cast_names(cast_json, max_cast=10):
    """Return the names of the first ``max_cast`` entries of a JSON cast list.

    Args:
        cast_json: JSON text expected to decode to a list of objects that
            carry a ``name`` key. Non-string input (e.g. ``None`` or NaN)
            is tolerated.
        max_cast: Number of leading cast entries to consider. Defaults to 10.

    Returns:
        list: Names taken from the first ``max_cast`` entries, in order.
        An empty list when ``cast_json`` cannot be parsed or does not decode
        to a list. Entries that are not objects, or that lack a ``name``
        key, are skipped.
    """
    try:
        cast = json.loads(cast_json)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (None, NaN, ...).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return []

    if not isinstance(cast, list):
        # Slicing a non-list payload would raise (TypeError or KeyError,
        # depending on the Python version); treat it as "no cast".
        return []

    # Skip malformed members rather than letting a KeyError abort a whole
    # DataFrame.apply() over the column.
    return [
        member['name']
        for member in cast[:max_cast]
        if isinstance(member, dict) and 'name' in member
    ]


# Minimum number of shared films for an actor pair to become a graph edge.
MIN_COLLABORATIONS = 3

# --- Prepare cast lists ---
# credits_raw was already read in the data-loading cell, so it is reused here
# instead of re-declaring the path constants and reading the CSV a second time.
credits = credits_raw[['movie_id', 'title', 'cast']].dropna(subset=['cast']).copy()
credits['cast_names'] = credits['cast'].apply(extract_cast_names)

# --- Build actor pairs ---
# One (a, b) tuple per unordered pair of distinct names in each film's cast.
# Names are de-duplicated and sorted so a pair always appears in the same order.
# Casts with fewer than two names yield no combinations.
pairs = [
    pair
    for cast_list in credits['cast_names']
    for pair in combinations(sorted(set(cast_list)), 2)
]

# Number of films shared by each pair, most frequent first.
pairs_df = (
    pd.DataFrame(pairs, columns=['actor_a', 'actor_b'])
    .value_counts()
    .reset_index(name='colaboraciones')
    .sort_values('colaboraciones', ascending=False)
)

display(pairs_df.head(50))

# --- Keep only pairs with at least MIN_COLLABORATIONS co-appearances ---
top_edges = pairs_df[pairs_df['colaboraciones'] >= MIN_COLLABORATIONS]

# --- Build graph ---
# Each (actor_a, actor_b, colaboraciones) row becomes an edge whose 'weight'
# attribute is the number of shared films.
G = nx.Graph()
G.add_weighted_edges_from(
    top_edges[['actor_a', 'actor_b', 'colaboraciones']].itertuples(index=False, name=None)
)

print(f'Nodes: {G.number_of_nodes()}, edges: {G.number_of_edges()}')

# =====================================================
# CO-ACTOR NETWORK PLOT
# =====================================================

# Fixed-seed generator used only for the label jitter further down.
rng = np.random.default_rng(123)

# Force-directed layout. k scales with 1/sqrt(number of nodes), and the edge
# 'weight' attribute is passed to the layout.
node_count = G.number_of_nodes()
pos = nx.spring_layout(
    G,
    seed=42,
    k=4.0 / np.sqrt(node_count),
    iterations=1200,
    weight='weight',
)

fig, ax = plt.subplots(figsize=(32, 30))

# Marker size grows linearly with the node's degree.
degree_dict = dict(G.degree())
node_sizes = [120 + 35 * degree_dict[node] for node in G]

nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='#4c72b0', alpha=0.8, ax=ax)
nx.draw_networkx_edges(G, pos, width=0.6, alpha=0.12, edge_color='#7f7f7f', ax=ax)

# Only nodes with at least this many neighbours get a text label.
label_min_degree = 3
important_nodes = [node for node in pos if degree_dict.get(node, 0) >= label_min_degree]

# Offset each label by a small random amount from its node
# (presumably to reduce overlapping labels).
label_pos = {
    node: pos[node] + 0.08 * rng.uniform(-1.0, 1.0, size=2)
    for node in important_nodes
}

nx.draw_networkx_labels(
    G,
    label_pos,
    labels=dict(zip(important_nodes, important_nodes)),
    font_size=9,
    font_weight='bold',
    font_family='sans-serif',
    bbox=dict(facecolor='white', edgecolor='none', alpha=0.75, pad=0.35),
    ax=ax,
)

ax.set_title('Red de co-actuacion', fontsize=20, pad=20)
ax.axis('off')
fig.tight_layout()
fig.savefig(ARTIFACTS_DIR / 'coacting_network.png', dpi=250, bbox_inches='tight')
plt.show()

