# Netflix - Content Recommendation Algorithm Performance

```SQL
-- Schema for the recommendation-performance exercise.
-- Keys, NOT NULL and foreign keys are declared so the seed data is validated
-- on load and the cardinality of each join is documented in the DDL itself.

-- Content catalogue: one row per title.
CREATE TABLE dim_content (
    content_id INT NOT NULL,
    title VARCHAR NOT NULL,
    genre VARCHAR NOT NULL,
    release_date DATE NOT NULL,
    CONSTRAINT dim_content_pk PRIMARY KEY (content_id)
);

-- Viewing sessions: one row per session; a user may watch the same title
-- more than once, so (user_id, content_id) is deliberately NOT unique.
CREATE TABLE fct_watch_history (
    watch_id INT NOT NULL,
    user_id INT NOT NULL,
    content_id INT NOT NULL,
    watch_time_minutes INT NOT NULL,  -- session length in whole minutes
    watch_date DATE NOT NULL,
    CONSTRAINT fct_watch_history_pk PRIMARY KEY (watch_id),
    CONSTRAINT fct_watch_history_content_id_fk
        FOREIGN KEY (content_id) REFERENCES dim_content (content_id)
);

-- Recommendation events: the same title can be recommended to the same user
-- several times, so (user_id, content_id) is deliberately NOT unique here
-- either. De-duplicate or aggregate before joining to watch history.
CREATE TABLE fct_recommendations (
    recommendation_id INT NOT NULL,
    user_id INT NOT NULL,
    content_id INT NOT NULL,
    recommended_date DATE NOT NULL,
    CONSTRAINT fct_recommendations_pk PRIMARY KEY (recommendation_id),
    CONSTRAINT fct_recommendations_content_id_fk
        FOREIGN KEY (content_id) REFERENCES dim_content (content_id)
);

-- Seed the content catalogue with 15 titles (content_id 1-15).
-- Apostrophes inside titles are escaped by doubling the single quote ('').
INSERT INTO dim_content (content_id, title, genre, release_date)
VALUES
    (1, 'The Great Adventure', 'Action', '2023-11-15'),
    (2, 'Love & Laughs', 'Comedy', '2022-07-22'),
    (3, 'Mystery of the Lost City', 'Thriller', '2024-01-10'),
    (4, 'Nature''s Wonders', 'Documentary', '2021-05-05'),
    (5, 'Space Odyssey', 'Sci-Fi', '2023-03-30'),
    (6, 'Dawn''s Early Light', 'Drama', '2024-02-14'),
    (7, 'The Culinary Journey', 'Reality', '2022-10-18'),
    (8, 'Haunted Manor', 'Horror', '2023-08-09'),
    (9, 'Robot Uprising', 'Action', '2024-01-25'),
    (10, 'Stand-Up Nights', 'Comedy', '2022-12-12'),
    (11, 'Deep Sea Secrets', 'Documentary', '2023-06-20'),
    (12, 'The Last Frontier', 'Sci-Fi', '2024-03-05'),
    (13, 'Urban Legends', 'Horror', '2022-09-30'),
    (14, 'Comedy Central', 'Comedy', '2023-02-28'),
    (15, 'Historical Battles', 'Documentary', '2021-11-11');

-- Seed 25 viewing sessions for users 1-10; every watch_date falls inside
-- Q1 2024 (2024-01-05 through 2024-03-25).
-- Sessions 1-20 are for (user_id, content_id) pairs that also appear in the
-- fct_recommendations seed; sessions 21-25 are for pairs that were never
-- recommended. Session 5 (user 4, content 5) is dated one day BEFORE its
-- recommendation, which matters for date-attributed queries.
INSERT INTO fct_watch_history (watch_id, user_id, content_id, watch_time_minutes, watch_date)
VALUES
    (1, 1, 1, 120, '2024-01-05'),
    (2, 2, 2, 45, '2024-01-15'),
    (3, 3, 3, 90, '2024-02-10'),
    (4, 1, 4, 60, '2024-02-20'),
    (5, 4, 5, 150, '2024-03-01'),
    (6, 5, 6, 30, '2024-03-10'),
    (7, 2, 7, 80, '2024-01-25'),
    (8, 3, 8, 50, '2024-02-05'),
    (9, 4, 9, 100, '2024-03-15'),
    (10, 5, 10, 40, '2024-03-20'),
    (11, 6, 11, 70, '2024-01-18'),
    (12, 7, 12, 110, '2024-02-22'),
    (13, 8, 13, 55, '2024-03-08'),
    (14, 9, 14, 35, '2024-01-30'),
    (15, 10, 15, 65, '2024-02-14'),
    (16, 1, 3, 95, '2024-02-28'),
    (17, 2, 5, 145, '2024-03-25'),
    (18, 3, 7, 75, '2024-01-12'),
    (19, 4, 9, 105, '2024-03-18'),
    (20, 5, 11, 85, '2024-02-08'),
    (21, 6, 12, 115, '2024-03-12'),
    (22, 7, 14, 38, '2024-01-22'),
    (23, 8, 1, 125, '2024-02-16'),
    (24, 9, 4, 58, '2024-03-05'),
    (25, 10, 2, 42, '2024-01-28');

-- Seed 20 recommendation events.
-- The pair (user 4, content 9) is recommended twice (ids 9 and 19), so this
-- table holds 19 distinct (user_id, content_id) pairs: joining it directly to
-- fct_watch_history duplicates that pair's sessions unless it is first
-- de-duplicated or aggregated.
INSERT INTO fct_recommendations (recommendation_id, user_id, content_id, recommended_date)
VALUES
    (1, 1, 1, '2024-01-04'),
    (2, 2, 2, '2024-01-10'),
    (3, 3, 3, '2024-02-08'),
    (4, 1, 4, '2024-02-18'),
    (5, 4, 5, '2024-03-02'),
    (6, 5, 6, '2024-03-09'),
    (7, 2, 7, '2024-01-20'),
    (8, 3, 8, '2024-02-03'),
    (9, 4, 9, '2024-03-14'),
    (10, 5, 10, '2024-03-19'),
    (11, 6, 11, '2024-01-17'),
    (12, 7, 12, '2024-02-20'),
    (13, 8, 13, '2024-03-07'),
    (14, 9, 14, '2024-01-29'),
    (15, 10, 15, '2024-02-13'),
    (16, 1, 3, '2024-02-27'),
    (17, 2, 5, '2024-03-24'),
    (18, 3, 7, '2024-01-11'),
    (19, 4, 9, '2024-03-17'),
    (20, 5, 11, '2024-02-07');

-- Sanity check: dump every table to confirm the seed rows loaded.
-- Columns are listed explicitly, in declaration order, instead of SELECT *.
SELECT
    content_id,
    title,
    genre,
    release_date
FROM dim_content;

SELECT
    watch_id,
    user_id,
    content_id,
    watch_time_minutes,
    watch_date
FROM fct_watch_history;

SELECT
    recommendation_id,
    user_id,
    content_id,
    recommended_date
FROM fct_recommendations;
```

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three CSV exports (content, recommendations, watch history) in one
# pass; the unpacking order matches the order of the paths below.
df_content, df_recommendation, df_watch = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/011/dim_content.csv',
        'Data/011/fct_recommendations.csv',
        'Data/011/fct_watch_history.csv',
    )
)

# Preview the first rows of the content catalogue.
df_content.head()

Unnamed: 0,content_id,title,genre,release_date
0,1,The Great Adventure,Action,2023-11-15
1,2,Love & Laughs,Comedy,2022-07-22
2,3,Mystery of the Lost City,Thriller,2024-01-10
3,4,Nature's Wonders,Documentary,2021-05-05
4,5,Space Odyssey,Sci-Fi,2023-03-30


In [3]:
# Preview the first five recommendation events (5 is head()'s default).
df_recommendation.head(n=5)

Unnamed: 0,recommendation_id,user_id,content_id,recommended_date
0,1,1,1,2024-01-04
1,2,2,2,2024-01-10
2,3,3,3,2024-02-08
3,4,1,4,2024-02-18
4,5,4,5,2024-03-02


In [4]:
# Preview the first five viewing sessions (5 is head()'s default).
df_watch.head(n=5)

Unnamed: 0,watch_id,user_id,content_id,watch_time_minutes,watch_date
0,1,1,1,120,2024-01-05
1,2,2,2,45,2024-01-15
2,3,3,3,90,2024-02-10
3,4,1,4,60,2024-02-20
4,5,4,5,150,2024-03-01


# Pregunta 1

### ¿Cuál es el tiempo total de visualización del contenido después de haber sido recomendado a los usuarios? Para atribuir correctamente el tiempo de visualización a la recomendación, es crítico incluir solo el tiempo de visualización posterior a la fecha en que se hizo la recomendación al usuario. Un contenido pudo ser recomendado a un usuario varias veces; si es así, queremos usar la primera fecha en la que el contenido le fue recomendado a ese usuario.

In [6]:
# Step 1: collapse repeated recommendations to the earliest date per
# (user, content) pair, so each pair contributes a single row to the join.
df_first_recom = (
    df_recommendation
    .groupby(['user_id', 'content_id'])['recommended_date']
    .min()
    .reset_index()
)

# Step 2: inner-join sessions to those first-recommendation dates; sessions
# for pairs that were never recommended drop out here.
df_attributed = df_watch.merge(df_first_recom, on=['user_id', 'content_id'], how='inner')

# Step 3: keep sessions watched on or after the first recommendation.
# NOTE(review): the load cell shown does not parse dates, so these columns are
# presumably strings; ISO 'YYYY-MM-DD' text still orders chronologically.
# NOTE(review): >= also counts same-day viewing; confirm whether the question's
# "posterior" means strictly after.
watched_on_or_after = df_attributed['watch_date'].ge(df_attributed['recommended_date'])
df_attributed = df_attributed.loc[watched_on_or_after]

# Step 4: total attributed minutes.
total_time = df_attributed['watch_time_minutes'].sum()

total_time

np.int64(1455)

```SQL
-- Total minutes watched on or after the first time each title was
-- recommended to the viewing user.
WITH first_recommendation AS (
    -- Earliest recommendation per (user, content); also guarantees one row
    -- per pair so the join below cannot multiply watch rows.
    SELECT
        r.user_id,
        r.content_id,
        MIN(r.recommended_date) AS first_recommended_date
    FROM fct_recommendations AS r
    GROUP BY
        r.user_id,
        r.content_id
)
SELECT
    SUM(wh.watch_time_minutes) AS total_attributed_watch_time
FROM fct_watch_history AS wh
INNER JOIN first_recommendation AS fr
    ON fr.user_id = wh.user_id
   AND fr.content_id = wh.content_id
WHERE fr.first_recommended_date <= wh.watch_date;
```

# Pregunta 2

### El equipo quiere saber el tiempo total de visualización para cada género en el primer trimestre de 2024, dividido según si el contenido fue recomendado o no al usuario.

### El tiempo de visualización debe clasificarse como 'Recommended' (Recomendado) al unir por usuario y contenido, independientemente de cuándo lo vieron en comparación con cuándo recibieron la recomendación.

In [10]:
# One row per (user, content) pair that was ever recommended, carrying a flag
# column that survives the left merge below.
recom_pairs = (
    df_recommendation[['user_id', 'content_id']]
    .drop_duplicates()
    .assign(is_recom=True)
)

# Attach genre to every session, then left-join the flag: sessions for pairs
# that were never recommended keep a missing is_recom.
df_merged = pd.merge(df_watch, df_content, on='content_id')
df_final = pd.merge(df_merged, recom_pairs, on=['user_id', 'content_id'], how='left')

# A present flag means the pair was recommended at some point, regardless of
# whether the viewing happened before or after the recommendation.
df_final['recommendation_status'] = np.where(
    df_final['is_recom'].notna(), 'Recommended', 'Non-Recommended'
)

# Restrict to Q1 2024 (both bounds inclusive) and total minutes per
# genre / status combination.
in_q1 = (df_final['watch_date'] >= '2024-01-01') & (df_final['watch_date'] <= '2024-03-31')
df_q1 = df_final.loc[in_q1]
resultado = (
    df_q1
    .groupby(['genre', 'recommendation_status'], as_index=False)['watch_time_minutes']
    .sum()
)

# Largest totals first.
resultado_final = resultado.sort_values('watch_time_minutes', ascending=False)

resultado_final

Unnamed: 0,genre,recommendation_status,watch_time_minutes
10,Sci-Fi,Recommended,405
1,Action,Recommended,325
5,Documentary,Recommended,280
11,Thriller,Recommended,185
8,Reality,Recommended,155
0,Action,Non-Recommended,125
3,Comedy,Recommended,120
9,Sci-Fi,Non-Recommended,115
7,Horror,Recommended,105
2,Comedy,Non-Recommended,80


```SQL
-- Q1 2024 watch time per genre, split by whether the (user, content) pair
-- was ever recommended (timing of the recommendation is ignored).
WITH recommended_pairs AS (
    -- One row per pair so repeated recommendations cannot multiply sessions.
    SELECT DISTINCT
        user_id,
        content_id
    FROM fct_recommendations
)
SELECT
    dc.genre,
    CASE
        WHEN rp.user_id IS NULL THEN 'Non-Recommended'
        ELSE 'Recommended'
    END AS recommendation_status,
    SUM(wh.watch_time_minutes) AS total_watch_time
FROM fct_watch_history AS wh
INNER JOIN dim_content AS dc
    ON dc.content_id = wh.content_id
LEFT JOIN recommended_pairs AS rp
    ON rp.user_id = wh.user_id
   AND rp.content_id = wh.content_id
-- The filter touches only the left-hand table, so the LEFT JOIN is preserved.
WHERE wh.watch_date BETWEEN '2024-01-01' AND '2024-03-31'
GROUP BY
    dc.genre,
    recommendation_status
ORDER BY
    dc.genre,
    total_watch_time DESC;
```

# Pregunta 3

### El equipo tiene como objetivo categorizar las sesiones de visualización de los usuarios en 'Short' (Corta), 'Medium' (Media) o 'Long' (Larga), basándose en el tiempo de visualización del contenido recomendado para identificar patrones de compromiso (engagement).

### 'Short' para menos de 60 minutos, 'Medium' para entre 60 y 120 minutos, y 'Long' para más de 120 minutos. ¿Puedes clasificar y contar las sesiones en el primer trimestre (Q1) de 2024 de acuerdo con esto?

In [15]:
# Sessions in Q1 2024 whose (user, content) pair was recommended.
# df_final comes from the Pregunta 2 cell; is_recom is True for recommended
# pairs and missing otherwise, so the equality test also drops the missing rows.
df_recom_q1 = df_final[
    (df_final['is_recom'] == True) &
    (df_final['watch_date'].between('2024-01-01','2024-03-31'))
].copy()

# Right-closed bins: [0, 59] -> Short, (59, 120] -> Medium, (120, inf) -> Long.
# The 59 edge implements "< 60" only because watch_time_minutes holds whole
# minutes (declared INT in the schema).
bins = [0,59,120, float('inf')]
labels = ['Short','Medium','Long']

# include_lowest=True keeps a 0-minute session in 'Short'; without it the
# first interval is open on the left and such a row would become NaN and
# silently vanish from the counts.
df_recom_q1['session_category'] = pd.cut(
    df_recom_q1['watch_time_minutes'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

conteo_sesiones = df_recom_q1['session_category'].value_counts().reset_index()
conteo_sesiones.columns = ['session_category', 'total_sessions']

# Fixed: the original displayed `conteo_sessiones`, an undefined name that
# raises NameError.
conteo_sesiones

Unnamed: 0,session_category,count
0,Medium,12
1,Short,6
2,Long,2


```SQL
-- Count Q1 2024 sessions of recommended content by length bucket:
-- Short (< 60 min), Medium (60-120 min inclusive), Long (> 120 min).
WITH recommended_sessions AS (
    SELECT
        CASE
            WHEN w.watch_time_minutes < 60 THEN 'Short'
            WHEN w.watch_time_minutes <= 120 THEN 'Medium'
            ELSE 'Long'
        END AS session_category
    FROM fct_watch_history AS w
    WHERE w.watch_date BETWEEN '2024-01-01' AND '2024-03-31'
      -- EXISTS instead of a join: a plain join to fct_recommendations counts
      -- a session once per matching recommendation, so a pair recommended
      -- more than once (user 4 / content 9 in the seed data) is double-counted.
      AND EXISTS (
            SELECT 1
            FROM fct_recommendations AS r
            WHERE r.user_id = w.user_id
              AND r.content_id = w.content_id
      )
)
SELECT
    session_category,
    COUNT(*) AS total_session
FROM recommended_sessions
GROUP BY session_category
ORDER BY total_session DESC;
```