# OpenAI - ChatGPT User Engagement and Query Patterns

```SQL
-- User dimension: exactly one row per user_id (enforced by the primary key).
CREATE TABLE dim_users (
    user_id integer PRIMARY KEY,
    first_name text,
    last_name text
);

-- Query fact table: exactly one row per query_id.
-- user_id must point at an existing dim_users row, and query_timestamp is
-- required because every time-window filter on this table depends on it.
CREATE TABLE fct_queries (
    query_id integer PRIMARY KEY,
    user_id integer NOT NULL REFERENCES dim_users (user_id),
    query_text text,
    query_domain text,
    query_timestamp timestamp NOT NULL
);

-- Seed the user dimension with ten users (user_id 1 through 10).
INSERT INTO dim_users (user_id, first_name, last_name)
VALUES
    (1, 'Alice', 'Smith'),
    (2, 'Bob', 'Johnson'),
    (3, 'Charlie', 'Lee'),
    (4, 'Dana', 'Brown'),
    (5, 'Evan', 'Davis'),
    (6, 'Fiona', 'Miller'),
    (7, 'George', 'Wilson'),
    (8, 'Hannah', 'Moore'),
    (9, 'Ian', 'Taylor'),
    (10, 'Julia', 'Anderson');

-- Seed the query fact table with 18 queries dated July through September 2024:
-- 3 in July, 9 in August and 6 in September.
INSERT INTO fct_queries (query_id, user_id, query_text, query_domain, query_timestamp)
VALUES
    (1, 1, 'How does ChatGPT work?', 'technology', '2024-07-02 09:15:00'),
    (2, 2, 'What is the latest discovery in astrophysics?', 'science', '2024-07-15 12:00:00'),
    (3, 3, 'Tell me a joke', 'entertainment', '2024-07-20 16:30:00'),
    (4, 1, 'Explain neural networks', 'technology', '2024-08-01 10:05:00'),
    (5, 2, 'Quantum mechanics basics', 'science', '2024-08-03 11:00:00'),
    (6, 3, 'Art history overview', 'history', '2024-08-10 09:45:00'),
    (7, 4, 'Latest trends in technology', 'technology', '2024-08-12 14:30:00'),
    (8, 5, 'Biology breakthroughs', 'science', '2024-08-15 15:00:00'),
    (9, 1, 'ChatGPT use cases', 'technology', '2024-08-20 08:00:00'),
    (10, 4, 'History of computing', 'technology', '2024-08-22 13:15:00'),
    (11, 6, 'Science fiction books', 'science', '2024-08-25 18:20:00'),
    (12, 2, 'Defining artificial intelligence', 'technology', '2024-08-28 20:45:00'),
    (13, 7, 'Latest science news', 'science', '2024-09-03 10:00:00'),
    (14, 8, 'Technology breakthrough in AI', 'technology', '2024-09-08 12:30:00'),
    (15, 9, 'ChatGPT vs human creativity', 'technology', '2024-09-10 14:45:00'),
    (16, 10, 'Understanding climate change', 'environment', '2024-09-15 16:00:00'),
    (17, 7, 'Advancements in space travel', 'science', '2024-09-20 09:00:00'),
    (18, 5, 'ChatGPT features overview', 'technology', '2024-09-25 19:15:00');

-- Preview the seeded users. Columns are listed explicitly and rows are ordered
-- by key so the output stays stable if the schema or storage order changes.
SELECT
    user_id,
    first_name,
    last_name
FROM dim_users
ORDER BY user_id;

-- Preview the seeded queries, ordered by key for the same reason.
SELECT
    query_id,
    user_id,
    query_text,
    query_domain,
    query_timestamp
FROM fct_queries
ORDER BY query_id;
```

In [1]:
import pandas as pd
import numpy as np

In [11]:
# Load the user dimension from its CSV export.
df_user = pd.read_csv(
    'Data/027/dim_users.csv',
)
# Load the query fact table, parsing query_timestamp into datetimes so the
# date filters and .dt accessors used below work on it.
df_queries = pd.read_csv(
    'Data/027/fct_queries.csv',
    parse_dates=['query_timestamp'],
)

# Show the first rows of the user table.
df_user.head()

Unnamed: 0,user_id,first_name,last_name
0,1,Alice,Smith
1,2,Bob,Johnson
2,3,Charlie,Lee
3,4,Dana,Brown
4,5,Evan,Davis


In [12]:
# Show the first five rows of the query table (five is head()'s default).
df_queries.head(n=5)

Unnamed: 0,query_id,user_id,query_text,query_domain,query_timestamp
0,1,1,How does ChatGPT work?,technology,2024-07-02 09:15:00
1,2,2,What is the latest discovery in astrophysics?,science,2024-07-15 12:00:00
2,3,3,Tell me a joke,entertainment,2024-07-20 16:30:00
3,4,1,Explain neural networks,technology,2024-08-01 10:05:00
4,5,2,Quantum mechanics basics,science,2024-08-03 11:00:00


# Pregunta 1

### ¿Qué porcentaje de las consultas de los usuarios en julio de 2024 estuvieron relacionadas con los dominios de 'technology' (tecnología) o 'science' (ciencia)?

In [19]:
# Question 1: percentage of July 2024 queries whose domain is 'science' or
# 'technology'.
# Use the half-open window [2024-07-01, 2024-08-01). A closed range ending at
# '2024-07-31' stops at midnight on the 31st and would miss any query made
# later that day.
july_start = pd.Timestamp('2024-07-01')
august_start = pd.Timestamp('2024-08-01')
df_july = df_queries[
    (df_queries['query_timestamp'] >= july_start)
    & (df_queries['query_timestamp'] < august_start)
]

# The mean of a boolean mask is the fraction of True values.
# If there are no July rows the mean (and the printed result) is NaN.
respuesta1 = df_july['query_domain'].isin(['science', 'technology']).mean() * 100

print(f"Resultado: {respuesta1:.2f}%")

Resultado: 66.67%


```SQL
-- Percentage of July 2024 queries in the 'science' or 'technology' domains.
SELECT
    -- COUNT ignores the NULLs produced when the CASE does not match.
    -- NULLIF yields NULL instead of a division-by-zero error when the window is empty.
    COUNT(CASE WHEN query_domain IN ('science', 'technology') THEN 1 END) * 100.0
        / NULLIF(COUNT(*), 0) AS percentage_science_query
FROM fct_queries
-- Half-open window: BETWEEN ... AND '2024-07-31' would end at midnight on the
-- 31st and drop queries made later that day.
WHERE query_timestamp >= '2024-07-01'
    AND query_timestamp < '2024-08-01';
```

# Pregunta 2

### Calcule el número total de consultas por mes en el tercer trimestre (Q3) de 2024. ¿Qué mes tuvo el mayor número de consultas?

In [22]:
# Question 2: number of queries per month in Q3 2024, and the busiest month.
# Bound the window by full timestamps, [2024-07-01, 2024-10-01). Filtering on
# the month number alone would also count July-September of any other year.
q3_start = pd.Timestamp('2024-07-01')
q3_end = pd.Timestamp('2024-10-01')
df_q3 = df_queries[
    (df_queries['query_timestamp'] >= q3_start)
    & (df_queries['query_timestamp'] < q3_end)
]

# Count rows per calendar month; the index is the month number (7, 8, 9).
conteo_mensual = df_q3.groupby(df_q3['query_timestamp'].dt.month).size()

# idxmax returns the first month that reaches the maximum count.
mes_top = conteo_mensual.idxmax()
valor_top = conteo_mensual.max()

print(f"El mes con más actividad fue el {mes_top} con {valor_top} consultas.")

El mes con más actividad fue el 8 con 9 consultas.


```SQL
-- Total queries per month in Q3 2024, busiest month first.
SELECT
    EXTRACT(MONTH FROM query_timestamp) AS month,
    COUNT(*) AS total_queries
FROM fct_queries
-- Half-open window: BETWEEN ... AND '2024-09-30' would end at midnight on the
-- 30th and drop queries made later that day.
WHERE query_timestamp >= '2024-07-01'
    AND query_timestamp < '2024-10-01'
-- Group by the expression itself; grouping by a SELECT alias is not portable.
GROUP BY EXTRACT(MONTH FROM query_timestamp)
-- The month tie-breaker keeps row order deterministic when counts are equal.
ORDER BY total_queries DESC, month;
```

# Pregunta 3

### Identifique a los 5 usuarios con mayor cantidad de consultas en agosto de 2024 por su nombre y apellido. Queremos entrevistar a nuestros usuarios más activos y esta información se utilizará para contactarlos.

In [24]:
# Question 3: the five users with the most queries in August 2024.
# Join each query to its user's name (inner join on user_id).
df_power_users = df_queries.merge(df_user, on='user_id')

# Keep August 2024 only, via the half-open window [2024-08-01, 2024-09-01).
# Matching on month == 8 alone would also count August of other years.
resultado_agosto = (
    df_power_users[
        (df_power_users['query_timestamp'] >= pd.Timestamp('2024-08-01'))
        & (df_power_users['query_timestamp'] < pd.Timestamp('2024-09-01'))
    ]
    # Group by user_id as well, so two users who share a name are not merged.
    .groupby(['user_id', 'first_name', 'last_name'])
    .size()
    .reset_index(name='total_query_per_user')
    # Break ties on user_id so the top-5 cut-off is deterministic.
    .sort_values(['total_query_per_user', 'user_id'], ascending=[False, True])
    .head(5)
    # Return the same three columns as before.
    .loc[:, ['first_name', 'last_name', 'total_query_per_user']]
    .reset_index(drop=True)
)

resultado_agosto

Unnamed: 0,first_name,last_name,total_query_per_user
0,Alice,Smith,2
1,Bob,Johnson,2
2,Dana,Brown,2
3,Charlie,Lee,1
4,Evan,Davis,1


```SQL
-- Top 5 users by number of queries in August 2024, with the names needed to
-- contact them.
SELECT
    u.first_name,
    u.last_name,
    COUNT(q.query_id) AS total_query_per_user
FROM dim_users AS u
INNER JOIN fct_queries AS q
    ON u.user_id = q.user_id
-- Half-open window: BETWEEN ... AND '2024-08-31' would end at midnight on the
-- 31st and drop queries made later that day.
WHERE q.query_timestamp >= '2024-08-01'
    AND q.query_timestamp < '2024-09-01'
-- Group by the ID as well, so two users who share a name are counted separately.
GROUP BY u.user_id, u.first_name, u.last_name
-- The user_id tie-breaker makes the LIMIT cut-off deterministic when counts are equal.
ORDER BY total_query_per_user DESC, u.user_id
LIMIT 5;
```