In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
demo=pd.read_csv("df_final_demo.txt")
exp=pd.read_csv("df_final_experiment_clients.txt")
web1=pd.read_csv("df_final_web_data_pt_1.txt")
web2=pd.read_csv("df_final_web_data_pt_2.txt")
web= pd.concat([web1,web2], axis=0)

### **CLEANING AND ORGANIZATION DATA SET** ###

### Demography file

In [3]:
#Rename columns

demo.rename(columns={"gendr": "genero", "client_id": "id_cliente", "clnt_tenure_yr": "años_permanencia","clnt_tenure_mnth":"meses_permanencia", "clnt_age": "edad", "num_accts": "num_cuentas", "bal": "saldo", "calls_6_mnth":"llamadas_semestre" , "logons_6_mnth": "logins_semestre" }, inplace=True)

In [4]:
demo["genero"].value_counts()

genero
U    24122
M    23724
F    22746
X        3
Name: count, dtype: int64

In [5]:
# Keep Male Female and Unknown

demo= demo[demo["genero"] != "X"]

In [6]:
#Drop nulls

demo.isnull().sum()

id_cliente            0
años_permanencia     14
meses_permanencia    14
edad                 15
genero               14
num_cuentas          14
saldo                14
llamadas_semestre    14
logins_semestre      14
dtype: int64

In [7]:
demo = demo.dropna()

In [8]:
#Keep only adults

demo[demo['edad'] < 18].shape[0]


374

In [9]:
demo = demo[demo["edad"] >= 18]

In [10]:
#Convert columns to integers and revise

cols = ['años_permanencia', 'meses_permanencia', 'edad', 'num_cuentas', 'saldo', 'llamadas_semestre', 'logins_semestre']
demo[cols] = demo[cols].astype(int)


In [11]:
demo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70217 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id_cliente         70217 non-null  int64 
 1   años_permanencia   70217 non-null  int64 
 2   meses_permanencia  70217 non-null  int64 
 3   edad               70217 non-null  int64 
 4   genero             70217 non-null  object
 5   num_cuentas        70217 non-null  int64 
 6   saldo              70217 non-null  int64 
 7   llamadas_semestre  70217 non-null  int64 
 8   logins_semestre    70217 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 5.4+ MB


### Experiment file

In [12]:
#Check nulls 

exp['Variation'].isna().sum()/exp.shape[0]

np.float64(0.28479372317976465)

In [13]:
#Rename columns 

exp= exp.rename(columns={'client_id': 'id_cliente' ,'Variation': 'modalidad'})

In [14]:
exp = exp.dropna(subset=['modalidad'])

### Web file

In [15]:
web

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
412259,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
412260,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
412261,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
412262,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [16]:
# Rename columns

web.rename(columns={
    'client_id': 'id_cliente',
    'visitor_id': 'id_visitante',
    'visit_id': 'id_visita',
    'process_step': 'paso_proceso',
    'date_time': 'fecha'
}, inplace=True)

In [17]:
# Analizing web file: Nulls, value counts

def analizar_columnas(df):
   
    print("Resumen de Nulos")

    nulos = web.isnull().sum()
    porcentaje = (web.isnull().mean() * 100).round(2)
    resumen_nulos = pd.DataFrame({
        'Nulos': nulos,
        '% Nulos': porcentaje
    })
    print(resumen_nulos)
    print("\n=== Value Counts por Columna Categórica ===")
    for col in web.select_dtypes(include=['object', 'category']).columns:
        print(f"\n--- {col} ---")
        print(df[col].value_counts(dropna=False))

In [18]:
analizar_columnas(web)

Resumen de Nulos
              Nulos  % Nulos
id_cliente        0      0.0
id_visitante      0      0.0
id_visita         0      0.0
paso_proceso      0      0.0
fecha             0      0.0

=== Value Counts por Columna Categórica ===

--- id_visitante ---
id_visitante
722943003_3441581446     104
857376424_12378092455     90
272742682_16716805486     81
78799102_14329268381      80
780189529_68744403580     80
                        ... 
126345313_81637097552      1
344922862_88034013382      1
920532103_65264751870      1
451773986_45029239446      1
947159805_81558194550      1
Name: count, Length: 130236, dtype: int64

--- id_visita ---
id_visita
875138661_34710212496_881092    104
518285126_8572733997_709761      88
602953935_48759866176_238903     80
961878360_85895454962_607105     80
308874104_3998249411_902973      72
                               ... 
102624258_43364229051_125535      1
780988840_22026485488_430889      1
684165134_19767897066_267107      1
748112830_53678

In [19]:
#Adapt date column

web.describe(include='object')

Unnamed: 0,id_visitante,id_visita,paso_proceso,fecha
count,755405,755405,755405,755405
unique,130236,158095,5,629363
top,722943003_3441581446,875138661_34710212496_881092,start,2017-05-02 10:07:41
freq,104,104,243945,24


In [20]:
min_fecha = web["fecha"].min()
max_fecha = web["fecha"].max()

CLEANING DONE 

### **MERGE FILES** ###

In [21]:
clientes= demo.merge(exp, left_on='id_cliente', right_on='id_cliente', how='inner')


### Who are the primary clients using this online process?

In [22]:
clientes['edad'].mean()

np.float64(47.095968109791166)

In [23]:
clientes[["edad","meses_permanencia","saldo"]].describe()

Unnamed: 0,edad,meses_permanencia,saldo
count,50423.0,50423.0,50423.0
mean,47.095968,150.478135,149658.8
std,15.501791,81.936975,302203.5
min,18.0,33.0,23789.0
25%,33.0,82.0,39918.5
50%,48.0,136.0,65857.0
75%,59.0,192.0,140093.0
max,96.0,669.0,16320040.0


### Average Logins per Interface

Very similar; the new interface has not produced significant changes compared to the traditional one. In fact, they have slightly decreased.

In [24]:
clientes.groupby("modalidad")["logins_semestre"].agg(["mean", "count"])


Unnamed: 0_level_0,mean,count
modalidad,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,6.167716,23498
Test,6.103064,26925


### Average Calls per Semester

Very similar; with the new interface, calls are slightly reduced.


In [25]:
clientes.groupby("modalidad")["llamadas_semestre"].agg(["mean", "count"])

Unnamed: 0_level_0,mean,count
modalidad,Unnamed: 1_level_1,Unnamed: 2_level_1
Control,3.130607,23498
Test,3.063175,26925


### Usage by group ages

In [26]:
# Making of 5 group ages

bins = [0,25,35,50,70,100]
labels = ["Jóvenes","Adultos Jóvenes " , "Adultos","Adultos Seniors", "Seniors"]
clientes["grupo_edad"] = pd.cut(clientes["edad"], bins=bins, labels=labels, right=False)

segmento_uso = clientes.groupby("grupo_edad")[["logins_semestre", "llamadas_semestre"]].mean().round(2)

segmento_uso


  segmento_uso = clientes.groupby("grupo_edad")[["logins_semestre", "llamadas_semestre"]].mean().round(2)


Unnamed: 0_level_0,logins_semestre,llamadas_semestre
grupo_edad,Unnamed: 1_level_1,Unnamed: 2_level_1
Jóvenes,6.08,3.06
Adultos Jóvenes,6.17,3.15
Adultos,5.87,2.85
Adultos Seniors,6.21,3.16
Seniors,6.57,3.48


In [27]:
pd.pivot_table(
    clientes,
    values="logins_semestre",
    index=["grupo_edad", "genero"],
    columns="num_cuentas",
    aggfunc="mean",
    margins=True
).round(2)

  pd.pivot_table(


Unnamed: 0_level_0,num_cuentas,1,2,3,4,5,6,7,All
grupo_edad,genero,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Jóvenes,F,,5.48,6.57,6.62,9.0,,,5.74
Jóvenes,M,,5.98,7.27,7.29,,,,6.33
Jóvenes,U,,5.76,7.15,7.87,7.5,,,6.09
Adultos Jóvenes,F,,5.4,6.62,7.58,7.0,8.33,,5.69
Adultos Jóvenes,M,,6.12,7.4,7.96,8.5,,9.0,6.5
Adultos Jóvenes,U,4.0,5.93,7.2,7.82,8.2,,,6.22
Adultos,F,,5.36,6.5,6.96,7.83,,4.0,5.6
Adultos,M,,5.79,7.16,7.72,7.79,6.5,8.0,6.19
Adultos,U,,5.55,6.81,7.4,8.56,,,5.8
Adultos Seniors,F,,5.78,6.72,7.25,7.65,8.33,6.0,5.98


Usage behavior is fairly consistent across generations.

## Are the primary clients younger or older, new or long-standing?

**Our Main Clients Have an Average Age of 47 and an Average Tenure of 12.5 Years**

These are not new or young clients, but experienced clients with a long-standing relationship with the company.

If they have many years of tenure, they are likely satisfied with the services offered, either out of convenience or because they are already familiar with the application.

#

#

#

## PERFORMANCE METRICS

### Completion Rate

The proportion of users who reach the final "confirm" step.


In [28]:
# We create the main dataframe 

df = clientes.merge(web, on="id_cliente", how="inner")

In [29]:
#Overall completion rate

final_step = "confirm"
clientes_con_confirm = (
    web.groupby("id_cliente")["paso_proceso"]
    .apply(lambda pasos: final_step in list(pasos))
)
tasa_finalizacion_clientes = clientes_con_confirm.mean() * 100
print(f"Tasa de finalización por cliente: {tasa_finalizacion_clientes:}%")

Tasa de finalización por cliente: 67.53247834083741%


In [30]:
#Finalization rate by modality

tasa_finalizacion = (
    df[df["paso_proceso"] == "confirm"]
    .groupby("modalidad")["id_cliente"]
    .nunique()
    / df.groupby("modalidad")["id_cliente"].nunique()
) * 100

print(tasa_finalizacion)

modalidad
Control    65.537493
Test       69.259053
Name: id_cliente, dtype: float64


The new interface outperforms the overall per-client completion rate by 2%, based on Vanguard's entire customer database, including those not participating in the experiment.


#

#

#

## Time Spent on Each Step

### The average duration users spend on each step.


In [31]:
df["fecha"] = pd.to_datetime(df["fecha"], format="%Y-%m-%d %H:%M:%S", errors="coerce")


In [32]:
df = df.sort_values(["id_cliente", "fecha"])

In [33]:
df = df.sort_values(["id_cliente", "id_visita", "fecha"])
df["tiempo_seg"] = df.groupby(["id_cliente", "id_visita"])["fecha"].diff().dt.total_seconds()

In [34]:
tiempo_modalidad = (
    df.groupby(["modalidad", "paso_proceso"])["tiempo_seg"]
      .median()
      .reset_index()
)

tiempo_modalidad["duracion_media_min"] = tiempo_modalidad["tiempo_seg"] / 60


In [35]:
tiempo_medio_paso_df = (
    df.groupby("paso_proceso")["tiempo_seg"]
    .median()
    .reset_index(name="duracion_media_seg")
)
print(tiempo_medio_paso_df)

  paso_proceso  duracion_media_seg
0      confirm                64.0
1        start                48.0
2       step_1                13.0
3       step_2                23.0
4       step_3                66.0


In [36]:
tiempo_medio_paso_df["duracion_media_min"]=(tiempo_medio_paso_df["duracion_media_seg"]/60).round(2)
tiempo_medio_paso_df

Unnamed: 0,paso_proceso,duracion_media_seg,duracion_media_min
0,confirm,64.0,1.07
1,start,48.0,0.8
2,step_1,13.0,0.22
3,step_2,23.0,0.38
4,step_3,66.0,1.1


In [37]:
tiempo_modalidad

Unnamed: 0,modalidad,paso_proceso,tiempo_seg,duracion_media_min
0,Control,confirm,73.0,1.216667
1,Control,start,37.0,0.616667
2,Control,step_1,18.0,0.3
3,Control,step_2,20.0,0.333333
4,Control,step_3,66.0,1.1
5,Test,confirm,56.0,0.933333
6,Test,start,55.0,0.916667
7,Test,step_1,11.0,0.183333
8,Test,step_2,25.0,0.416667
9,Test,step_3,66.0,1.1


### Total Process Duration

The total process duration is practically the same across modalities.

The Test group (new interface) shows speed improvements except in the "start" and "step 2" stages.


#

#

#

## Error Rate

### If users move back to a previous step, it may indicate confusion or an error. Consider an error any backward movement from a later step to an earlier one.


In [39]:
df = df.sort_values(["id_cliente", "id_visita", "fecha"])

orden_pasos = {step: i for i, step in enumerate(df["paso_proceso"].unique())}
df["num_paso"] = df["paso_proceso"].map(orden_pasos)

df["paso_anterior"] = df.groupby(["id_cliente", "id_visita"])["num_paso"].shift(1)
df["error"] = df["num_paso"] < df["paso_anterior"]

tasa_error = df["error"].mean() * 100
print(f"Tasa de errores general: {tasa_error:.2f}%")


Tasa de errores general: 8.05%


The error rate in the full dataset (web) is 8.05%.


In [77]:
tasa_error_modalidad = (
    df.groupby("modalidad")["error"]
      .mean()
      .reset_index()
)

tasa_error_modalidad["tasa_error_%"] = tasa_error_modalidad["error"] * 100
print(tasa_error_modalidad)


  modalidad     error  tasa_error_%
0   Control  0.067045      6.704511
1      Test  0.091405      9.140527


### Error Rate Analysis

**Overall error rate: 8.05%**

* **Control group:** 6.70%
* **Test group (new interface):** 9.14%

The Test group shows a higher error rate than the Control group. This indicates that users in the new interface are more likely to go back to a previous step, suggesting moments of confusion or unclear guidance in the redesigned flow. Further qualitative or step-by-step analysis is recommended to identify which specific steps are driving the increase.


## Completion Rate

Since the new design (Test group) had a higher completion rate compared to the old design (Control group), you are asked to **confirm whether this difference is statistically significant**.
Given the data and the key performance indicators (KPIs) you have analyzed and discussed, an interesting hypothesis to test is related to the completion rate between the **Test** and **Control** groups.


In [41]:
#   H₀ (null): The completion rate is equal between both designs
#   ptest=pcontrolp_{test} = p_{control}ptest​=pcontrol
#   H₁ (alternative): The completion rate of the new design is higher
#   ptest>pcontrolp_{test} > p_{control}ptest​>pcontrol

In [45]:
num_clientes_test = df.loc[(df["modalidad"] == "Test"), "id_cliente"].nunique()
num_clientes_control = df.loc[(df["modalidad"] == "Control"), "id_cliente"].nunique()
print("Num clients test:", num_clientes_test)
print("Num clients control:",num_clientes_control )

Num clients test: 26925
Num clients control: 23498


In [81]:
clientes_confirm_test = df.loc[
    (df["modalidad"] == "Test") & (df["paso_proceso"] == "confirm"),
    "id_cliente"
].nunique()

clientes_confirm_control = df.loc[
    (df["modalidad"] == "Control") & (df["paso_proceso"] == "confirm"),
    "id_cliente"
].nunique()

print("Clientes confirm Test:", clientes_confirm_test)
print("Clientes confirm Control:", clientes_confirm_control)

Clientes confirm Test: 18648
Clientes confirm Control: 15400


In [82]:
clientes_confirm_test = df.loc[
    (df["modalidad"] == "Test") & (df["paso_proceso"] == "confirm")]

clientes_confirm_test.head()

Unnamed: 0,id_cliente,años_permanencia,meses_permanencia,edad,genero,num_cuentas,saldo,llamadas_semestre,logins_semestre,modalidad,grupo_edad,id_visitante,id_visita,paso_proceso,fecha,tiempo_seg,num_paso,paso_anterior,error
255052,555,3,46,29,U,2,25454,2,6,Test,Adultos Jóvenes,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,20.0,4,3.0,False
9725,647,12,151,57,M,2,30525,0,4,Test,Adultos Seniors,66758770_53988066587,40369564_40101682850_311847,confirm,2017-04-12 15:47:45,163.0,4,3.0,False
105124,1336,48,576,42,M,4,130537,6,9,Test,Adultos,920624746_32603333901,583743392_96265099036_939815,confirm,2017-05-08 06:08:43,109.0,4,3.0,False
105123,1336,48,576,42,M,4,130537,6,9,Test,Adultos,920624746_32603333901,614001770_19101025926_112779,confirm,2017-05-08 08:21:38,,4,,False
105122,1336,48,576,42,M,4,130537,6,9,Test,Adultos,920624746_32603333901,614001770_19101025926_112779,confirm,2017-05-08 08:23:00,82.0,4,4.0,False


In [84]:
tasa_finalizacion = (
    df[df["paso_proceso"] == "confirm"]
    .groupby("modalidad")["id_cliente"]
    .nunique()
    / df.groupby("modalidad")["id_cliente"].nunique()
) * 100

print(tasa_finalizacion)


modalidad
Control    65.537493
Test       69.259053
Name: id_cliente, dtype: float64


In [None]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

n_test = 1777539
n_control = 143217
p_test = 0.6925
p_control = 0.65537

exitos_test = round(p_test * n_test)
exitos_control = round(p_control * n_control)

exitos = np.array([exitos_test, exitos_control])
n_obs = np.array([n_test, n_control])

stat, pval = proportions_ztest(exitos, n_obs, alternative='larger')
print(f"Estadístico Z: {stat:.3f}")
print(f"p-valor: {pval:.6f}")

Estadístico Z: 29.221
p-valor: 0.000000


#

#

#

### Completion Rate

Since the new design (Test group) had a higher completion rate compared to the old design (Control group), we can test if this difference is statistically significant.

 H₀ (null): the completion rate is equal between both designs

 p_test = p_control

 H₁ (alternative): the completion rate of the new design is higher

 p_test > p_control

**Analysis:**
Completion rates: Control = 65.54%, Test = 69.26%
Z-statistic = 29.221, p-value = 0.0

**Conclusion:** The difference is statistically significant; the new design increases the completion rate.


### Answering questions of the project

### - The introduction of a new user interface design involves associated costs: design, development, testing, potential staff training, and possible short-term disruptions or adjustments for users. To justify these costs, Vanguard has determined that any new design must generate a minimum increase in the completion rate to be considered cost-effective.

**Threshold:** Vanguard has set this minimum increase in the completion rate at 5%. This is the rate at which the projected benefits, in terms of higher user engagement and potential revenue, are estimated to outweigh the costs of the new design.


In [90]:
tasa_finalizacion = (
    df[df["paso_proceso"] == "confirm"]
    .groupby("modalidad")["id_cliente"]
    .nunique()
    / df.groupby("modalidad")["id_cliente"].nunique()
) * 100

print(tasa_finalizacion)

modalidad
Control    65.537493
Test       69.259053
Name: id_cliente, dtype: float64


The increase from 65% to 69% represents a 3% rise in completion rate.
Although we reject H₀ statistically, the increase does not reach the 5% threshold set by Vanguard to justify the new design.

### Is the average tenure of customers (months of retention) using the new process the same as that of those using the previous process?


In [49]:
!pip install scipy
import scipy
from scipy.stats import ttest_ind



In [51]:
# H0 = The average age is the same
# H1 = The average age is different


In [None]:
edad_test = df.loc[df["modalidad"] == "Test", "edad"].dropna()
edad_control = df.loc[df["modalidad"] == "Control", "edad"].dropna()

# Usamos Welch por tamaños distintos
from scipy.stats import ttest_ind

stat, pval = ttest_ind(edad_test, edad_control, equal_var=False)

print(f"Estadístico t: {stat:.3f}")
print(f"p-valor: {pval:.4f}")


Estadístico t: 7.444
p-valor: 0.0000


We reject H0; the p-value is much less than 0.05, indicating a significant difference between the ages of the two experiment groups.

Is the average age the same between modalities?

In [54]:
# H0 = The average age is the same
# H1 = The average age is different


In [None]:
permanencia_test = df.loc[df["modalidad"]=="Test", "meses_permanencia"].dropna()
permanencia_control = df.loc[df["modalidad"]=="Control", "meses_permanencia"].dropna()

#Utilizamos Welch porque nuestros grupos tienen tamaños distintos.
stat, pval =ttest_ind(permanencia_test, permanencia_control, equal_var = False)

print(f"Estadístico t: {stat:.3f}")
print(f"p-valor: {pval:.4f}")

Estadístico t: 0.533
p-valor: 0.5941


In [97]:
# Tabla de frecuencias por modalidad y género
tabla_genero = pd.crosstab(df["modalidad"], df["genero"])

# Test chi-cuadrado de independencia
from scipy.stats import chi2_contingency

chi2, p, dof, expected = chi2_contingency(tabla_genero)

print(f"Estadístico Chi-cuadrado: {chi2:.3f}")
print(f"p-valor: {p:.4f}")

Estadístico Chi-cuadrado: 47.456
p-valor: 0.0000


t = 0.533, p = 0.5941 > 0.05 → the null hypothesis of equal means is not rejected.

**Interpretation:** Tenure (months of retention) is statistically similar for users in the Test and Control processes.

**Implication:** The observed differences in completion rates or errors are not due to customer tenure, but to the process design.

We used a chi-square test because we are comparing two categorical variables: modality and gender.


#### Experiment Evaluation



**Design Effectiveness**
Was the experiment well structured?

Yes, the A/B design is well structured, allowing us to analyze the metrics for the group using the new interface and the one using the old interface.


Were the customers divided randomly and evenly between the old and new designs?

-- The ages of customers differ significantly between the two groups, tenure is similar, and the sample sizes (77,539 vs. 143,217) are reasonably comparable.


Was there any bias?

**Age bias:** The Test group has a different age profile, which could influence their behavior during the process.

**Gender bias:** The experiment was not completely random, and performance differences could be partially influenced by age or gender.


* Duration Assessment
Was the timeframe of the experiment (from 3/15/2017 to 6/20/2017) adequate to gather meaningful data and insights?

In [98]:
## H0 = El tiempo del experimento es adecuado para tomar conclusiones
## H1= El tiempo del experimento no es adecuado

In [99]:
from scipy.stats import ttest_ind

web_times = web["tiempo_seg"].dropna()
df_times = df["tiempo_seg"].dropna()


print(f"Tiempo medio web: {web_times.mean():.2f} s")
print(f"Tiempo medio df: {df_times.mean():.2f} s")


t_stat, p_val = ttest_ind(web_times, df_times, equal_var=False)
print(f"Estadístico t: {t_stat:.3f}")
print(f"p-valor: {p_val:.4f}")


Tiempo medio web: 83.52 s
Tiempo medio df: 82.55 s
Estadístico t: 1.936
p-valor: 0.0529


Rechazamos la hipótesis nula, tal y como veiamos anteriormente, la base de datos utilizada para el experimento en estos 3 meses arroja unos resultados parecidos al global, con un p-value> 0,05.

Esto nos indica que el tiempo del experimento seleccionado es adecuado.

* Additional Data Needs
What other data, if available, could enhance the analysis?


* Customers’ prior usage history (frequency, previous sessions, cumulative time).

* Broader demographic data (age, gender, education level, location, income level) to control for potential biases.

* Type of management performed by the customer (e.g., registration, cancellation, modification, inquiry), as it could directly influence process duration and complexity.

* User satisfaction or feedback after using the process (surveys or ratings).

* Conversion or performance data (whether they completed a purchase or action).

* Technical information about the device or browser used, which could affect loading and interaction times.


In [55]:
df.to_csv("datos_limpios.csv", index=False)
