## **Librerías**

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

## **Preparación de Datos**

In [2]:
# Read the two raw Olist CSV extracts used throughout the notebook.
df_mkt = pd.read_csv(
    "../data/raw/olist_marketing_qualified_leads_dataset.csv"
)
df_closed = pd.read_csv(
    "../data/raw/olist_closed_deals_dataset.csv"
)

In [3]:
# Column names, non-null counts and dtypes of the marketing-qualified-leads table.
df_mkt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   mql_id              8000 non-null   object
 1   first_contact_date  8000 non-null   object
 2   landing_page_id     8000 non-null   object
 3   origin              7940 non-null   object
dtypes: object(4)
memory usage: 250.1+ KB


In [4]:
# Number of distinct lead ids in the marketing-qualified-leads table.
df_mkt['mql_id'].nunique()

8000

In [5]:
def get_low_completion_columns(df, seller_id_col='seller_id', threshold=80):
    """
    List the columns whose completion percentage is below ``threshold``.

    Completion is measured against the number of unique seller ids, not the
    number of rows: ``non-null count / unique sellers * 100``. A column can
    therefore score above 100 when the frame has more non-null rows than
    distinct sellers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    seller_id_col : str
        Name of the column holding the seller id.
    threshold : int
        Minimum completion percentage a column must reach to be kept out of
        the result.

    Returns
    -------
    list
        Column labels, in frame order, whose completion percentage is
        strictly below ``threshold``.
    """
    # Denominator: how many distinct (non-null) seller ids the frame holds.
    n_unique_sellers = df[seller_id_col].nunique()

    # Non-null cells per column, expressed as a percentage of unique sellers.
    non_null_per_column = df.count()
    pct_complete = (non_null_per_column / n_unique_sellers * 100).round(2)

    # Keep only the labels that fall short of the requested threshold.
    return [column for column, pct in pct_complete.items() if pct < threshold]

In [6]:
# Left-join every marketing-qualified lead with its closed deal, if any.
# Leads without a closed deal get NaN in all deal columns.
df_mkt_closed = df_mkt.merge(df_closed, on='mql_id', how='left')

# Columns whose non-null count is below 60% of the number of unique sellers.
low_quality_columns = get_low_completion_columns(df_mkt_closed, threshold=60)

# Identifier columns that are not used as features, plus the sparse columns.
list_drop_columns = ['landing_page_id', 'seller_id', 'sdr_id', 'sr_id'] + low_quality_columns

# Keep df_mkt_closed as the full merged frame: later cells still read
# sr_id from it, so dropping the columns in place would break a fresh run.
df_mkt_trimmed = df_mkt_closed.drop(columns=list_drop_columns)

# Each lambda receives the frame being built, so the chain does not depend on
# an outer variable. `target` is listed before `won_date` so that it is
# computed from the raw won_date values, before they are parsed.
df_processed = df_mkt_trimmed.assign(
    first_contact_date=lambda df: pd.to_datetime(df['first_contact_date'], format='%Y-%m-%d', errors='coerce'),
    # 1 when the lead has a won date, 0 otherwise.
    target=lambda df: np.where(df['won_date'].isnull(), 0, 1),
    # Only the first 10 characters (the date part) of won_date are parsed.
    won_date=lambda df: pd.to_datetime(df['won_date'].str[:10], format='%Y-%m-%d', errors='coerce'),
    # Missing origins are labelled 'unknown' instead of being left as NaN.
    origin=lambda df: np.where(df['origin'].isnull(), 'unknown', df['origin']),
)

# Whole days elapsed between first contact and the deal being won.
df_processed['days_to_convert'] = (df_processed['won_date'] - df_processed['first_contact_date']).dt.days

In [7]:
# Funnel summary per origin; leads labelled 'unknown' are left out.
# Only the third quartile of days_to_convert is aggregated here.
origin_conversion = (
    df_processed[df_processed['origin'] != 'unknown']
    .groupby('origin', as_index=False)
    .agg(
        mql=('mql_id', 'count'),
        won=('target', 'sum'),
        days_to_convert_q3=('days_to_convert', lambda days: days.quantile(0.75)),
    )
)

# Totals across all known origins, used as denominators for the shares below.
total_mql = origin_conversion['mql'].sum()
total_won = origin_conversion['won'].sum()

# Derived metrics, added in the same column order as before:
#   mql_percentage / won_percentage: share of all MQLs / wins coming from the origin
#   weighted_conversion: conversion ratio scaled by the natural log of the MQL count
#   conversion: wins per MQL, as a percentage
origin_conversion = origin_conversion.assign(
    mql_percentage=lambda t: (t['mql'] / total_mql * 100).round(2),
    won_percentage=lambda t: (t['won'] / total_won * 100).round(2),
    weighted_conversion=lambda t: (t['won'] / t['mql']) * np.log(t['mql']),
    conversion=lambda t: (t['won'] / t['mql'] * 100).round(2),
)

In [8]:
# Origins ranked from highest to lowest weighted_conversion, rounded to 2 decimals for display.
origin_conversion.sort_values(by='weighted_conversion', ascending=False).round(2)

Unnamed: 0,origin,mql,won,days_to_convert_q3,mql_percentage,won_percentage,weighted_conversion,conversion
3,organic_search,2296,271,55.5,33.56,41.76,0.91,11.8
6,paid_search,1586,195,80.0,23.18,30.05,0.91,12.3
0,direct_traffic,499,56,32.25,7.29,8.63,0.7,11.22
7,referral,284,24,35.25,4.15,3.7,0.48,8.45
8,social,1350,75,84.0,19.73,11.56,0.4,5.56
1,display,118,6,14.5,1.72,0.92,0.24,5.08
5,other_publicities,65,3,41.5,0.95,0.46,0.19,4.62
2,email,493,15,62.0,7.21,2.31,0.19,3.04
4,other,150,4,20.25,2.19,0.62,0.13,2.67


In [9]:
# Distribution of days_to_convert per origin, excluding leads labelled 'unknown'.
px.box(
    data_frame=df_processed.query("origin != 'unknown'"),
    x='origin',
    y='days_to_convert',
    color='origin',
)

In [11]:
# Bubble chart: share of MQLs (x) against conversion rate (y), one bubble per origin.
fig = px.scatter(
    data_frame=origin_conversion,
    x="mql_percentage",
    y="conversion",
    size="weighted_conversion",   # bubble size follows weighted_conversion
    size_max=60,                  # cap on the largest bubble
    color="origin",               # one colour per origin
    hover_name="origin",          # origin name shown on hover
    labels={
        "mql_percentage": "% de MQLs",
        "conversion": "Tasa de Conversión (%)"
    },
    title="Comparación de Orígenes de MQLs",
)

fig.show()

In [30]:
# Bubble chart: share of MQLs (x) against conversion rate (y), coloured by time to convert.
fig = px.scatter(
    data_frame=origin_conversion,
    x="mql_percentage",
    y="conversion",
    size="weighted_conversion",   # bubble size follows weighted_conversion
    size_max=60,                  # cap on the largest bubble
    color="days_to_convert_q3",   # colour follows the third quartile of days_to_convert
    hover_name="origin",          # origin name shown on hover
    labels={
        "mql_percentage": "% de MQLs",
        "conversion": "Tasa de Conversión (%)"
    },
    title="Comparación de Orígenes de MQLs",
)

fig.show()

In [44]:
# Number of MQLs per first-contact date, as a two-column frame.
mql_daily_series = (
    df_processed
    .groupby('first_contact_date', as_index=False)
    .agg(mql_count=('mql_id', 'count'))
)

In [46]:
# Daily MQL volume over time.
px.line(
    data_frame=mql_daily_series,
    x='first_contact_date',
    y='mql_count',
    title='MQLs por día',
)

In [51]:
# First seven characters of the contact date's string form
# (the year-month part for ISO-formatted dates).
df_processed['contact_period'] = (
    df_processed['first_contact_date']
    .astype(str)
    .str.slice(stop=7)
)


In [53]:
# Number of MQLs per contact_period, as a two-column frame.
mql_period_series = (
    df_processed
    .groupby('contact_period', as_index=False)
    .agg(mql_count=('mql_id', 'count'))
)

In [56]:
# (rows, columns) of the per-period table, i.e. how many contact periods exist.
mql_period_series.shape

(12, 2)

In [55]:
# contact_period is the year-month prefix of the contact date, so this is a
# monthly series; the title now says "por mes" instead of "por día".
px.line(mql_period_series, x='contact_period', y='mql_count', title='MQLs por mes')

In [57]:
# Daily MQL counts, with each date tagged by its contact_period.
mql_series = (
    df_processed
    .groupby(['first_contact_date', 'contact_period'], as_index=False)
    .agg(mql_count=('mql_id', 'count'))
)

In [58]:
# Spread of the daily MQL counts within each contact period.
px.box(
    data_frame=mql_series,
    x='contact_period',
    y='mql_count',
)

In [45]:
# Display the daily MQL counts.
mql_daily_series

Unnamed: 0,first_contact_date,mql_count
0,2017-06-14,2
1,2017-06-16,1
2,2017-06-20,1
3,2017-07-02,1
4,2017-07-03,1
...,...,...
331,2018-05-27,19
332,2018-05-28,57
333,2018-05-29,37
334,2018-05-30,30


In [39]:
# Daily MQL counts as a Series named 'mql_count': group by the day period of
# first_contact_date, count lead ids, then convert the index back to timestamps.
# NOTE(review): this rebinds mql_daily_series, which an earlier cell defines as a
# DataFrame. The saved output of this cell also shows a TypeError, so re-run to confirm.
mql_daily_series = (
    df_processed
    .groupby(df_processed['first_contact_date'].dt.to_period('d'))['mql_id']
    .count()
    .rename('mql_count')
    .to_timestamp()
)

# Same counts wrapped in a single-column DataFrame.
mql_daily_df_constructor = pd.DataFrame(mql_daily_series)





TypeError: Index(...) must be called with a collection of some kind, 'mql_count' was passed

In [40]:
# Display the single-column DataFrame of daily MQL counts.
mql_daily_df_constructor

Unnamed: 0_level_0,mql_count
first_contact_date,Unnamed: 1_level_1
2017-06-14,2
2017-06-16,1
2017-06-20,1
2017-07-02,1
2017-07-03,1
...,...
2018-05-27,19
2018-05-28,57
2018-05-29,37
2018-05-30,30


In [35]:
# MQLs per month: count lead ids grouped by the calendar month of first contact.
# The period frequency is 'M' (month), matching won_monthly; it was 'd' (day),
# which produced a daily series under a monthly name.
mql_monthly = (
    df_processed
    .groupby(df_processed['first_contact_date'].dt.to_period('M'))['mql_id']
    .count()
    .rename('mql_count')
    .to_timestamp()  # period index back to timestamps
)

In [None]:


# Won deals per month: count lead ids grouped by the calendar month of won_date.
# Reads from df_processed, where won_date is parsed to datetime; the previous
# version used `df`, which is not defined at notebook level.
won_monthly = (
    df_processed[df_processed['won_date'].notna()]
    .groupby(df_processed['won_date'].dt.to_period('M'))['mql_id']
    .count()
    .rename('won_count')
    .to_timestamp()  # period index back to timestamps
)

In [33]:
# Declared monthly revenue for rows that have a sales rep (sr_id) and a revenue above zero.
# Requires df_mkt_closed to still contain the sr_id column.
# NOTE(review): the saved output lists 0.0 values, which the `> 0` filter would exclude;
# it looks stale, so re-run to confirm.
df_mkt_closed.query(" sr_id.notna() and declared_monthly_revenue > 0 ")[['declared_monthly_revenue']]

Unnamed: 0,declared_monthly_revenue
4,0.0
12,0.0
14,0.0
39,0.0
67,0.0
...,...
7978,0.0
7983,200000.0
7991,0.0
7994,0.0


In [32]:
# Distinct declared_monthly_revenue values among rows that have a sales rep (sr_id).
# Requires df_mkt_closed to still contain the sr_id column.
df_mkt_closed.query(" sr_id.notna() ")['declared_monthly_revenue'].unique()

array([0.0e+00, 1.0e+05, 2.0e+04, 6.0e+03, 1.8e+05, 3.0e+04, 6.0e+00,
       1.5e+05, 1.0e+04, 2.5e+04, 5.0e+07, 2.1e+05, 1.5e+04, 2.5e+05,
       8.0e+06, 4.0e+04, 5.0e+03, 4.0e+03, 3.0e+05, 6.0e+04, 1.0e+03,
       5.0e+04, 5.0e+05, 1.3e+05, 1.2e+05, 8.0e+03, 2.0e+05])

In [26]:
# Share of rows per business_segment value, with NaN counted as its own category.
df_mkt_closed['business_segment'].value_counts(dropna=False, normalize=True)

business_segment
NaN                                0.894875
home_decor                         0.013125
health_beauty                      0.011625
car_accessories                    0.009625
household_utilities                0.008875
construction_tools_house_garden    0.008625
audio_video_electronics            0.008000
computers                          0.004250
pet                                0.003750
food_supplement                    0.003500
food_drink                         0.003250
sports_leisure                     0.003125
bags_backpacks                     0.002750
bed_bath_table                     0.002750
toys                               0.002500
fashion_accessories                0.002375
home_office_furniture              0.001750
stationery                         0.001625
phone_mobile                       0.001625
handcrafted                        0.001500
small_appliances                   0.001500
baby                               0.001250
books          

In [25]:
# Share of rows per business_type value, with NaN counted as its own category.
df_mkt_closed['business_type'].value_counts(dropna=False, normalize=True)

business_type
NaN             0.896000
reseller        0.073375
manufacturer    0.030250
other           0.000375
Name: proportion, dtype: float64

In [None]:
# Share of rows per origin value, with NaN counted as its own category.
df_mkt_closed['origin'].value_counts(dropna=False, normalize=True)

origin
organic_search       0.287000
paid_search          0.198250
social               0.168750
unknown              0.137375
direct_traffic       0.062375
email                0.061625
referral             0.035500
other                0.018750
display              0.014750
other_publicities    0.008125
NaN                  0.007500
Name: proportion, dtype: float64

In [21]:
# Share of rows per lead_type value, with NaN counted as its own category.
df_mkt_closed['lead_type'].value_counts(dropna=False, normalize=True)

lead_type
NaN                0.895500
online_medium      0.041500
online_big         0.015750
industry           0.015375
offline            0.013000
online_small       0.009625
online_beginner    0.007125
online_top         0.001750
other              0.000375
Name: proportion, dtype: float64

In [22]:
# Share of rows per lead_behaviour_profile value, with NaN counted as its own category.
df_mkt_closed['lead_behaviour_profile'].value_counts(dropna=False, normalize=True)

lead_behaviour_profile
NaN            0.916875
cat            0.050875
eagle          0.015375
wolf           0.011875
shark          0.003000
cat, wolf      0.001000
eagle, wolf    0.000375
eagle, cat     0.000375
shark, cat     0.000125
shark, wolf    0.000125
Name: proportion, dtype: float64

In [23]:
# Total de MQLs (todos los leads en el dataset)
n_mql = df_mkt_closed['mql_id'].nunique()

# SQLs: los que tienen un vendedor asignado (ej: sr_id no nulo)
n_sql = df_mkt_closed[df_mkt_closed['sr_id'].notna()]['mql_id'].nunique()

# Won: los que tienen una fecha de conversión
n_won = df_mkt_closed[df_mkt_closed['won_date'].notna()]['mql_id'].nunique()

print("MQL:", n_mql)
print("SQL:", n_sql)
print("Won:", n_won)

MQL: 8000
SQL: 842
Won: 842


In [24]:
# End-to-end rate first, then the two stage-to-stage rates; all are plain ratios of the stage counts.
conversion_mql_to_won = n_won / n_mql
conversion_mql_to_sql = n_sql / n_mql
conversion_sql_to_won = n_won / n_sql

# Report each rate as a percentage with two decimals.
print(f"Tasa MQL → SQL: {conversion_mql_to_sql:.2%}")
print(f"Tasa SQL → Won: {conversion_sql_to_won:.2%}")
print(f"Tasa MQL → Won: {conversion_mql_to_won:.2%}")

Tasa MQL → SQL: 10.53%
Tasa SQL → Won: 100.00%
Tasa MQL → Won: 10.53%


In [42]:
# Column names, non-null counts and dtypes of the merged leads / closed-deals frame.
df_mkt_closed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   mql_id                         8000 non-null   object        
 1   first_contact_date             8000 non-null   datetime64[ns]
 2   landing_page_id                8000 non-null   object        
 3   origin                         8000 non-null   object        
 4   seller_id                      842 non-null    object        
 5   sdr_id                         842 non-null    object        
 6   sr_id                          842 non-null    object        
 7   won_date                       842 non-null    datetime64[ns]
 8   business_segment               841 non-null    object        
 9   lead_type                      836 non-null    object        
 10  lead_behaviour_profile         665 non-null    object        
 11  has_company      