In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the two raw Olist tables: closed deals first, then marketing qualified leads.
df_closed, df_mkt = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/olist_closed_deals_dataset.csv",
        "../data/raw/olist_marketing_qualified_leads_dataset.csv",
    )
)

In [4]:
# Number of distinct MQL ids in the marketing-qualified-leads table.
df_mkt['mql_id'].nunique()

8000

In [7]:
def get_low_completion_columns(df, seller_id_col='seller_id', threshold=80):
    """
    Return the columns whose completion rate is below ``threshold``.

    A column's completion rate is its number of non-null values divided by the
    number of distinct non-null values in ``seller_id_col``, expressed as a
    percentage and rounded to two decimals. Because the denominator is the
    count of unique sellers (not the number of rows), a rate can exceed 100
    when ``df`` has more non-null rows than unique sellers.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to analyse.
    seller_id_col : str
        Name of the column holding the seller id.
    threshold : int
        Minimum required completion percentage (0-100).

    Returns
    -------
    list
        Column labels, in ``df`` column order, whose completion rate is
        strictly below ``threshold``.

    Raises
    ------
    KeyError
        If ``seller_id_col`` is not a column of ``df``.
    """
    # Denominator: number of distinct sellers (nulls are not counted).
    total_sellers = df[seller_id_col].nunique()

    # Non-null count per column as a percentage of the unique-seller count.
    completion_rates = (df.count() / total_sellers * 100).round(2)

    # With zero unique sellers, an empty column evaluates to 0 / 0 = NaN, and
    # NaN < threshold is False, so the column would silently escape the filter.
    # NaN can only arise in that degenerate case; treat it as 0% complete.
    completion_rates = completion_rates.fillna(0)

    return completion_rates[completion_rates < threshold].index.tolist()

In [20]:
# Left-join every MQL with its closed deal, if any. `df_mkt_closed` is kept as the
# full merged frame: later cells read columns such as `sr_id` from it, so it must
# not be overwritten with the column-reduced version.
df_mkt_closed = df_mkt.merge(df_closed, on='mql_id', how='left')

# Columns whose non-null count is below 60% of the number of unique sellers.
low_quality_columns = get_low_completion_columns(df_mkt_closed, threshold=60)

# Identifier columns plus the sparsely populated columns are excluded from the
# processed frame.
list_drop_columns = ['landing_page_id', 'seller_id', 'sdr_id', 'sr_id'] + low_quality_columns

df_processed = (
    df_mkt_closed
    .drop(columns=list_drop_columns)
    .assign(
        # 1 when the raw won_date is present, else 0. Computed before the date
        # parsing below so a present-but-unparseable date still counts as won.
        target=lambda d: d['won_date'].notna().astype(int),
        first_contact_date=lambda d: pd.to_datetime(
            d['first_contact_date'], format='%Y-%m-%d', errors='coerce'),
        # Only the first 10 characters (the date part) of won_date are parsed.
        won_date=lambda d: pd.to_datetime(
            d['won_date'].str[:10], format='%Y-%m-%d', errors='coerce'),
        origin=lambda d: d['origin'].fillna('unknown'),
        # Whole days between first contact and the won date (NaN when not won).
        days_to_convert=lambda d: (d['won_date'] - d['first_contact_date']).dt.days,
    )
)

In [29]:
# Per-origin lead volume (mql) and number of won deals (won).
origin_conversion = (
    df_processed
    .groupby('origin', as_index=False)
    .agg(
        mql=('mql_id', 'count'),
        won=('target', 'sum'),
    )
)

# Grand totals used as denominators for the share columns.
total_mql = origin_conversion['mql'].sum()
total_won = origin_conversion['won'].sum()

# Share of all MQLs, share of all wins, and win rate within each origin (all in %).
origin_conversion = origin_conversion.assign(
    mql_percentage=lambda t: t['mql'].div(total_mql).mul(100).round(2),
    won_percentage=lambda t: t['won'].div(total_won).mul(100).round(2),
    conversion=lambda t: t['won'].div(t['mql']).mul(100).round(2),
)

In [30]:
# Display the per-origin table of MQL counts, wins, shares and conversion rate.
origin_conversion

Unnamed: 0,origin,mql,won,mql_percentage,won_percentage,conversion
0,direct_traffic,499,56,6.24,6.65,11.22
1,display,118,6,1.48,0.71,5.08
2,email,493,15,6.16,1.78,3.04
3,organic_search,2296,271,28.7,32.19,11.8
4,other,150,4,1.88,0.48,2.67
5,other_publicities,65,3,0.81,0.36,4.62
6,paid_search,1586,195,19.82,23.16,12.3
7,referral,284,24,3.55,2.85,8.45
8,social,1350,75,16.88,8.91,5.56
9,unknown,1159,193,14.49,22.92,16.65


In [21]:
# Preview the first rows of the processed frame (dates parsed, target and days_to_convert added).
df_processed.head()

Unnamed: 0,mql_id,first_contact_date,origin,won_date,business_segment,lead_type,lead_behaviour_profile,business_type,declared_monthly_revenue,target,days_to_convert
0,dac32acd4db4c29c230538b72f8dd87d,2018-02-01,social,NaT,,,,,,0,
1,8c18d1de7f67e60dbd64e3c07d7e9d5d,2017-10-20,paid_search,NaT,,,,,,0,
2,b4bc852d233dfefc5131f593b538befa,2018-03-22,organic_search,NaT,,,,,,0,
3,6be030b81c75970747525b843c1ef4f8,2018-01-22,email,NaT,,,,,,0,
4,5420aad7fec3549a85876ba1c529bd84,2018-02-21,organic_search,2018-02-26,pet,online_medium,cat,reseller,0.0,1,5.0


In [12]:
# Columns flagged by get_low_completion_columns with threshold=60.
low_quality_columns

['has_company', 'has_gtin', 'average_stock', 'declared_product_catalog_size']

In [65]:
# NOTE(review): `null_analysis` is not defined in any cell visible here; unless it is
# created elsewhere, this cell raises NameError on Restart & Run All — confirm or remove.
null_analysis

Unnamed: 0,null_count,total_unique_sellers,null_percentage,high_null_rate
has_company,7937,842,942.64,True
has_gtin,7936,842,942.52,True
average_stock,7934,842,942.28,True
declared_product_catalog_size,7931,842,941.92,True
lead_behaviour_profile,7335,842,871.14,True
business_type,7168,842,851.31,True
lead_type,7164,842,850.83,True
business_segment,7159,842,850.24,True
sr_id,7158,842,850.12,True
declared_monthly_revenue,7158,842,850.12,True


In [61]:
# Preview the first rows of the processed frame.
# NOTE(review): this repeats an earlier `df_processed.head()` cell, and the saved output
# still lists landing_page_id / seller_id / sdr_id / sr_id, so it appears to predate the
# column drop — re-run or delete.
df_processed.head()

Unnamed: 0,mql_id,first_contact_date,landing_page_id,origin,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue,target,days_to_convert
0,dac32acd4db4c29c230538b72f8dd87d,2018-02-01,88740e65d5d6b056e0cda098e1ea6313,social,,,,NaT,,,,,,,,,,0,
1,8c18d1de7f67e60dbd64e3c07d7e9d5d,2017-10-20,007f9098284a86ee80ddeb25d53e0af8,paid_search,,,,NaT,,,,,,,,,,0,
2,b4bc852d233dfefc5131f593b538befa,2018-03-22,a7982125ff7aa3b2054c6e44f9d28522,organic_search,,,,NaT,,,,,,,,,,0,
3,6be030b81c75970747525b843c1ef4f8,2018-01-22,d45d558f0daeecf3cccdffe3c59684aa,email,,,,NaT,,,,,,,,,,0,
4,5420aad7fec3549a85876ba1c529bd84,2018-02-21,b48ec5f3b04e9068441002a19df93c6c,organic_search,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26,pet,online_medium,cat,,,,reseller,,0.0,1,5.0


In [20]:
# NOTE(review): magic numbers. 842 matches the Won count printed by the funnel cell;
# 7940 is presumably the MQLs with a non-null origin (8000 minus the 0.75% NaN) — confirm,
# and derive both from the data instead of hard-coding them.
842/7940

0.10604534005037783

In [34]:
# Shape of the subset with a non-null sr_id and a strictly positive declared monthly revenue.
df_mkt_closed.query(" sr_id.notna() and declared_monthly_revenue > 0 ").shape

(45, 17)

In [33]:
# Declared monthly revenue for rows with a non-null sr_id and revenue > 0.
# NOTE(review): the saved output lists 0.0 values, which this filter excludes, so the
# output looks stale — re-run the cell.
df_mkt_closed.query(" sr_id.notna() and declared_monthly_revenue > 0 ")[['declared_monthly_revenue']]

Unnamed: 0,declared_monthly_revenue
4,0.0
12,0.0
14,0.0
39,0.0
67,0.0
...,...
7978,0.0
7983,200000.0
7991,0.0
7994,0.0


In [32]:
# Distinct declared monthly revenue values among rows with a non-null sr_id.
df_mkt_closed.query(" sr_id.notna() ")['declared_monthly_revenue'].unique()

array([0.0e+00, 1.0e+05, 2.0e+04, 6.0e+03, 1.8e+05, 3.0e+04, 6.0e+00,
       1.5e+05, 1.0e+04, 2.5e+04, 5.0e+07, 2.1e+05, 1.5e+04, 2.5e+05,
       8.0e+06, 4.0e+04, 5.0e+03, 4.0e+03, 3.0e+05, 6.0e+04, 1.0e+03,
       5.0e+04, 5.0e+05, 1.3e+05, 1.2e+05, 8.0e+03, 2.0e+05])

In [26]:
# Relative frequency of each business segment, with NaN counted as its own category.
df_mkt_closed['business_segment'].value_counts(dropna=False, normalize=True)

business_segment
NaN                                0.894875
home_decor                         0.013125
health_beauty                      0.011625
car_accessories                    0.009625
household_utilities                0.008875
construction_tools_house_garden    0.008625
audio_video_electronics            0.008000
computers                          0.004250
pet                                0.003750
food_supplement                    0.003500
food_drink                         0.003250
sports_leisure                     0.003125
bags_backpacks                     0.002750
bed_bath_table                     0.002750
toys                               0.002500
fashion_accessories                0.002375
home_office_furniture              0.001750
stationery                         0.001625
phone_mobile                       0.001625
handcrafted                        0.001500
small_appliances                   0.001500
baby                               0.001250
books          

In [25]:
# Relative frequency of each business type, with NaN counted as its own category.
df_mkt_closed['business_type'].value_counts(dropna=False, normalize=True)

business_type
NaN             0.896000
reseller        0.073375
manufacturer    0.030250
other           0.000375
Name: proportion, dtype: float64

In [None]:
# Relative frequency of each lead origin, with NaN counted as its own category
# (the raw data has both an explicit 'unknown' value and missing values).
df_mkt_closed['origin'].value_counts(dropna=False, normalize=True)

origin
organic_search       0.287000
paid_search          0.198250
social               0.168750
unknown              0.137375
direct_traffic       0.062375
email                0.061625
referral             0.035500
other                0.018750
display              0.014750
other_publicities    0.008125
NaN                  0.007500
Name: proportion, dtype: float64

In [21]:
# Relative frequency of each lead type, with NaN counted as its own category.
df_mkt_closed['lead_type'].value_counts(dropna=False, normalize=True)

lead_type
NaN                0.895500
online_medium      0.041500
online_big         0.015750
industry           0.015375
offline            0.013000
online_small       0.009625
online_beginner    0.007125
online_top         0.001750
other              0.000375
Name: proportion, dtype: float64

In [22]:
# Relative frequency of each lead behaviour profile, with NaN counted as its own category.
# Some rows carry combined labels such as 'cat, wolf'.
df_mkt_closed['lead_behaviour_profile'].value_counts(dropna=False, normalize=True)

lead_behaviour_profile
NaN            0.916875
cat            0.050875
eagle          0.015375
wolf           0.011875
shark          0.003000
cat, wolf      0.001000
eagle, wolf    0.000375
eagle, cat     0.000375
shark, cat     0.000125
shark, wolf    0.000125
Name: proportion, dtype: float64

In [23]:
# Funnel stage sizes, each measured as a number of distinct MQL ids.

# MQL: every lead in the dataset.
n_mql = df_mkt_closed['mql_id'].nunique()

# SQL: leads that have a sales representative assigned (non-null sr_id).
# NOTE(review): sr_id appears to be populated only for closed deals, which would make
# this stage identical to Won — confirm.
n_sql = df_mkt_closed.dropna(subset=['sr_id'])['mql_id'].nunique()

# Won: leads that have a conversion date (non-null won_date).
n_won = df_mkt_closed.dropna(subset=['won_date'])['mql_id'].nunique()

print("MQL:", n_mql)
print("SQL:", n_sql)
print("Won:", n_won)

MQL: 8000
SQL: 842
Won: 842


In [24]:
# Stage-to-stage conversion rates derived from the funnel counts n_mql, n_sql and n_won.
# NOTE(review): the saved output reports SQL == Won (842 each), so SQL → Won is 100%;
# this looks like an artifact of how SQL is defined rather than a real rate — confirm.
conversion_mql_to_sql = n_sql / n_mql
conversion_sql_to_won = n_won / n_sql
conversion_mql_to_won = n_won / n_mql

print(f"Tasa MQL → SQL: {conversion_mql_to_sql:.2%}")
print(f"Tasa SQL → Won: {conversion_sql_to_won:.2%}")
print(f"Tasa MQL → Won: {conversion_mql_to_won:.2%}")

Tasa MQL → SQL: 10.53%
Tasa SQL → Won: 100.00%
Tasa MQL → Won: 10.53%


In [42]:
# Column dtypes and non-null counts of the merged MQL / closed-deals frame.
df_mkt_closed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   mql_id                         8000 non-null   object        
 1   first_contact_date             8000 non-null   datetime64[ns]
 2   landing_page_id                8000 non-null   object        
 3   origin                         8000 non-null   object        
 4   seller_id                      842 non-null    object        
 5   sdr_id                         842 non-null    object        
 6   sr_id                          842 non-null    object        
 7   won_date                       842 non-null    datetime64[ns]
 8   business_segment               841 non-null    object        
 9   lead_type                      836 non-null    object        
 10  lead_behaviour_profile         665 non-null    object        
 11  has_company      