In [28]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
df_nps = pd.read_parquet('../data/stage/3m/customers_nps_churn_palive.parquet')

In [29]:
columns = ['consumer_id', 'nps', 'mercadopago_id', 'allow_promos'] 
features = pd.DataFrame()
features[columns] =  df_nps[columns]

In [4]:
df_push = pd.read_parquet('../data/stage/3m/customers_push.parquet')
df_email_open = pd.read_parquet('../data/raw/3m/80464_open.snappy.parquet')
df_email_sent = pd.read_parquet('../data/raw/3m/80464_sent.snappy.parquet')
df_zendesk = pd.read_parquet('../data/stage/3m/customers_zendesk.parquet')
df_nps_loyalty = pd.read_parquet('../data/stage/3m/customers_loyalty.parquet')
df_transactions = pd.read_parquet('../data/stage/3m/customers_transactions_vouchers_ratings.parquet')

### Push features

In [30]:
df_push['month_ptt'] = pd.to_numeric(df_push['month_ptt'], errors='coerce', downcast='integer')
df_push['year_ptt'] = pd.to_numeric(df_push['year_ptt'], errors='coerce', downcast='integer')

In [31]:
mask_year = df_push['year_ptt'] == 2021
mask_month_1m = df_push['month_ptt'] > 9
mask_month_3m = df_push['month_ptt'] > 7

In [32]:
# PUSH_BOUNCED PUSH_OPEN
'''
total_pushs_bounced_1m
total_pushs_bounced_3m
total_open_pushs_1m
total_sent_pushs_1m
'''

df_push_ = df_push[mask_year & mask_month_1m][['consumer_id', 'EVENT_UUID']].groupby('consumer_id').count()\
    .rename(columns={'EVENT_UUID':'total_sent_pushs_1m'})
features = features.merge(df_push_, on='consumer_id', how='left')

df_push_ = df_push[mask_year & mask_month_1m][['consumer_id', 'PUSH_OPEN']].groupby('consumer_id').sum()\
    .rename(columns={'PUSH_OPEN':'total_open_pushs_1m'})
features = features.merge(df_push_, on='consumer_id', how='left')

df_push_ = df_push[mask_year & mask_month_1m][['consumer_id', 'PUSH_BOUNCED']].groupby('consumer_id').sum()\
    .rename(columns={'PUSH_BOUNCED':'total_pushs_bounced_1m'})
features = features.merge(df_push_, on='consumer_id', how='left')

df_push_ = df_push[mask_year & mask_month_3m][['consumer_id', 'PUSH_BOUNCED']].groupby('consumer_id').sum()\
    .rename(columns={'PUSH_BOUNCED':'total_pushs_bounced_3m'})
features = features.merge(df_push_, on='consumer_id', how='left')
features = features.fillna(0)

### Emails Open

In [33]:
df_email_open = df_email_open.rename(columns={'CUSTOMER_ID':'consumer_id'})

df_email_open['month_ptt'] = pd.to_numeric(df_email_open['month_ptt'], errors='coerce', downcast='integer')
df_email_open['year_ptt'] = pd.to_numeric(df_email_open['year_ptt'], errors='coerce', downcast='integer')
df_email_open['consumer_id'] = pd.to_numeric(df_email_open['consumer_id'], errors='coerce', downcast='integer')

In [34]:
mask_year = df_email_open['year_ptt'] == 2021
mask_month_1m = df_email_open['month_ptt'] > 9
mask_month_3m = df_email_open['month_ptt'] > 7

In [35]:
df_open_ = df_email_open[mask_year & mask_month_1m][['consumer_id', 'BROWSER']].groupby('consumer_id').count()\
    .rename(columns={'BROWSER':'total_emails_open_1m'})
features = features.merge(df_open_, on='consumer_id', how='left')

df_open_ = df_email_open[mask_year & mask_month_3m][['consumer_id', 'BROWSER']].groupby('consumer_id').count()\
    .rename(columns={'BROWSER':'total_emails_open_3m'})
features = features.merge(df_open_, on='consumer_id', how='left')

### Emails Sent

In [36]:
df_email_sent = df_email_sent.rename(columns={'CUSTOMER_ID':'consumer_id'})

df_email_sent['month_ptt'] = pd.to_numeric(df_email_sent['month_ptt'], errors='coerce', downcast='integer')
df_email_sent['year_ptt'] = pd.to_numeric(df_email_sent['year_ptt'], errors='coerce', downcast='integer')
df_email_sent['consumer_id'] = pd.to_numeric(df_email_sent['consumer_id'], errors='coerce', downcast='integer')

In [37]:
mask_year = df_email_sent['year_ptt'] == 2021
mask_month_1m = df_email_sent['month_ptt'] > 9
mask_month_3m = df_email_sent['month_ptt'] > 7

In [38]:
df_sent_ = df_email_sent[mask_year & mask_month_1m][['consumer_id', 'EMAIL']].groupby('consumer_id').count()\
    .rename(columns={'EMAIL':'total_emails_sent_1m'})
features = features.merge(df_sent_, on='consumer_id', how='left')

df_sent_ = df_email_sent[mask_year & mask_month_3m][['consumer_id', 'EMAIL']].groupby('consumer_id').count()\
    .rename(columns={'EMAIL':'total_emails_sent_3m'})
features = features.merge(df_sent_, on='consumer_id', how='left')

### Transactions DataFrame

In [39]:
df_transactions['consumer_id'] = pd.to_numeric(df_transactions['consumer_id'], errors='coerce', downcast='integer')
df_transactions['rating_value'] = pd.to_numeric(df_transactions['rating_value'], errors='coerce', downcast='integer')

In [40]:
mask_month_1m = df_transactions['created_at'] > '2021-10-01'
mask_month_3m = df_transactions['created_at'] > '2021-08-01'
mask_month_3m_u = (df_transactions['created_at'] > '2021-08-01') & (df_transactions['created_at'] < '2021-09-01')
mask_rating_reason = df_transactions['rating_reason'] != ''
mask_partner = df_transactions['payout_partner_id'].notna()

In [41]:
#distinct_reason_1m
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'rating_reason']].groupby('consumer_id')\
    .nunique().rename(columns={'rating_reason':'distinct_reason_1m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#distinct_reason_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'rating_reason']].groupby('consumer_id')\
    .nunique().rename(columns={'rating_reason':'distinct_reason_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mode_rating_reason_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'rating_reason']].groupby('consumer_id')\
    .agg(pd.Series.mode).rename(columns={'rating_reason':'mode_rating_reason_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#max_diff_total_ratings
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .max().rename(columns={'rating_value':'max_rating'})
df_transaction_2 = df_transactions[mask_month_3m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .min().rename(columns={'rating_value':'max_rating'})

df_transaction_['max_diff_total_ratings'] = df_transaction_ - df_transaction_2
df_transaction_ = df_transaction_[['max_diff_total_ratings']]
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mm_total_ratings_reason
df_transaction_ = df_transactions[mask_month_3m & mask_rating_reason][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .mean().rename(columns={'rating_value':'mm_total_ratings_reason'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mm_total_ratings
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .sum().rename(columns={'rating_value':'mm_total_ratings'})/3
features = features.merge(df_transaction_, on='consumer_id', how='left')

#trend_total_ratings
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .mean().rename(columns={'rating_value':'trend_total_ratings_'})
df_transaction_2 = df_transactions[mask_month_3m_u][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .mean().rename(columns={'rating_value':'trend_total_ratings_'})

df_transaction_['trend_total_ratings'] = df_transaction_ - df_transaction_2
df_transaction_ = df_transaction_[['trend_total_ratings']]
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mean_rating_1m
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .mean().rename(columns={'rating_value':'mean_rating_1m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mean_rating_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'rating_value']].groupby('consumer_id')\
    .mean().rename(columns={'rating_value':'mean_rating_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#### Vouchers

In [42]:
#distinct_voucher_id_1m
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'voucher_id_vouchers']].groupby('consumer_id')\
    .nunique().rename(columns={'voucher_id_vouchers':'distinct_voucher_id_1m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#distinct_voucher_id_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'voucher_id_vouchers']].groupby('consumer_id')\
    .nunique().rename(columns={'voucher_id_vouchers':'distinct_voucher_id_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mode_channel_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'reward_id_vouchers']].groupby('consumer_id')\
    .agg(pd.Series.mode).rename(columns={'reward_id_vouchers':'mode_channel_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mm_vouchers
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'reward_id_vouchers']].groupby('consumer_id')\
    .count().rename(columns={'reward_id_vouchers':'mm_vouchers'})/3
features = features.merge(df_transaction_, on='consumer_id', how='left')

#distinct_redeemed_voucher_id_1m

#mm_redeemed_vouchers
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'voucher_id_vouchers']].groupby('consumer_id')\
    .count().rename(columns={'voucher_id_vouchers':'mm_redeemed_vouchers'})/3
features = features.merge(df_transaction_, on='consumer_id', how='left')

#trend_redeemed_vouchers
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'voucher_id_vouchers']].groupby('consumer_id')\
    .count().rename(columns={'voucher_id_vouchers':'trend_redeemed_vouchers'})
df_transaction_2 = df_transactions[mask_month_3m_u][['consumer_id', 'voucher_id_vouchers']].groupby('consumer_id')\
    .count().rename(columns={'voucher_id_vouchers':'trend_redeemed_vouchers'})

df_transaction_['trend_redeemed_vouchers'] = df_transaction_ - df_transaction_2
df_transaction_ = df_transaction_[['trend_redeemed_vouchers']]
features = features.merge(df_transaction_, on='consumer_id', how='left')

#distinct_reward_id_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'voucher_discount']].groupby('consumer_id')\
    .nunique().rename(columns={'voucher_discount':'distinct_reward_id_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mean_redeemed_discount_amount_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'voucher_discount']].groupby('consumer_id')\
    .mean().rename(columns={'voucher_discount':'mean_redeemed_discount_amount_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#sum_redeemed_discount_amount_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'voucher_discount']].groupby('consumer_id')\
    .sum().rename(columns={'voucher_discount':'sum_redeemed_discount_amount_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

### Transactions

In [43]:
#p_discount_1m
df_transaction_ = df_transactions[mask_month_1m][['consumer_id', 'discount_amount']].groupby('consumer_id')\
    .sum().rename(columns={'discount_amount':'discount_amount'})
df_transaction_1 = df_transactions[mask_month_1m][['consumer_id', 'total_in_cents']].groupby('consumer_id')\
    .sum().rename(columns={'total_in_cents':'total_in_cents'})

df_transactions['discount_amount'] = pd.to_numeric(df_transactions['discount_amount'], errors='coerce', downcast='integer')
df_transaction_['discount_amount'] = df_transaction_['discount_amount'].fillna(0).astype('float')

df_transaction_['p_discount_1m'] = (df_transaction_['discount_amount']*100)/df_transaction_1['total_in_cents']

df_transaction_ = df_transaction_.drop('discount_amount', axis=1)
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mean_total_value_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'total_in_cents']].groupby('consumer_id')\
    .mean().rename(columns={'total_in_cents':'mean_total_value_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#sum_total_discont_3m
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'discount_amount']].groupby('consumer_id')\
    .sum().rename(columns={'discount_amount':'sum_total_discont_3m'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#max_diff_stores
df_transaction_ = df_transactions[mask_month_3m][['consumer_id', 'store_id_ratings']].groupby('consumer_id')\
    .nunique().rename(columns={'store_id_ratings':'max_diff_stores'})
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mm_transactions_partners
df_transaction_ = df_transactions[mask_month_3m & mask_partner][['consumer_id', 'transaction_id']].groupby('consumer_id')\
    .count().rename(columns={'transaction_id':'mm_transactions_partners'})/3
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mm_total_value_partners
df_transaction_ = df_transactions[mask_month_3m & mask_partner][['consumer_id', 'total_in_cents']].groupby('consumer_id')\
    .sum().rename(columns={'total_in_cents':'mm_total_value_partners'})/3
features = features.merge(df_transaction_, on='consumer_id', how='left')

#mob
columns = ['consumer_id', 'created_at_transactions']
df_transactions = df_transactions.sort_values(columns)
df_transaction_ = df_transactions[columns]\
    .groupby('consumer_id').first().rename(columns={'created_at_transactions':'mob'})

df_transaction_['mob_'] = '2021-11-01'
df_transaction_['mob_'] = pd.to_datetime(df_transaction_['mob_'])
df_transaction_['mob'] = pd.to_datetime(df_transaction_['mob'])

df_transaction_['mob'] = df_transaction_['mob_'].dt.to_period('M').view(int)\
    - df_transaction_['mob'].dt.to_period('M').view(int)
features = features.merge(df_transaction_, on='consumer_id', how='left')


In [44]:
features['mercadopago_id'] = features['mercadopago_id'] != 0

### Loyalty

In [45]:
df_nps_loyalty['month_ptt'] = pd.to_numeric(df_nps_loyalty['month_ptt'], errors='coerce', downcast='integer')
df_nps_loyalty['year_ptt'] = pd.to_numeric(df_nps_loyalty['year_ptt'], errors='coerce', downcast='integer')
df_nps_loyalty['consumer_id'] = pd.to_numeric(df_nps_loyalty['consumer_id'], errors='coerce', downcast='integer')
df_nps_loyalty['value'] = pd.to_numeric(df_nps_loyalty['value'], errors='coerce', downcast='integer')

In [46]:
mask_year = df_nps_loyalty['year_ptt'] == 2021
mask_month_1m = df_nps_loyalty['month_ptt'] > 9
mask_month_3m = df_nps_loyalty['month_ptt'] > 7

In [47]:
#sum_loyalty_discount_amount_1m
df_nps_loyalty_ = df_nps_loyalty[mask_month_1m][['consumer_id', 'value']].groupby('consumer_id')\
    .sum().rename(columns={'value':'sum_loyalty_discount_amount_1m'})
features = features.merge(df_nps_loyalty_, on='consumer_id', how='left')

#mm_loyalty_amount
df_nps_loyalty_ = df_nps_loyalty[mask_month_3m][['consumer_id', 'value']].groupby('consumer_id')\
    .sum().rename(columns={'value':'mm_loyalty_amount'})/3
features = features.merge(df_nps_loyalty_, on='consumer_id', how='left')

### ZenDesk

In [48]:
df_zendesk['consumer_id'] = pd.to_numeric(df_zendesk['consumer_id'], errors='coerce', downcast='integer')

In [49]:
mask_month_1m = df_zendesk['created_at_y'] > '2021-10-01'
mask_month_3m = df_zendesk['created_at_y'] > '2021-08-01'
mask_bad_ticket = df_zendesk['satisfaction_rating_score'] == 'bad'

In [50]:
#max_num_comments_3m
df_zendesk_ = df_zendesk[mask_month_3m][['consumer_id', 'num_comments']].groupby('consumer_id')\
    .max().rename(columns={'num_comments':'max_num_comments_3m'})
features = features.merge(df_zendesk_, on='consumer_id', how='left')

#tickets_1m
df_zendesk_ = df_zendesk[mask_month_1m][['consumer_id', 'ticket_id']].groupby('consumer_id')\
    .count().rename(columns={'ticket_id':'tickets_1m'})
features = features.merge(df_zendesk_, on='consumer_id', how='left')

#tickets_3m
df_zendesk_ = df_zendesk[mask_month_3m][['consumer_id', 'ticket_id']].groupby('consumer_id')\
    .count().rename(columns={'ticket_id':'tickets_3m'})
features = features.merge(df_zendesk_, on='consumer_id', how='left')

#sum_bad_rating_score_tickets
df_zendesk_ = df_zendesk[mask_month_3m & mask_bad_ticket][['consumer_id', 'ticket_id']].groupby('consumer_id')\
    .count().rename(columns={'ticket_id':'sum_bad_rating_score_tickets'})
features = features.merge(df_zendesk_, on='consumer_id', how='left')

### Saving DataFrame

In [51]:
#Salvando em Parquet
features.to_csv('../data/stage/3m/features_raw.csv')

In [1]:
import pandas as pd

In [2]:
features = pd.read_csv('../data/stage/3m/features_raw.csv')

In [3]:
features.head()

Unnamed: 0.1,Unnamed: 0,consumer_id,nps,mercadopago_id,allow_promos,total_sent_pushs_1m,total_open_pushs_1m,total_pushs_bounced_1m,total_pushs_bounced_3m,total_emails_open_1m,...,mm_transactions_partners,mm_total_value_partners,mob,mob_,sum_loyalty_discount_amount_1m,mm_loyalty_amount,max_num_comments_3m,tickets_1m,tickets_3m,sum_bad_rating_score_tickets
0,0,3923437,1,True,1,0.0,0.0,0.0,5.0,,...,4.0,26693.666667,3,2021-11-01,220.0,-374.0,,,,
1,1,3913125,1,True,1,0.0,0.0,0.0,0.0,2.0,...,0.666667,23106.666667,3,2021-11-01,-38.0,-12.666667,,,,
2,2,3927102,1,False,1,0.0,0.0,0.0,0.0,2.0,...,,,3,2021-11-01,323.0,184.666667,,,,
3,3,3923444,0,True,1,0.0,0.0,0.0,0.0,1.0,...,,,3,2021-11-01,,11.333333,,,,
4,4,3928419,1,True,1,0.0,0.0,0.0,1.0,1.0,...,0.666667,2666.666667,3,2021-11-01,23.0,7.666667,,,,
