In [1]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the input datasets used below: customers, the NPS survey and the
# "indique e ganhe" table.

# Customers. Per the original author's note, this dataset already had its
# dtypes adjusted in section '1.1'.
df_customers = pd.read_parquet('../data/raw/3m/customers.parquet')

# NPS survey export: a pipe-delimited CSV file. The explicit encoding is
# required because, with the default one, special characters (accents, Ç,
# ^, ~, ...) were not recognised.
df_nps = pd.read_csv(
    '../data/raw/nps/PESQUISA_SATISFACAO_NPS_20211108_151208.csv',
    sep='|',
    encoding='iso-8859-1',
)

# "Indique e ganhe" table.
df_indique_ganhe = pd.read_parquet('../data/raw/3m/customer_indique_ganhe.parquet')

In [4]:
# Every row of the "indique e ganhe" table receives the same fixed
# recommendation score.
# NOTE(review): presumably customers in this table are treated as having
# given the top score — confirm the business rule behind the value 10.
NOTA_RECOMENDACAO_INDIQUE_GANHE = 10
df_indique_ganhe['nota_recomendacao'] = NOTA_RECOMENDACAO_INDIQUE_GANHE

### Definition of data types

In [5]:
# Coerce the raw NPS survey columns to their proper dtypes.

# Numeric: values that cannot be parsed as numbers become NaN.
df_nps['NOTA_RECOMENDACAO'] = pd.to_numeric(df_nps['NOTA_RECOMENDACAO'], errors='coerce')

# Datetime.
for date_col in ('CREATED_DATE_', 'MODIFIED_DATE_'):
    df_nps[date_col] = pd.to_datetime(df_nps[date_col])

# String.
# EMAIL is the key of the later merge with the customers frame. Missing
# e-mails are left stringified (as 'nan'): pandas matches null merge keys
# against each other, so restoring NaN here could pair survey rows without
# an e-mail with customers without an e-mail.
# NOTE(review): consider dropping survey rows that have no e-mail instead.
df_nps['EMAIL'] = df_nps['EMAIL'].astype(str)

# Free-text answers: a plain astype(str) would turn missing answers into the
# literal text 'nan', which would then be saved as if it were a real answer.
# Convert only the values that are present and keep the rest as nulls.
for text_col in ('SATISFACAO', 'SUGESTAO'):
    is_present = df_nps[text_col].notna()
    df_nps[text_col] = df_nps[text_col].astype(str).where(is_present)

In [6]:
# Numeric columns of the customers frame. Each one is converted with
# pd.to_numeric; values that cannot be parsed become NaN.
# NOTE(review): if cpf / zipcode / phonenumber arrive as text, converting
# them to numbers discards any leading zeros — confirm this is acceptable.
numeric_customer_columns = [
    'consumer_id',
    'role_id',
    'cpf',
    'zipcode',
    'street_number',
    'phonenumber',
    'paypal_cpf',
    'allow_newsletter',
    'allow_promos',
    'paypal_allow_automatic_payment',
    'allow_automatic_payment',
    'paypal_subscription_canceled',
    'active',
    'signup_complete',
    'city_id',
    'year_ptt',
    'month_ptt',
    'day_ptt',
]

for numeric_col in numeric_customer_columns:
    df_customers[numeric_col] = pd.to_numeric(df_customers[numeric_col], errors='coerce')

In [7]:
# Keep a single row per consumer_id (the first occurrence in row order).
df_customers = df_customers.drop_duplicates(subset=['consumer_id'], keep='first')

In [8]:
# Make the merge key numeric. Unlike the customers columns, no coercion is
# requested here, so a value that cannot be parsed raises an error.
df_indique_ganhe['consumer_id'] = pd.to_numeric(
    df_indique_ganhe['consumer_id'],
    errors='raise',
)

### Renaming columns

In [9]:
# Mapping from the raw (upper-case) NPS column names to the lower-case names
# used in the rest of the notebook.
# NOTE(review): CREATED_DATE_ keeps its trailing underscore while
# MODIFIED_DATE_ loses it; the date filter further down reads 'created_date_',
# so the two names are intentionally left as they are here.
columns = {
    "EMAIL": "email",
    "NOTA_RECOMENDACAO": "nota_recomendacao",
    "SATISFACAO": "satisfacao",
    "SUGESTAO": "sugestao",
    "CREATED_DATE_": "created_date_",
    "MODIFIED_DATE_": "modified_date",
}
df_nps = df_nps.rename(columns=columns)

In [10]:
# Preview the first rows of the renamed NPS frame.
# NOTE(review): the rendered output includes respondents' e-mail addresses;
# clear or redact it before sharing or committing the notebook.
df_nps.head(n=5)

Unnamed: 0,email,nota_recomendacao,satisfacao,sugestao,created_date_,modified_date
0,roneyolliiver@hotmail.com,10.0,muito satisfeito,"Muito bom, recomendo a todos a mais de 3 anos ...",2020-09-01 14:04:50,2020-09-01 14:04:50
1,matimbu8@gmail.com,7.0,satisfeito,O atendimento foi ótimo os frentistas me ajudo...,2020-09-01 14:04:52,2020-09-01 14:04:52
2,tiagofranco31@gmail.com,3.0,insatisfeito,"Falhou o pagamento, erro na hora pagar, pouco ...",2020-09-01 14:04:54,2020-09-01 14:04:54
3,pontepreta.paineiras@gmail.com,10.0,muito satisfeito,Amoooo qdo recebo código de desconto... até ab...,2020-09-01 14:04:55,2020-09-01 14:04:55
4,machabas@gmail.com,1.0,insatisfeito,"Apresentam promoção, quando chego pra abastece...",2020-09-01 14:04:57,2021-04-14 11:25:29


### Merging DataFrames

In [11]:
# Left join: every customer row is kept; "indique e ganhe" columns are filled
# in where the consumer_id matches and left null otherwise.
df_customer_ig = pd.merge(
    df_customers,
    df_indique_ganhe,
    how='left',
    on='consumer_id',
)

In [12]:
# Left join of the NPS answers onto the customers frame by e-mail. Column
# names present on both sides (e.g. 'nota_recomendacao') are disambiguated
# with the '_customer' (left) and '_nps' (right) suffixes.
df_customer_ig_nps = pd.merge(
    df_customer_ig,
    df_nps,
    how='left',
    on='email',
    suffixes=('_customer', '_nps'),
)

### Filtering the final DataFrame

In [13]:
# Boolean row masks over the merged frame.
# has_nps_1_mask: the score coming from the left side of the merge is present.
has_nps_1_mask = df_customer_ig_nps['nota_recomendacao_customer'].notna()
# has_nps_2_mask: the score coming from the NPS survey is present.
has_nps_2_mask = df_customer_ig_nps['nota_recomendacao_nps'].notna()
# date_range_mask: the survey answer was created after 2021-10-24 00:00:00
# (rows without a creation date evaluate to False).
date_range_mask = df_customer_ig_nps['created_date_'].gt('2021-10-24 00:00:00')

In [14]:
# Keep a row when it has a left-side score, or when it has a survey score
# whose answer falls inside the date range.
keep_rows_mask = has_nps_1_mask | (has_nps_2_mask & date_range_mask)
df_customer_ig_nps = df_customer_ig_nps.loc[keep_rows_mask]

In [15]:
# Build the final 'nps' column: take the left-side score when present and
# fall back to the survey score otherwise.
# NOTE(review): despite its name, this column holds the individual 0-10 style
# recommendation score, not an aggregated Net Promoter Score.
left_side_score = df_customer_ig_nps['nota_recomendacao_customer']
survey_score = df_customer_ig_nps["nota_recomendacao_nps"]
df_customer_ig_nps = df_customer_ig_nps.assign(nps=left_side_score.fillna(survey_score))

In [20]:
# (rows, columns) of the filtered frame, before duplicates are removed.
df_customer_ig_nps.shape

(5306, 59)

In [25]:
# Keep a single row per consumer_id (the first occurrence in row order).
# NOTE(review): when a customer has several matching rows, the one kept
# depends only on the current row order — confirm that is the intended pick.
df_customer_ig_nps = df_customer_ig_nps.drop_duplicates(
    subset=['consumer_id'],
    keep='first',
)

In [26]:
# (rows, columns) of the frame after removing duplicated consumer_id rows.
df_customer_ig_nps.shape

(5062, 59)

### Saving final DataFrame

In [28]:
# Persist the final customers + "indique e ganhe" + NPS frame to the stage area.
stage_output_path = '../data/stage/3m/customers_nps_ig.parquet'
df_customer_ig_nps.to_parquet(stage_output_path)