# Packages import
Vamos usar o pacote Pandas

In [1]:
import pandas as pd

# Remove the limit on how many columns pandas shows when displaying a DataFrame.
pd.options.display.max_columns = None

# Dataset import
Para esta sessão, serão utilizados dois datasets, que vamos mesclar pela chave 'transaction_id' (no dataset de ratings essa chave vem da coluna 'ref_id').

In [2]:
# Load the full 'nps' dataset (notebook5).
nps = pd.read_parquet(
    path='../data/stage/3m/customers_transactions_vouchers.parquet',
)

# Load the full 'ratings' dataset.
ratings = pd.read_parquet(
    path='../data/raw/3m/acelera_prd_ratings.snappy.parquet',
)

# Show datasets
Breve visualização dos dados

In [3]:
# Print the summary of `nps`: row count plus, per column, non-null count and dtype.
# max_cols=161 is the column count above which info() switches to the truncated
# summary.
# NOTE(review): the previous comment quoted 1969416 rows and 144 columns, which
# does not match the recorded output below (48817 rows, 94 columns) — confirm
# which dataset size is intended.
nps.info(max_cols=161)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48817 entries, 0 to 48816
Data columns (total 94 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   consumer_id                       48817 non-null  int64  
 1   role_id                           48817 non-null  int64  
 2   cpf                               48817 non-null  float64
 3   email                             48817 non-null  object 
 4   birthdate                         48817 non-null  object 
 5   city                              44000 non-null  object 
 6   state                             44000 non-null  object 
 7   phonenumber                       48817 non-null  float64
 8   created_at_nps                    48817 non-null  object 
 9   updated_at_nps                    48817 non-null  object 
 10  eou_synced_at                     7236 non-null   object 
 11  last_activity_date                48817 non-null  object 
 12  comp

In [4]:
# Print the summary of `ratings`: the dataset has about 19M rows and 15 columns.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19320837 entries, 0 to 19320836
Data columns (total 15 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   rating_id         object
 1   consumer_id       object
 2   device_model      object
 3   transaction_type  object
 4   rating_value      object
 5   rating_reason     object
 6   rating_comment    object
 7   ref_id            object
 8   created_at        object
 9   updated_at        object
 10  store_id          object
 11  datalake_dt       object
 12  year_ptt          object
 13  month_ptt         object
 14  day_ptt           object
dtypes: object(15)
memory usage: 2.2+ GB


# Merge ('NPS' + 'Ratings')
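Para essa mescla será usada a coluna 'transaction_id' como chave: em 'ratings' ela vem de 'ref_id', que é renomeada e convertida para inteiro antes do merge. As colunas 'consumer_id' e 'store_id' de 'ratings' não fazem parte da chave; quando já existem em 'nps', elas recebem o sufixo '_ratings'.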

In [3]:
# Columns of `ratings` to keep; the selection below drops every other column.
columns = [
    'rating_id',
    'consumer_id',
    'device_model',
    'transaction_type',
    'rating_value',
    'rating_reason',
    'rating_comment',
    'ref_id',
    'created_at',
    'updated_at',
    'store_id',
]
ratings = ratings.loc[:, columns]

In [4]:
# The merge joins on a column named 'transaction_id', so the ratings key
# 'ref_id' is renamed to match. The key must also have the same dtype in both
# datasets for the merge.
ratings = ratings.rename({'ref_id': 'transaction_id'}, axis='columns')

In [5]:
# Count rows whose transaction_id is present (True) versus missing (False).
ratings['transaction_id'].notna().value_counts()

True     19319431
False        1406
Name: transaction_id, dtype: int64

In [6]:
# Convert transaction_id to a numeric dtype. With errors='coerce', values that
# cannot be parsed as numbers become NaN instead of raising.
ratings['transaction_id'] = pd.to_numeric(ratings['transaction_id'], errors='coerce')

# Convert consumer_id to integers by assigning the column directly rather than
# through `ratings.loc[:, 'consumer_id'] = ...`: since pandas 2.0 the `.loc`
# form writes the new values into the existing column in place, so an object
# column would keep its object dtype and the cast would be lost.
# 'int64' is spelled out because plain 'int' can map to a 32-bit integer on
# some platforms.
ratings['consumer_id'] = ratings['consumer_id'].astype('int64')

In [7]:
# Repeat the present/missing count after the numeric conversion: ids that could
# not be parsed were coerced to NaN and are now counted as missing (False).
ratings['transaction_id'].notna().value_counts()

True     19252572
False       68265
Name: transaction_id, dtype: int64

In [8]:
# Keep only rows whose transaction_id is present and different from 0.
ratings = ratings.loc[
    ratings['transaction_id'].notna() & ratings['transaction_id'].ne(0)
]

In [9]:
# Cast the merge key to int64 (rows with NaN or 0 ids were removed beforehand,
# so the cast cannot hit a missing value).
# `astype` on the frame is used instead of `ratings.loc[:, col] = ...` because,
# since pandas 2.0, the `.loc` form sets the values in place and leaves the
# column as float64; it would also emit SettingWithCopyWarning here, since
# `ratings` is the result of a boolean filter.
ratings = ratings.astype({'transaction_id': 'int64'})

In [22]:
# Keep only the first row found for each transaction_id, so the key is unique
# in `ratings`.
ratings = ratings.loc[~ratings.duplicated(subset=['transaction_id'], keep='first')]

In [23]:
# (rows, columns) of `nps` before the merge, for comparison with the merged table.
nps.shape

(48817, 94)

In [24]:
# Left-join ratings onto nps by transaction_id: every nps row is kept, and rows
# without a matching rating get missing values in the ratings columns.
# Ratings columns whose names already exist in nps receive the '_ratings' suffix.
# validate='many_to_one' makes pandas raise MergeError if transaction_id is not
# unique in `ratings`, instead of silently duplicating nps rows.
merge_table = pd.merge(
    nps,
    ratings,
    on=['transaction_id'],
    how='left',
    suffixes=('', '_ratings'),
    validate='many_to_one',
)

In [25]:
# (rows, columns) after the merge; with a left join onto a unique ratings key,
# the row count should equal that of `nps`.
merge_table.shape

(48817, 104)

In [26]:
# Print the summary of the merged table. max_cols = 174 is the column count
# above which info() switches to the truncated summary.
# NOTE(review): the previous comment quoted 1984732 rows and 174 columns, which
# does not match the recorded output below (48817 rows, 104 columns) — confirm
# which dataset size is intended.
merge_table.info(max_cols = 174)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48817 entries, 0 to 48816
Data columns (total 104 columns):
 #    Column                            Non-Null Count  Dtype  
---   ------                            --------------  -----  
 0    consumer_id                       48817 non-null  int64  
 1    role_id                           48817 non-null  int64  
 2    cpf                               48817 non-null  float64
 3    email                             48817 non-null  object 
 4    birthdate                         48817 non-null  object 
 5    city                              44000 non-null  object 
 6    state                             44000 non-null  object 
 7    phonenumber                       48817 non-null  float64
 8    created_at_nps                    48817 non-null  object 
 9    updated_at_nps                    48817 non-null  object 
 10   eou_synced_at                     7236 non-null   object 
 11   last_activity_date                48817 non-null  ob

# Salvando em parquet


In [27]:
# Save the merged table as a Parquet file.
merge_table.to_parquet(
    path='../data/stage/3m/customers_transactions_vouchers_ratings.parquet',
)

In [11]:
# Print the summary of the merged table.
# NOTE(review): the recorded output below (1984732 rows, 174 columns) does not
# match the merged shape recorded earlier in this notebook (48817, 104) —
# presumably left over from a different run; re-run the notebook top to bottom
# to refresh it.
merge_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1984732 entries, 0 to 1984731
Columns: 174 entries, consumer_id to day_ptt_ratings
dtypes: datetime64[ns](2), float32(3), float64(43), int64(12), object(114)
memory usage: 2.6+ GB
