In [1]:
import pandas as pd
import pickle

In [2]:
# Load the raw customer dataset (comma-separated) into a DataFrame.
data = pd.read_csv(
    "../data/customer_dataset.csv",
    sep=',',
)

## 1. Puntos de corte

In [3]:
# Keep an independent copy of the raw frame; the saved bin edges are re-applied to it further down.
probando = data.copy(deep=True)

In [4]:
# Discretize orderAmount into quantile-based bins (five requested; duplicate
# edges are dropped) and keep the resulting edges so the same binning can be
# re-applied later.
data['orderAmount'], saved_bins_order = pd.qcut(
    x=data['orderAmount'],
    q=5,
    retbins=True,
    duplicates='drop',
)

In [5]:
# Persist the orderAmount bin edges to disk.
with open('../data/saved_bins_order.pickle', 'wb') as bins_file:
    pickle.dump(saved_bins_order, bins_file, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Discretize transactionAmount into quantile-based bins (four requested;
# duplicate edges are dropped), keeping the edges that were used.
data['transactionAmount'], saved_bins_transaction = pd.qcut(
    x=data['transactionAmount'],
    q=4,
    retbins=True,
    duplicates='drop',
)

# Persist the transactionAmount bin edges to disk.
with open('../data/saved_bins_transaction.pickle', 'wb') as bins_file:
    pickle.dump(saved_bins_transaction, bins_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Read both sets of bin edges back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this project wrote itself.
with open('../data/saved_bins_order.pickle', 'rb') as order_file:
    new_saved_bins_order = pickle.load(order_file)

with open('../data/saved_bins_transaction.pickle', 'rb') as transaction_file:
    new_saved_bins_transaction = pickle.load(transaction_file)

In [8]:
# Re-bin the copied (still numeric) orderAmount values using the edges loaded
# from disk. include_lowest=True closes the first interval on the left, which
# is what makes every value line up with the original binning.
probando["orderAmount"] = pd.cut(
    x=probando['orderAmount'],
    bins=new_saved_bins_order,
    include_lowest=True,
)

In [9]:
# Preview the first rows of the re-binned copy.
probando.head(n=3)

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressVersion,sameCity
0,"(9.999, 18.4]",pending,True,card,JCB 16 digit,Citizens First Banks,18,False,False,yahoo,com,4.0,yes
1,"(18.4, 30.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,26,False,True,yahoo,com,4.0,no
2,"(39.0, 47.0]",fulfilled,False,card,VISA 16 digit,Vertex Bancorp,45,False,False,yahoo,com,6.0,no


In [10]:
# Preview the first rows of the frame binned directly with qcut, for comparison.
data.head(n=3)

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressVersion,sameCity
0,"(9.999, 18.4]",pending,True,card,JCB 16 digit,Citizens First Banks,"(9.999, 21.0]",False,False,yahoo,com,4.0,yes
1,"(18.4, 30.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,"(21.0, 34.0]",False,True,yahoo,com,4.0,no
2,"(39.0, 47.0]",fulfilled,False,card,VISA 16 digit,Vertex Bancorp,"(34.0, 45.0]",False,False,yahoo,com,6.0,no


## 2. Preparación de los datos

In [11]:
# Frequency of each payment-method issuer value.
data["paymentMethodIssuer"].value_counts()

Her Majesty Trust           43
Vertex Bancorp              37
Fountain Financial Inc.     35
His Majesty Bank Corp.      33
Bastion Banks               29
Bulwark Trust Corp.         29
Citizens First Banks        28
Grand Credit Corporation    27
Solace Banks                27
Rose Bancshares             25
B                            7
e                            5
c                            4
r                            3
                             2
n                            2
x                            2
o                            2
a                            1
p                            1
Name: paymentMethodIssuer, dtype: int64

In [12]:
# Collapse the stray single-character issuer values into a single 'weird' label.
weird_payment_method = ["B", "e", "c", "r", " ", "n", "x", "o", "a", "p"]

# Series.replace accepts a list of values to replace, so every entry is
# swapped in one vectorized call instead of one call per value in a loop.
data['paymentMethodIssuer'] = data['paymentMethodIssuer'].replace(weird_payment_method, 'weird')

In [13]:
# Issuer frequencies again, now that the stray values have been relabelled.
data["paymentMethodIssuer"].value_counts()

Her Majesty Trust           43
Vertex Bancorp              37
Fountain Financial Inc.     35
His Majesty Bank Corp.      33
Bastion Banks               29
weird                       29
Bulwark Trust Corp.         29
Citizens First Banks        28
Solace Banks                27
Grand Credit Corporation    27
Rose Bancshares             25
Name: paymentMethodIssuer, dtype: int64

In [15]:
# Frequency of each payment-method provider value.
data["paymentMethodProvider"].value_counts()

JCB 16 digit                   65
VISA 16 digit                  57
Voyager                        36
Diners Club / Carte Blanche    34
Maestro                        32
VISA 13 digit                  32
Discover                       25
American Express               22
JCB 15 digit                   20
Mastercard                     19
Name: paymentMethodProvider, dtype: int64

In [16]:
# Frequency of each payment-method type value.
data["paymentMethodType"].value_counts()

card         242
apple pay     36
paypal        36
bitcoin       28
Name: paymentMethodType, dtype: int64

In [17]:
# Label rows with a missing fraud flag as "warning" instead of leaving them as NaN.
data['fraudulent'] = data['fraudulent'].fillna("warning")

In [18]:
# Frequency of each value in the fraudulent column.
data['fraudulent'].value_counts()

False      107
True        61
Name: fraudulent, dtype: int64

In [19]:
# Cast every fraud label to its string form so the label mapping below can
# look values up by string key.
data['fraudulent'] = data['fraudulent'].astype(dtype=str)

In [20]:
# Encode the string fraud labels as integer class codes.
# Any label that is not a key of class_map becomes NaN (Series.map semantics).
class_map = {
    'False': 0,
    'True': 1,
    'warning': 2,
}
data['fraudulent'] = data['fraudulent'].map(class_map)

In [21]:
# Preview the frame after the fraud labels have been encoded.
data.head(n=3)

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressVersion,sameCity
0,"(9.999, 18.4]",pending,True,card,JCB 16 digit,Citizens First Banks,"(9.999, 21.0]",False,0,yahoo,com,4.0,yes
1,"(18.4, 30.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,"(21.0, 34.0]",False,1,yahoo,com,4.0,no
2,"(39.0, 47.0]",fulfilled,False,card,VISA 16 digit,Vertex Bancorp,"(34.0, 45.0]",False,0,yahoo,com,6.0,no


In [22]:
# Number of rows that fall into each orderAmount bin.
data["orderAmount"].value_counts()

(30.0, 39.0]     98
(39.0, 47.0]     97
(9.999, 18.4]    96
(18.4, 30.0]     96
(47.0, 353.0]    91
Name: orderAmount, dtype: int64

In [23]:
# orderAmount is categorical, so the "desconocido" label has to be registered
# as a category before fillna can assign it.
# add_categories raises ValueError if the category already exists; the guard
# keeps this cell safe to re-run.
if "desconocido" not in data['orderAmount'].cat.categories:
    data['orderAmount'] = data['orderAmount'].cat.add_categories("desconocido")

# Rows with a missing orderAmount bin get the explicit "desconocido" label.
data['orderAmount'] = data['orderAmount'].fillna(value="desconocido")

In [24]:
# Bin counts again, now including the "desconocido" bucket.
data["orderAmount"].value_counts()

desconocido      145
(30.0, 39.0]      98
(39.0, 47.0]      97
(9.999, 18.4]     96
(18.4, 30.0]      96
(47.0, 353.0]     91
Name: orderAmount, dtype: int64

## 3. One-hot encoding

In [25]:
# One-hot encode the frame. dummy_na=True adds an explicit "<column>_nan"
# indicator for every encoded column.
data_ohe = pd.get_dummies(data=data, dummy_na=True)

In [26]:
# Preview the one-hot encoded frame.
data_ohe.head(n=3)

Unnamed: 0,transactionFailed,fraudulent,customerIPAddressVersion,"orderAmount_(9.999, 18.4]","orderAmount_(18.4, 30.0]","orderAmount_(30.0, 39.0]","orderAmount_(39.0, 47.0]","orderAmount_(47.0, 353.0]",orderAmount_desconocido,orderAmount_nan,...,emailDomain_com,emailDomain_info,emailDomain_net,emailDomain_org,emailDomain_weird,emailDomain_nan,sameCity_no,sameCity_unknown,sameCity_yes,sameCity_nan
0,False,0,4.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,False,1,4.0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,False,0,6.0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [27]:
# Encoded frame with the fraudulent column removed.
data_ohe_without_fraudulent = data_ohe.drop(columns=["fraudulent"])

In [28]:
# Persist the column index of the encoded frame (fraudulent column excluded) to disk.
with open('../data/categories_ohe_without_fraudulent.pickle', 'wb') as columns_file:
    pickle.dump(
        data_ohe_without_fraudulent.columns,
        columns_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [29]:
# Write the one-hot encoded dataset (fraudulent column included) to CSV,
# leaving the row index out of the file.
filename = "../data/ohe_customer_dataset.csv"
data_ohe.to_csv(path_or_buf=filename, index=False)