### **Importación de librerías necesarias**

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier

### **Lectura de datos**

In [22]:
data = pd.read_csv('datasets/PS_20174392719_1491204439457_log.csv')
train = data.sample(frac=0.75, random_state=99)
test = data.loc[~data.index.isin(train.index), :]
test = test.loc[(test.type == 'TRANSFER') | (test.type == 'CASH_OUT')]
test = test.reset_index(drop=True)
del test['isFlaggedFraud']
train.to_parquet('HistoricData/train_df.parquet', engine='pyarrow')
test.to_parquet('HistoricData/test_cloud_df.parquet', engine='pyarrow')


In [16]:
data = pd.read_parquet('train_df.parquet', engine='pyarrow')

In [17]:
X = data.loc[(data.type == 'TRANSFER') | (data.type == 'CASH_OUT')]
randomState = 99
np.random.seed(randomState)

Y = X['isFraud']
del X['isFraud']

X = X.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis = 1)
X.loc[X.type == 'TRANSFER', 'type'] = 5
X.loc[X.type == 'CASH_OUT', 'type'] = 2
X.type = X.type.astype(int)

In [18]:
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, \
                                                random_state = randomState)

In [19]:
trainX = trainX.to_numpy()
trainY = trainY.to_numpy()
testX =  testX.to_numpy()
testY = testY.to_numpy()

In [20]:
# Long computation in this cell (~1.8 minutes)
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(max_depth = 3, scale_pos_weight = weights, \
                n_jobs = 4)
probabilities = clf.fit(trainX, trainY).predict_proba(testX)
print('AUPRC = {}'.format(average_precision_score(testY, \
                                              probabilities[:, 1])))
                                              
clf.save_model("Model/xgboost.json")                                              

AUPRC = 0.9341369589051862
