In [1]:
from sklearn.datasets import load_iris
import numpy as np
import math
from tqdm import tqdm
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pandas as pd
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, ConfusionMatrixDisplay, confusion_matrix
import xgboost as xgb
import category_encoders.utils as util
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier 

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
iden = pd.read_csv('train_identity.csv')

In [4]:
trans = pd.read_csv('train_transaction.csv')

In [5]:
joined = trans.merge(iden,how='left')

In [6]:
del(joined['TransactionID'])

In [7]:
del iden

In [8]:
del trans

In [9]:
limite = np.percentile(joined['TransactionDT'], 80)

In [10]:
filtro_validation = joined['TransactionDT'] > limite

In [11]:
validation = joined[filtro_validation]

In [12]:
label_validation = validation[['isFraud']]

In [13]:
del(validation['isFraud'])

In [14]:
filtro_train = joined['TransactionDT']<= limite

In [15]:
train = joined[filtro_train]

In [16]:
label_train = train[['isFraud']]

In [17]:
del(train['isFraud'])

In [18]:
del joined

In [19]:
dict_encoding = {}

In [20]:
def countVect(df,columna,dic):
    if columna not in dic:
        dic[columna] = CountVectorizer()
        dic[columna].fit_transform(df[[columna]].fillna('desconocido'))
    col_encoded = dic[columna].transform(df[[columna]]).toarray()
    col_encoded = pd.DataFrame(col_encoded).add_prefix(columna).fillna(0)
    del(df[columna])
    df_nuevo = pd.concat([df.reset_index(drop=True),col_encoded], axis=1)
    return df_nuevo

In [21]:
def one_hot_encode(df, columna, dict_encoding):
    if columna not in dict_encoding:
        dict_encoding[columna] = OneHotEncoder(handle_unknown='ignore')
        dict_encoding[columna].fit_transform(df[[columna]].astype(str)).astype(int)
    col_encoded = (dict_encoding[columna].transform(df[[columna]].astype(str)).astype(int))
    col_encoded = pd.DataFrame(col_encoded.todense()).add_prefix(columna + '_').fillna(0)
    del(df[columna])
    df_nuevo = pd.concat([df.reset_index(drop=True), col_encoded], axis=1)
    return df_nuevo

In [22]:
def mean_encoding(df,columna,label, dict_encoding):
    if columna not in dict_encoding:
        dict_encoding[columna] = TargetEncoder(handle_unknown='ignore')
        dict_encoding[columna].fit_transform(df[[columna]], label)
    df[columna] = dict_encoding[columna].transform(df[[columna]])
    df[columna].fillna(0,inplace=True)
    return df

In [23]:
def label_ec(df):
    for column in df.columns:
        if df[column].dtypes==object:
            dict_encoding[column] = OrdinalEncoder(handle_unknown='ignore')
            df[column] = dict_encoding[column].fit_transform(df[column].astype(str).values.reshape(-1, 1))
    if df[column].dtype==object:
        df[column] = dict_encoding[column].transform(df[column].astype(str).values.reshape(-1, 1))
    df.fillna(-1,inplace=True)
    return(df)

### Train

In [24]:
countV = ['id_30','id_31','DeviceInfo']

In [25]:
for i in countV:
    train=countVect(train,i,dict_encoding)

In [26]:
oneHot = ['P_emaildomain','card4','ProductCD','id_38','id_35','id_34']

In [27]:
for i in oneHot:
    train = one_hot_encode(train,i,dict_encoding)

In [28]:
mean = ['addr2','DeviceType']

In [29]:
for i in mean:
    train = mean_encoding(train,i,label_train,dict_encoding)

In [30]:
train = label_ec(train)

In [31]:
red = MLPClassifier(random_state=1,learning_rate_init=0.5,max_iter=20)

In [32]:
red.fit(train, label_train)

MLPClassifier(learning_rate_init=0.5, max_iter=20, random_state=1)

In [33]:
print(classification_report(label_train, red.predict(train)))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98    455833
           1       0.00      0.00      0.00     16599

    accuracy                           0.96    472432
   macro avg       0.48      0.50      0.49    472432
weighted avg       0.93      0.96      0.95    472432



##### RocAuc Train

In [34]:
roc_auc_score(label_train, red.predict_proba(train)[:,1])

0.5

### Validation

In [35]:
for i in countV:
    validation=countVect(validation,i,dict_encoding)

In [36]:
for i in oneHot:
    validation = one_hot_encode(validation,i,dict_encoding)

In [37]:
for i in mean:
    validation = mean_encoding(validation,i,label_validation,dict_encoding)

In [38]:
validation = label_ec(validation)

In [39]:
print(classification_report(label_validation, red.predict(validation)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98    114044
           1       0.00      0.00      0.00      4064

    accuracy                           0.97    118108
   macro avg       0.48      0.50      0.49    118108
weighted avg       0.93      0.97      0.95    118108



##### RocAuc Validation

In [40]:
roc_auc_score(label_validation, red.predict_proba(validation)[:,1])

0.5