# SETTINGS

In [None]:
############ LIBRARIES

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('dark_background')
%matplotlib inline

import os
import time
import datetime
import random
import multiprocessing
import pickle

import scipy.stats

import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

In [None]:
############ HELPER FUNCTIONS

import functions
import importlib
importlib.reload(functions)
from functions import *

In [None]:
############ RANDOMNESS

# seed function
def seed_everything(seed = 23):
    """Seed the hash-seed environment variable, `random` and NumPy.

    Parameters
    ----------
    seed : int, default 23
        Written to the PYTHONHASHSEED environment variable (as a string)
        and passed to `random.seed` and `np.random.seed`.

    Returns
    -------
    None
    """
    # NOTE(review): per the Python docs PYTHONHASHSEED is read at interpreter
    # start-up, so this assignment presumably only affects child processes,
    # not the hashing of the running kernel - confirm if that matters here
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed

    # same order as before: the standard library generator, then NumPy's
    # legacy global generator
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    
# set seed: keep the value in the global `seed` and apply it immediately to
# the generators handled by seed_everything
seed = 23
seed_everything(seed)

# IMPORT

In [None]:
############ DATA IMPORT

# locations of the four raw CSV files (train first, test second)
id_paths = ('../input/train_identity.csv', '../input/test_identity.csv')
tr_paths = ('../input/train_transaction.csv', '../input/test_transaction.csv')

# read the identity tables, then the transaction tables
train_id, test_id = map(pd.read_csv, id_paths)
train_tr, test_tr = map(pd.read_csv, tr_paths)

# report (rows, columns) of each frame: train pair, separator, test pair
print(train_id.shape, train_tr.shape, sep = '\n')
print('-'*15)
print(test_id.shape, test_tr.shape, sep = '\n')

In [None]:
# preview the first rows of the train identity table (shown as the cell output)
train_id.head()

In [None]:
# preview the first rows of the train transaction table (shown as the cell output)
train_tr.head()

# MERGER

In [None]:
# target variable: name of the label column
target = 'isFraud'
# give the test transactions a target column filled with NaN, so that they can
# be compared and stacked with the train transactions in the cells below
# (presumably the test file ships without labels - verify)
test_tr[target]  = np.nan

In [None]:
############ ALIGN DATA SETS

# align columns: put the columns of every frame in sorted order, so that the
# train and test version of each table list shared columns in the same order
train_tr = train_tr.reindex(sorted(train_tr.columns), axis = 1)
train_id = train_id.reindex(sorted(train_id.columns), axis = 1)
test_tr  = test_tr.reindex(sorted(test_tr.columns),   axis = 1)
test_id  = test_id.reindex(sorted(test_id.columns),   axis = 1)

# check equality: Index.equals gives a plain True/False even when the two
# frames have a different number of columns; the element-wise `==` comparison
# raises a ValueError in that case instead of reporting the mismatch
print(train_tr.columns.equals(test_tr.columns))
print(train_id.columns.equals(test_id.columns))

In [None]:
############ MERGE TRAIN & TEST

# stack the train rows on top of the test rows, separately for each table
# (row-wise concatenation is pd.concat's default axis)
df_tr = pd.concat([train_tr, test_tr])
df_id = pd.concat([train_id, test_id])
print(df_tr.shape, df_id.shape, sep = '\n')

# clear memory: the four source frames are not needed past this point
del train_tr, test_tr, train_id, test_id

In [None]:
# attach the identity columns to the transactions; the left join keeps every
# transaction row, with or without a matching identity record
df = df_tr.merge(df_id, how = 'left', on = 'TransactionID')
print(df.shape)

# the two partial frames are now redundant
del df_tr, df_id

# PROCESSING

### COMPRESS

In [None]:
# compress data: reduce_mem_usage is not defined in this notebook, so it
# presumably comes from the `from functions import *` star import; going by
# its name it shrinks the frame's memory footprint - TODO confirm what it changes
df = reduce_mem_usage(df)

### DROP IRRELEVANT FEATURES

In [None]:
# remove columns with a single value; NaN is counted as a value of its own,
# so a column mixing one value with missings is kept
print(df.shape)
n_levels = df.nunique(dropna = False)
df = df.loc[:, n_levels.ne(1)]
print(df.shape)

In [None]:
# remove irrelevant columns
print(df.shape)
# names of the columns to delete; the list is empty, so the loop below
# currently does nothing and both printed shapes are identical
drops = []
for var in drops:
    # delete the column from the existing frame rather than building a new one
    del df[var]
print(df.shape)

### MISSING VALUES

In [None]:
# check missings: count_missings is not defined in this notebook, so it
# presumably comes from the `from functions import *` star import; its return
# value is shown as the cell output - TODO confirm what exactly it reports
count_missings(df)

### VARIABLE TYPES

In [None]:
# check data types: dtype of every column of the merged frame (cell output)
df.dtypes

In [None]:
# check distributions: summary statistics from describe() with pandas' default
# arguments (cell output)
df.describe()

In [None]:
# check value counts: for each object-typed column report the number of
# distinct values and the relative frequency of its two most common values
# (missing values are included in the frequencies)
facs = [f for f in df.columns if df[f].dtype == "object"]
rule = '--------------------------------'
for col in facs:
    column = df[col]
    header = f'{col}: {column.nunique()} unique values'
    top_two = column.value_counts(normalize = True, dropna = False).head(2)
    # one print call, one item per line; the trailing '' yields the blank line
    print(rule, header, rule, top_two, rule, '', sep = '\n')

In [None]:
# class imbalance: share of each target value; dropna = True leaves out rows
# with a NaN target (the test rows were given a NaN target before the merge)
df[target].value_counts(normalize = True, dropna = True)

# EXPORT

In [None]:
# export data: pickle the processed frame; note that the file is written into
# ../input, the same folder the raw CSVs are read from
df.to_pickle("../input/data.pkl")
# final (rows, columns) of the exported frame, shown as the cell output
df.shape