# Import packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import numpy as np 
import os
import pandas as pd 

### Functions to reduce memory usage

In [2]:
def memory_usage_mb(train, *args, **kwargs):
    """Return the total memory used by ``train`` in MB (bytes / 1024**2).

    Extra positional and keyword arguments are forwarded unchanged to
    ``train.memory_usage``.
    """
    bytes_per_mb = 1024**2
    total_bytes = train.memory_usage(*args, **kwargs).sum()
    return total_bytes / bytes_per_mb

def reduce_memory_usage(train, deep=True, verbose=True):
    """Downcast the numeric columns of ``train`` to smaller dtypes.

    Every column whose dtype is int16, int32, int64 or float64 is passed
    through ``pd.to_numeric(..., downcast=...)``. The frame is modified in
    place and also returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are downcast (mutated in place).
    deep : bool, default True
        Forwarded to ``memory_usage_mb`` for the before/after report.
    verbose : bool, default True
        When true, print every conversion performed and the overall
        memory change.

    Returns
    -------
    pandas.DataFrame
        The same ``train`` object that was passed in.
    """
    # Dtypes we try to replace with "lighter" ones.
    # int8 is not included because it cannot be reduced any further, and
    # float32 is not included because the only smaller float, float16, has
    # too little precision.
    numeric2reduce = ("int16", "int32", "int64", "float64")
    start_mem = memory_usage_mb(train, deep=deep) if verbose else 0

    # ``Series.items()`` replaces ``iteritems()``, which was removed in
    # pandas 2.0.
    for col, col_type in train.dtypes.items():
        type_name = str(col_type)
        if type_name not in numeric2reduce:
            continue
        downcast = "integer" if "int" in type_name else "float"
        train[col] = pd.to_numeric(train[col], downcast=downcast)
        best_type = train[col].dtype.name
        # Log the conversion only when the dtype actually changed.
        if verbose and best_type != type_name:
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(train, deep=deep)
        diff_mem = start_mem - end_mem
        # Avoid dividing by a zero baseline.
        percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")

    return train

# Get train and test data

In [3]:
# Load the identity and transaction CSVs for the train and test sets.
train_id, train_trans = (
    pd.read_csv('ieee-fraud-detection/train_identity.csv'),
    pd.read_csv('ieee-fraud-detection/train_transaction.csv'),
)
test_id, test_trans = (
    pd.read_csv('ieee-fraud-detection/test_identity.csv'),
    pd.read_csv('ieee-fraud-detection/test_transaction.csv'),
)

### Merge data

In [4]:
# Left join on TransactionID: every transaction row is kept, and identity
# columns are attached where a matching TransactionID exists.
train = train_trans.merge(train_id, how='left', on='TransactionID')
test = test_trans.merge(test_id, how='left', on='TransactionID')

# Missing values

In [5]:
# Keep only the columns in which at least half of the rows are non-null.
# `how` is not passed: pandas >= 2.0 raises a TypeError when both `how` and
# `thresh` are given, and older versions ignored `how` once `thresh` was set.
# (len + 1) // 2 is ceil(len / 2), i.e. the same cut-off as 0.5 * len for an
# integer count, but expressed as the integer `thresh` expects.
train = train.dropna(axis=1, thresh=(len(train) + 1) // 2)

### Split data into numeric and categorical columns

In [6]:
# Split the frame by dtype: object columns are treated as categorical,
# everything else as numeric. Copies are taken so later edits do not touch
# `train`.
object_dtypes = ['object', 'O']
train_cat = train.select_dtypes(include=object_dtypes).copy()
train_num = train.select_dtypes(exclude=object_dtypes).copy()

In [7]:
# Fit a min-max scaler on the numeric columns and display the scaled values.
# NOTE(review): the array returned by fit_transform is only shown as the cell
# output; it is never assigned, so `train_num` itself is left unscaled for
# the cells that follow.
mm_scaler = MinMaxScaler()
mm_scaler.fit_transform(train_num)

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.69336826e-06, 0.00000000e+00, 6.35940926e-08, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.38673652e-06, 0.00000000e+00, 4.38799239e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.99996613e-01, 0.00000000e+00, 9.99996693e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.99998307e-01, 0.00000000e+00, 9.99997265e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        2.69027496e-03, 2.69027496e-03, 2.69027496e-03]])

### replace null values

In [8]:
# Categorical columns: fill missing entries with the most frequent value.
for col in train_cat.columns:
    most_frequent = train_cat[col].mode()[0]
    train_cat[col] = train_cat[col].fillna(most_frequent)

# Numeric columns: fill missing entries with the column mean.
for col in train_num.columns:
    column_mean = train_num[col].mean()
    train_num[col] = train_num[col].fillna(column_mean)

### Label-encode the categorical column values

In [9]:
# Replace every categorical value with an integer code.
# The single encoder instance is re-fitted on each column in turn, so after
# this cell it only holds the fit for the last column processed.
dummy = LabelEncoder()
train_catg = train_cat.apply(lambda column: dummy.fit_transform(column))

# reduce memory usage

In [10]:
# Downcast the numeric columns and print what was converted.
train_num = reduce_memory_usage(train=train_num, verbose=True, deep=True)

Column 'TransactionID' converted from int64 to int32
Column 'isFraud' converted from int64 to int8
Column 'TransactionDT' converted from int64 to int32
Column 'TransactionAmt' converted from float64 to float32
Column 'card1' converted from int64 to int16
Column 'card2' converted from float64 to float32
Column 'card3' converted from float64 to float32
Column 'card5' converted from float64 to float32
Column 'addr1' converted from float64 to float32
Column 'addr2' converted from float64 to float32
Column 'C1' converted from float64 to float32
Column 'C2' converted from float64 to float32
Column 'C3' converted from float64 to float32
Column 'C4' converted from float64 to float32
Column 'C5' converted from float64 to float32
Column 'C6' converted from float64 to float32
Column 'C7' converted from float64 to float32
Column 'C8' converted from float64 to float32
Column 'C9' converted from float64 to float32
Column 'C10' converted from float64 to float32
Column 'C11' converted from float64 to 

Column 'V285' converted from float64 to float32
Column 'V286' converted from float64 to float32
Column 'V287' converted from float64 to float32
Column 'V288' converted from float64 to float32
Column 'V289' converted from float64 to float32
Column 'V290' converted from float64 to float32
Column 'V291' converted from float64 to float32
Column 'V292' converted from float64 to float32
Column 'V293' converted from float64 to float32
Column 'V294' converted from float64 to float32
Column 'V295' converted from float64 to float32
Column 'V296' converted from float64 to float32
Column 'V297' converted from float64 to float32
Column 'V298' converted from float64 to float32
Column 'V299' converted from float64 to float32
Column 'V300' converted from float64 to float32
Column 'V301' converted from float64 to float32
Column 'V302' converted from float64 to float32
Column 'V303' converted from float64 to float32
Column 'V304' converted from float64 to float32
Column 'V305' converted from float64 to 

# Select upper triangle of correlation matrix

### Drop columns with correlation greater than 0.95

In [11]:
# Pairwise correlation between all numeric columns.
corr_matrix = train_num.corr()
# Boolean mask of the strict upper triangle (k=1 skips the diagonal), so each
# pair of columns is looked at once. The builtin `bool` is used because the
# `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
# A column is dropped when its correlation with any earlier column is > 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
train_num = train_num.drop(columns=to_drop)
# The target must not stay among the features. errors='ignore' keeps the cell
# re-runnable once 'isFraud' has already been removed.
train_num = train_num.drop(columns=['isFraud'], errors='ignore')

# Define target and features

### train data

In [12]:
# Target comes from the merged training frame; features are the encoded
# categorical columns placed side by side with the numeric columns.
target = train['isFraud']
frames = [train_catg, train_num]
features = pd.concat(frames, axis='columns')

### test data

In [13]:
# One-hot encode the categorical columns of the test frame.
# NOTE(review): the training categoricals are label-encoded with LabelEncoder,
# not one-hot encoded, and `test` is rebuilt from the CSV files in a later
# cell, so this result does not appear to reach the model -- confirm intent.
test = pd.get_dummies(data=test)

In [14]:
### Purge data to reduce memory

In [15]:
# Drop the names of the raw tables and the merged training frame.
del train_id, train_trans
del test_id, test_trans
del train

# Model data

In [16]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.33,
)
# Gradient-boosted classifier with the library's default hyper-parameters.
xgboost_classifier = XGBClassifier()
xgboost_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

# Predict

In [None]:
# Re-read the raw CSV files and rebuild the merged train and test frames.
train_trans = pd.read_csv('ieee-fraud-detection/train_transaction.csv')
train_id = pd.read_csv('ieee-fraud-detection/train_identity.csv')
train = train_trans.merge(train_id, how='left', on='TransactionID')
print('train', train.shape)

test_trans = pd.read_csv('ieee-fraud-detection/test_transaction.csv')
test_id = pd.read_csv('ieee-fraud-detection/test_identity.csv')
test = test_trans.merge(test_id, how='left', on='TransactionID')
print('test', test.shape)

In [None]:
# The full feature matrix and the training split are no longer needed once
# the model is fitted; X_test and y_test are kept for evaluation.
del X_train, y_train
del features, target

In [None]:
# Predict labels for the held-out split created by train_test_split; these
# predictions are compared with y_test in the evaluation cell.
predictions = xgboost_classifier.predict(X_test)

In [None]:
# Probe predict_proba on the first 100 rows of the test frame.
# NOTE(review): `test` here is the raw merged frame, not the encoded feature
# layout the model was fitted on -- confirm this runs as intended.
first_rows = test.iloc[0:100]
isFraud1 = xgboost_classifier.predict_proba(first_rows)

In [None]:
# Score the test frame in five consecutive row ranges, keeping the
# second probability column for each range.
# NOTE(review): `test` here is the raw merged frame, not the encoded feature
# layout the model was fitted on -- confirm this runs as intended.
chunk_bounds = [0, 101500, 203000, 304500, 406000, 506691]
isFraud1, isFraud2, isFraud3, isFraud4, isFraud5 = [
    xgboost_classifier.predict_proba(test[start:stop])[:, 1]
    for start, stop in zip(chunk_bounds[:-1], chunk_bounds[1:])
]

In [None]:
# The model was fitted on `features` (label-encoded categorical columns plus
# the retained numeric columns), so the raw merged `test` frame cannot be
# scored directly: it still holds object columns and a different column set.
# Rebuild the training layout first.
# NOTE(review): relies on `X_test`, `train_cat` and `train_num` from the
# cells above still being in memory -- confirm before running.
test_features = test.reindex(columns=X_test.columns)
# Numeric gaps are filled with the training column means, mirroring the
# imputation applied to the training data.
test_features = test_features.fillna(train_num.mean())
for col in train_cat.columns:
    # LabelEncoder numbers the sorted unique values 0..n-1; reproduce that
    # mapping from the training column. Values unseen in training get -1.
    classes = sorted(train_cat[col].unique())
    filled = test_features[col].fillna(train_cat[col].mode()[0])
    test_features[col] = pd.Categorical(filled, categories=classes).codes
test["isFraud"] = xgboost_classifier.predict_proba(test_features)[:, 1]

# Check performance metrics

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. With the
# arguments swapped the confusion matrix is transposed and precision and
# recall are reported under each other's names.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
# Save prediction

In [None]:
# Write the two submission columns to CSV without the index.
submission_columns = ["TransactionID", "isFraud"]
submission = test[submission_columns]
submission.to_csv("submission.csv", index=False)