# 1. SETTINGS

In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# pandas options: never truncate the column list when a frame is displayed
pd.options.display.max_columns = None

In [3]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection: explicitly switch on Python's automatic cyclic
# garbage collector (gc.enable() has no effect if it is already enabled)
import gc
gc.enable()

# 2. FUNCTIONS

In [5]:
##### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: "Total" (number of null
        cells) and "Percent" (null cells as a percentage of all rows, 0-100).
        Only columns with at least one null are kept; the two input series
        are each sorted from most to fewest missing values before joining.
    """
    null_mask = data.isnull()

    # absolute number of nulls per column, largest first
    n_missing = null_mask.sum().sort_values(ascending = False)

    # the mask itself has no nulls, so .count() is the row count per column
    share_missing = null_mask.sum() / null_mask.count() * 100
    share_missing = share_missing.sort_values(ascending = False)

    summary = pd.concat([n_missing, share_missing], axis = 1, keys = ["Total", "Percent"])

    # drop fully populated columns from the report
    has_missing = summary["Total"] > 0
    return summary[has_missing]

# 3. IMPORT

In [6]:
# import data: read the pipe-delimited train and test files,
# then report the (rows, columns) of each
train = pd.read_csv("../data/train.csv", delimiter = "|")
test  = pd.read_csv("../data/test.csv",  delimiter = "|")
print(train.shape, test.shape, sep = "\n")

(1879, 10)
(498121, 9)


In [7]:
# check data: preview the first five rows of the training set
train.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,fraud
0,5,1054,54.7,7,0,3,0.027514,0.051898,0.241379,0
1,3,108,27.36,5,2,4,0.12963,0.253333,0.357143,0
2,3,1516,62.16,3,10,5,0.008575,0.041003,0.230769,0
3,6,1791,92.31,8,4,4,0.016192,0.051541,0.275862,0
4,5,430,81.53,3,7,2,0.062791,0.189605,0.111111,0


In [8]:
# check data: preview the first five rows of the test set
test.head()

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition
0,4,467,88.48,4,8,4,0.014989,0.189465,0.571429
1,3,1004,58.99,7,6,1,0.026892,0.058755,0.259259
2,1,162,14.0,4,5,4,0.006173,0.08642,4.0
3,5,532,84.79,9,3,4,0.026316,0.15938,0.642857
4,5,890,42.16,4,0,0,0.021348,0.047371,0.210526


# 4. CREATE ARTIFICIAL IDs

In [9]:
# give every row a synthetic key built from its position within its own frame:
# 'lab_<n>' for train rows and 'unlab_<n>' for test rows
train['id'] = [f'lab_{pos}' for pos in range(len(train))]
test['id'] = [f'unlab_{pos}' for pos in range(len(test))]

# 5. MERGE

In [10]:
# add empty target so the test frame carries the same columns as train
test['fraud'] = np.nan

# align columns: put both frames into the same alphabetical column order.
# DataFrame.reindex_axis was deprecated in pandas 0.21 and removed in 1.0;
# reindex(columns = ...) is the supported equivalent.
train = train.reindex(columns = sorted(train.columns))
test  = test.reindex(columns = sorted(test.columns))

# check equality: fail fast if the frames do not share identical columns,
# because pd.concat would otherwise silently pad the mismatches with NaN
assert train.columns.equals(test.columns), "train and test columns differ"
train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [11]:
# concatenate: stack the train rows on top of the test rows.
# ignore_index = True gives the result a fresh 0..n-1 row index; without it
# both frames keep their own 0-based index and the row labels are duplicated.
df = pd.concat([train, test], axis = 0, ignore_index = True)
del train, test  # the separate frames are no longer needed
print(df.shape)

(500000, 11)


# 6. PROCESSING

In [12]:
# check missings: list the columns of the merged frame that contain nulls
# (`fraud` was set to NaN for every test row before the merge)
count_missings(df)

Unnamed: 0,Total,Percent
fraud,498121,99.6242


In [13]:
# check data types of every column in the merged frame
df.dtypes

fraud                        float64
grandTotal                   float64
id                            object
lineItemVoids                  int64
lineItemVoidsPerPosition     float64
quantityModifications          int64
scannedLineItemsPerSecond    float64
scansWithoutRegistration       int64
totalScanTimeInSeconds         int64
trustLevel                     int64
valuePerSecond               float64
dtype: object

In [14]:
# check class ratio of the target; value_counts() leaves out NaN by default,
# so rows without a fraud label are not counted here
df['fraud'].value_counts()

0.0    1775
1.0     104
Name: fraud, dtype: int64

# 7. EXPORT

In [15]:
# export CSV: write the merged frame without its row index, then display
# the frame's dimensions as a final check
df.to_csv("../data/data_v2.csv", index = False)
df.shape

(500000, 11)