# IEEE-CIS Fraud Detection
## Preprocessing train_identity.csv, DeviceType
*Visit [kaggle](https://www.kaggle.com/c/ieee-fraud-detection) for competition details*

**Authored by Soyoung Kang**

In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load train and test data.

train_ = pd.read_csv('../ieee-fraud-detection/given/train_identity.csv', engine = 'python')
test_= pd.read_csv('../ieee-fraud-detection/given/test_identity.csv', engine = 'python')

In [3]:
# Retrieve necessary columns.

train = train_[['TransactionID', 'DeviceType', 'DeviceInfo']]
test = test_[['TransactionID', 'DeviceType', 'DeviceInfo']]

In [7]:
train.shape

(144233, 3)

In [32]:
train.isnull().sum()

TransactionID        0
DeviceType        3423
DeviceInfo       25567
dtype: int64

In [35]:
train.groupby('DeviceType')['TransactionID'].count()

DeviceType
desktop    85165
missing     3423
mobile     55645
Name: TransactionID, dtype: int64

In [38]:
train['DeviceType'].fillna('missing', inplace = True)
test['DeviceType'].fillna('missing', inplace = True)

# Dummify DeviceType

In [39]:
train_dum = pd.get_dummies(data = train, columns = ['DeviceType'])
test_dum = pd.get_dummies(data = test, columns = ['DeviceType'])

In [42]:
test_dum.head()

Unnamed: 0,TransactionID,DeviceInfo,DeviceType_desktop,DeviceType_missing,DeviceType_mobile
0,3663586,MYA-L13 Build/HUAWEIMYA-L13,0,0,1
1,3663588,LGLS676 Build/MXB48T,0,0,1
2,3663597,Trident/7.0,1,0,0
3,3663601,MYA-L13 Build/HUAWEIMYA-L13,0,0,1
4,3663602,SM-G9650 Build/R16NW,0,0,1


In [51]:
train_dum.to_csv('train_deviceType.csv', encoding = 'utf8')
test_dum.to_csv('test_deviceType.csv', encoding = 'utf8')

# DeviceInfo (Deprecated)

In [44]:
train_dum.head()

Unnamed: 0,TransactionID,DeviceInfo,DeviceType_desktop,DeviceType_missing,DeviceType_mobile
0,2987004,SAMSUNG SM-G892A Build/NRD90M,0,0,1
1,2987008,iOS Device,0,0,1
2,2987010,Windows,1,0,0
3,2987011,,1,0,0
4,2987016,MacOS,1,0,0


In [24]:
list(test.DeviceInfo.unique())

['MYA-L13 Build/HUAWEIMYA-L13',
 'LGLS676 Build/MXB48T',
 'Trident/7.0',
 'SM-G9650 Build/R16NW',
 'iOS Device',
 'Windows',
 'MacOS',
 nan,
 'RNE-L03 Build/HUAWEIRNE-L03',
 'MotoE2(4G-LTE',
 'Lenovo K33b36 Build/NRD90N',
 'Lenovo PB1-750M Build/S100',
 'HUAWEI CUN-L03 Build/HUAWEICUN-L03',
 'SM-G610M Build/NRD90M',
 'KFGIWI Build/LVY48F',
 'Moto E (4) Plus Build/NMA26.42-152',
 'SAMSUNG SM-A310F Build/NRD90M',
 'GT-I9060M Build/KTU84P',
 'SM-G610M Build/MMB29K',
 'Power_2',
 'Redmi 4X Build/N2G47H',
 'SM-G920V Build/NRD90M',
 'Z957',
 'rv:60.0',
 'SAMSUNG SM-G892A Build/R16NW',
 'SAMSUNG SM-J701M Build/NRD90M',
 '5049W Build/NRD90M',
 'SAMSUNG',
 '8050G Build/LMY47I',
 'SM-G928V Build/NRD90M',
 'SAMSUNG SM-G950U Build/R16NW',
 'TA-1025 Build/OPR1.170623.026',
 'SM-G930T Build/NRD90M',
 'SM-J730GM Build/NRD90M',
 'HUAWEI TAG-L13 Build/HUAWEITAG-L13',
 'Hisense L675 PRO Build/NRD90M',
 'SM-A310M Build/NRD90M',
 'Hisense F23 Build/NRD90M',
 'rv:61.0',
 'rv:48.0',
 'rv:11.0',
 'SM-G925V B

Samsung (SAMSUNG, SM)  
LG  
Google (Pixel, Nexus)  
Huawei  
Apple (iOS)  
Motorola (Moto)  
Lenovo  
HTC  
ASUS  
Nokia  
etc.  

In [27]:
# Map device name to its brand name.

device = {'SAMSUNG': 'Samsung', 'SM': 'Samsung', 'LG': 'LG', 'Pixel': 'Google', 'Nexus': 'Google', 
          'HUAWEI': 'Huawei', 'iOS': 'Apple', 'Moto': 'Motorola', 'Lenovo': 'Lenovo', 
           'HTC':'HTC', 'ASUS':'ASUS', 'Nokia': 'Nokia'}

In [28]:
device

{'SAMSUNG': 'Samsung',
 'SM': 'Samsung',
 'LG': 'LG',
 'Pixel': 'Google',
 'Nexus': 'Google',
 'HUAWEI': 'Huawei',
 'iOS': 'Apple',
 'Moto': 'Motorola',
 'Lenovo': 'Lenovo',
 'HTC': 'HTC',
 'ASUS': 'ASUS',
 'Nokia': 'Nokia'}

In [None]:
train['DInfo'] = train['DeviceInfo'].apply(lambda x : train['DInfo'] = v if k in x else 'etc.' for k, v in device.items())

In [45]:
train_info = train_dum.replace({'(.+?)SAMSUNG(.+)': 'Samsung', '(.+?)SM(.+)': 'Samsung', '(.+?)LG(.+)': 'LG', 
                                '(.+?)Pixel(.+)': '(.+?)Google', 'Nexus(.+)': 'Google', 
          '(.+?)HUAWEI(.+)': 'Huawei', '(.+?)iOS(.+)': 'Apple', '(.+?)Moto(.+)': 'Motorola', 
                                '(.+?)Lenovo(.+)': 'Lenovo', 
           '(.+?)HTC(.+)':'HTC', '(.+?)ASUS(.+)':'ASUS', '(.+?)Nokia(.+)': 'Nokia'}, regex = True) 


In [48]:
train_info.DeviceInfo.nunique()

1553

In [49]:
train_dum.DeviceInfo.nunique()

1786

In [50]:
train_info.head(100)

Unnamed: 0,TransactionID,DeviceInfo,DeviceType_desktop,DeviceType_missing,DeviceType_mobile
0,2987004,Samsung,0,0,1
1,2987008,iOS Device,0,0,1
2,2987010,Windows,1,0,0
3,2987011,,1,0,0
4,2987016,MacOS,1,0,0
5,2987017,Windows,1,0,0
6,2987022,,0,1,0
7,2987038,,0,0,1
8,2987040,Windows,1,0,0
9,2987048,Windows,1,0,0


In [4]:
train.head()

Unnamed: 0,TransactionID,DeviceType,DeviceInfo
0,2987004,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,mobile,iOS Device
2,2987010,desktop,Windows
3,2987011,desktop,
4,2987016,desktop,MacOS


In [5]:
train.set_index('TransactionID')

Unnamed: 0_level_0,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1
2987004,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,mobile,iOS Device
2987010,desktop,Windows
2987011,desktop,
2987016,desktop,MacOS
2987017,desktop,Windows
2987022,,
2987038,mobile,
2987040,desktop,Windows
2987048,desktop,Windows


In [16]:
i = train[train['DeviceType'] == 'NaN'].index

AttributeError: 'Int64Index' object has no attribute 'head'