# Hypothesis - Device Activity (only step, no analysis)

## Data Cleaning

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import resample

In [2]:
# import data
data = pd.read_csv('../data/raw/raw_data.csv')

# Data Cleaning
data['intended_balcon_amount'] = data['intended_balcon_amount'].apply(lambda x: -1 if x < 0 else x)
data = data[(data['current_address_months_count'] >= 0) & (data['session_length_in_minutes'] >= 0) & (data['device_distinct_emails_8w'] >= 0)]

# Convert the variables to the appropriate data types
data['fraud_bool'] = data['fraud_bool'].astype('category')
data['payment_type'] = data['payment_type'].astype('category')
data['employment_status'] = data['employment_status'].astype('category')
data['email_is_free'] = data['email_is_free'].astype('uint8')
data['housing_status'] = data['housing_status'].astype('category')
data['phone_home_valid'] = data['phone_home_valid'].astype('uint8')
data['phone_mobile_valid'] = data['phone_mobile_valid'].astype('uint8')
data['has_other_cards'] = data['has_other_cards'].astype('uint8')
data['foreign_request'] = data['foreign_request'].astype('uint8')
data['source'] = data['source'].astype('category')
data['device_os'] = data['device_os'].astype('category')

# Remove Redundant rows
data.drop(columns=['device_fraud_count'], inplace=True)

# Ratio of 1 to 5
# Separate fraud and non-fraud data
fraud_data = data[data['fraud_bool'] == 1]
non_fraud_data = data[data['fraud_bool'] == 0]

# Undersample non-fraud data to match the size of the fraud data
undersampled_non_fraud_data = resample(non_fraud_data, replace=False, n_samples=5*len(fraud_data), random_state=42)

# Combine fraud and undersampled non-fraud data
undersampled_data = pd.concat([fraud_data, undersampled_non_fraud_data])

data = undersampled_data

## EDA Bank Activity and Device Activity

**Hypothesis: Fraudulent Bank Accounts have unique characteristics in relation to how their device activity.**
* More likely to use a phone that has more lax security i.e. windows
* More likely to have more than 1 email in each device. --> more emails to create more fake accounts
* If a phone has more than 1 email from each device, then it keep_alive_session likely to be false (i.e. value=0) --> convienience on fraudster's end. troublesome to log in and out continually.
* Session length will also be short??

**Columns considered**

* Device Activity:
    * `session_length_in_minutes`
    * `device_os`
    * `keep_alive_session`
    * `device_distinct_emails_8w`
    * `source`
    * `foreign_request`

**Explanation of Columns**

| Column Name | Description | Link to Fraud |
|:------------|:------------|:--------------|
| device_distinct_emails_8w | No. of distinct emails in banking website from the used device in last 8 weeks. <br> Simply put, if I use my email address to log into the banking website, <br> then I effectively used 1 distinct email address. <br>If I leave the phone unattended for 8 weeks, then this value becomes 0 due to inactivity for 8 weeks. | Spare phones are commonly used to prevent banking companies from tracking them. <br> Thus, able to explain the inactivity. When the phone is actually used, <br>the fraudsters are likely to use more than 2 distinct email address for 2 accounts using the same phone.


In [3]:
total_fraud_count = data['fraud_bool'].value_counts().to_frame().loc[1, "count"]
total_non_fraud_count = data['fraud_bool'].value_counts().to_frame().loc[0, "count"]

# print total counts of fraud and non-fraud
print(f"Total Fraud Count: {total_fraud_count}")
print(f"Total Non-Fraud Count: {total_non_fraud_count}")
print(f"Total Count: {total_fraud_count + total_non_fraud_count} \t Data Shape: {data.shape}")

Total Fraud Count: 10995
Total Non-Fraud Count: 54975
Total Count: 65970 	 Data Shape: (65970, 31)


In [4]:
def getFraudInfo(group):
    # get total count of fraud within this group
    fraud_count = group['fraud_bool'].value_counts().to_frame().loc[1, "count"]
    total_count = group.shape[0]
    fraud_proportion = round(fraud_count / total_count, 4)

    group['num_fraud'] = fraud_count
    group['proportion_of_fraud_in_group'] = fraud_proportion
    return group[['num_fraud', 'proportion_of_fraud_in_group']].drop_duplicates()

## Feature 1: `FE_01_device_os_emails_prob`

In [5]:
# 5 groups for feature 1
# group01 = filter for conditions where device_os == windows and device_distinct_emails_8w.isin([0,2])
# group02 = filter for conditions where device_os == macintosh and device_distinct_emails_8w.isin([0,2])
# group03 = filter for conditions where device_os == x11 and device_distinct_emails_8w.isin([0,2])
# group04 = filter for conditions where device_os == other and device_distinct_emails_8w.isin([0,2])
# group05 = everything else

# split data into 5 groups by these conditions
group01 = data[(data['device_os'] == 'windows') & (data['device_distinct_emails_8w'].isin([0, 2]))]
group02 = data[(data['device_os'] == 'macintosh') & (data['device_distinct_emails_8w'].isin([0, 2]))]
group03 = data[(data['device_os'] == 'x11') & (data['device_distinct_emails_8w'].isin([0, 2]))]
group04 = data[(data['device_os'] == 'other') & (data['device_distinct_emails_8w'].isin([0, 2]))]
# group05 will be all the groups except those in group01, group02, group03, group04
group05 = data[~data.index.isin(group01.index) & ~data.index.isin(group02.index) & ~data.index.isin(group03.index) & ~data.index.isin(group04.index)]

# combine into 1 dataframe, add label to that dataframe, label each group 'A', 'B', 'C', 'D', 'E', all in new column 'FE_01'
group01['FE_01'] = 'A'; group02['FE_01'] = 'B'; group03['FE_01'] = 'C'; group04['FE_01'] = 'D'; group05['FE_01'] = 'E'

# add back
data = pd.concat([group01, group02, group03, group04, group05])

# change 'FE_01' to category
data['FE_01'] = data['FE_01'].astype('category')

# assign proability of fraud based on label, so if data['FE_01'] labelled 'A', then assign corresponding probability of fraud
FE_01_prob = data.groupby(['FE_01']).apply(getFraudInfo).reset_index()[['FE_01', 'proportion_of_fraud_in_group']]

FE_01_prob_mapping = {"A": 0.6047,
                      "B": 0.4529,
                      "C": 0.3846,
                      "D": 0.2978,
                      "E": 0.1564}

# map the probability of fraud to the device_acitivtiy_df, as a new column 'FE_01_device_os_emails_prob'
data['FE_01_device_os_emails_prob'] = data['FE_01'].map(FE_01_prob_mapping)

# check shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_01'] = 'A'; group02['FE_01'] = 'B'; group03['FE_01'] = 'C'; group04['FE_01'] = 'D'; group05['FE_01'] = 'E'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_01'] = 'A'; group02['FE_01'] = 'B'; group03['FE_01'] = 'C'; group04['FE_01'] = 'D'; group05['FE_01'] = 'E'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,month,FE_01,FE_01_device_os_emails_prob
728,1,0.6,0.25425,-1,187,60,0.007115,-1.0,AB,1040,...,1500.0,0,INTERNET,11.148815,windows,1,2,0,A,0.6047
4666,1,0.1,0.852272,-1,222,50,0.012723,21.345767,AA,3969,...,200.0,0,INTERNET,35.188807,windows,0,2,0,A,0.6047
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,200.0,0,INTERNET,6.318178,windows,0,0,0,A,0.6047
5394,1,0.2,0.069055,-1,63,30,0.002983,-1.0,AD,3963,...,1500.0,0,INTERNET,10.127331,windows,0,2,0,A,0.6047
5856,1,0.2,0.086361,-1,213,40,0.003501,-1.0,AB,537,...,1500.0,0,INTERNET,7.989567,windows,0,2,0,A,0.6047


## Feature 2: `FE_02_keep_alive_device_emails_prob`

In [6]:
# divide into 6 groups, one for each unique permutation of keep_alive_session and device_distinct_emails_8w
group01 = data[(data['keep_alive_session'] == 0) & (data['device_distinct_emails_8w'] == 0)]
group02 = data[(data['keep_alive_session'] == 0) & (data['device_distinct_emails_8w'] == 1)]
group03 = data[(data['keep_alive_session'] == 0) & (data['device_distinct_emails_8w'] == 2)]
group04 = data[(data['keep_alive_session'] == 1) & (data['device_distinct_emails_8w'] == 0)]
group05 = data[(data['keep_alive_session'] == 1) & (data['device_distinct_emails_8w'] == 1)]
group06 = data[(data['keep_alive_session'] == 1) & (data['device_distinct_emails_8w'] == 2)]

# label each group from "A" to "F"
group01['FE_02'] = "A"; group02['FE_02'] = "B"; group03['FE_02'] = "C"; group04['FE_02'] = "D"; group05['FE_02'] = "E"; group06['FE_02'] = "F"

# concatenate all the group
data = pd.concat([group01, group02, group03, group04, group05, group06])

# change 'FE_02' to category
data['FE_02'] = data['FE_02'].astype('category')

# generate mapping
FE_02_prob_mappping = {"A": 0.3520,
                       "B": 0.2230,
                       "C": 0.4732,
                       "D": 0.1591,
                       "E": 0.1016,
                       "F": 0.3131
                       }

# map the probability of fraud to the device_acitivtiy_df, as a new column 'FE_01_device_os_emails_prob'
data['FE_02_keep_alive_device_emails_prob'] = data['FE_02'].map(FE_02_prob_mappping)

# check shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 35)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_02'] = "A"; group02['FE_02'] = "B"; group03['FE_02'] = "C"; group04['FE_02'] = "D"; group05['FE_02'] = "E"; group06['FE_02'] = "F"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_02'] = "A"; group02['FE_02'] = "B"; group03['FE_02'] = "C"; group04['FE_02'] = "D"; group05['FE_02'] = "E"; group06['FE_02'] = "F"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,month,FE_01,FE_01_device_os_emails_prob,FE_02,FE_02_keep_alive_device_emails_prob
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,INTERNET,6.318178,windows,0,0,0,A,0.6047,A,0.352
11515,1,0.8,0.639234,-1,372,20,0.000372,-1.0,AB,1303,...,INTERNET,2.479942,windows,0,0,0,A,0.6047,A,0.352
23349,1,0.8,0.189285,-1,140,50,4.834888,-1.0,AC,1945,...,INTERNET,2.919222,windows,0,0,0,A,0.6047,A,0.352
44832,1,0.2,0.067415,-1,85,60,0.014106,-1.0,AD,1865,...,INTERNET,4.589128,windows,0,0,0,A,0.6047,A,0.352
92505,1,0.2,0.776447,-1,77,50,0.847151,-1.0,AC,1265,...,INTERNET,38.778095,windows,0,0,0,A,0.6047,A,0.352


## Feature 3: `FE_03_source_foreign_request_prob`

In [7]:
# divide into 4 groups, one for each unique permutation of foreign_request and source
group01 = data[(data['source'] == "INTERNET") & (data['foreign_request'] == 0)]
group02 = data[(data['source'] == "INTERNET") & (data['foreign_request'] == 1)]
group04 = data[(data['source'] == "TELEAPP") & (data['foreign_request'] == 0)]
group03 = data[(data['source'] == "TELEAPP") & (data['foreign_request'] == 1)]

# label each group from "A" to "D", label_name = "FE_03"
group01['FE_03'] = "A"; group02['FE_03'] = "B"; group03['FE_03'] = "C"; group04['FE_03'] = "D"


# concatenate all the group
data = pd.concat([group01, group02, group03, group04])

# change 'FE_02' to category
data['FE_03'] = data['FE_03'].astype('category')

# generate mapping
FE_03_prob_mappping = {"A": 0.1627,
                       "B": 0.2782,
                       "C": 0.2448,
                       "D": 0.5000
                       }

# map the probability of fraud to the device_acitivtiy_df, as a new column 'FE_01_device_os_emails_prob'
data['FE_03_source_foreign_request_prob'] = data['FE_03'].map(FE_03_prob_mappping)

# check shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 37)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_03'] = "A"; group02['FE_03'] = "B"; group03['FE_03'] = "C"; group04['FE_03'] = "D"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_03'] = "A"; group02['FE_03'] = "B"; group03['FE_03'] = "C"; group04['FE_03'] = "D"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_03'

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,device_os,keep_alive_session,device_distinct_emails_8w,month,FE_01,FE_01_device_os_emails_prob,FE_02,FE_02_keep_alive_device_emails_prob,FE_03,FE_03_source_foreign_request_prob
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,windows,0,0,0,A,0.6047,A,0.352,A,0.1627
23349,1,0.8,0.189285,-1,140,50,4.834888,-1.0,AC,1945,...,windows,0,0,0,A,0.6047,A,0.352,A,0.1627
44832,1,0.2,0.067415,-1,85,60,0.014106,-1.0,AD,1865,...,windows,0,0,0,A,0.6047,A,0.352,A,0.1627
92505,1,0.2,0.776447,-1,77,50,0.847151,-1.0,AC,1265,...,windows,0,0,0,A,0.6047,A,0.352,A,0.1627
111006,1,0.9,0.646771,-1,238,40,0.011138,41.912162,AA,1372,...,windows,0,0,0,A,0.6047,A,0.352,A,0.1627


## Feature 4: `FE_04_device_os_foreign_request_prob`

In [8]:
# group into 4 groups
# divide into 4 groups, one for each unique permutation of foreign_request and source
group01 = data[(data['device_os'] == "windows") & (data['foreign_request'] == 0)]
group02 = data[(data['device_os'] == "windows") & (data['foreign_request'] == 1)]
group03 = data[(data['device_os'] == "macintosh") & (data['foreign_request'] == 0)]
group04 = data[~data.index.isin(group01.index) & ~data.index.isin(group02.index) & ~data.index.isin(group03.index)]

# label each group from "A" to "D", label_name = "FE_03"
group01['FE_04'] = "A"; group02['FE_04'] = "B"; group03['FE_04'] = "C"; group04['FE_04'] = "D"

# concatenate all the group
data = pd.concat([group01, group02, group03, group04])

# change 'FE_04' to category
data['FE_04'] = data['FE_04'].astype('category')

# generate mapping
FE_04_prob_mappping = {"A": 0.3027,
                       "B": 0.4605,
                       "C": 0.2010,
                       "D": 0.0916
                       }

# map the probability of fraud to the device_acitivtiy_df, as a new column 'FE_01_device_os_emails_prob'
data['FE_04_device_os_foreign_request_prob'] = data['FE_04'].map(FE_04_prob_mappping)

# check shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 39)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_04'] = "A"; group02['FE_04'] = "B"; group03['FE_04'] = "C"; group04['FE_04'] = "D"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_04'] = "A"; group02['FE_04'] = "B"; group03['FE_04'] = "C"; group04['FE_04'] = "D"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group01['FE_04'

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,device_distinct_emails_8w,month,FE_01,FE_01_device_os_emails_prob,FE_02,FE_02_keep_alive_device_emails_prob,FE_03,FE_03_source_foreign_request_prob,FE_04,FE_04_device_os_foreign_request_prob
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,0,0,A,0.6047,A,0.352,A,0.1627,A,0.3027
23349,1,0.8,0.189285,-1,140,50,4.834888,-1.0,AC,1945,...,0,0,A,0.6047,A,0.352,A,0.1627,A,0.3027
44832,1,0.2,0.067415,-1,85,60,0.014106,-1.0,AD,1865,...,0,0,A,0.6047,A,0.352,A,0.1627,A,0.3027
92505,1,0.2,0.776447,-1,77,50,0.847151,-1.0,AC,1265,...,0,0,A,0.6047,A,0.352,A,0.1627,A,0.3027
111006,1,0.9,0.646771,-1,238,40,0.011138,41.912162,AA,1372,...,0,0,A,0.6047,A,0.352,A,0.1627,A,0.3027


## Feature 5: `FE_05_device_os_prob`

In [9]:
# generate mapping
FE_05_prob_mappping = {"windows": 0.3083,
                       "macintosh": 0.2041,
                       "linux": 0.0001,
                       "other": 0.0001,
                       "x11": 0.0001
                       }

# map the probability of fraud to the device_acitivtiy_df, as a new column 'FE_01_device_os_emails_prob'
data['FE_05_device_os_prob'] = data['device_os'].map(FE_05_prob_mappping)

# check shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 40)


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,month,FE_01,FE_01_device_os_emails_prob,FE_02,FE_02_keep_alive_device_emails_prob,FE_03,FE_03_source_foreign_request_prob,FE_04,FE_04_device_os_foreign_request_prob,FE_05_device_os_prob
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,0,A,0.6047,A,0.352,A,0.1627,A,0.3027,0.3083
23349,1,0.8,0.189285,-1,140,50,4.834888,-1.0,AC,1945,...,0,A,0.6047,A,0.352,A,0.1627,A,0.3027,0.3083
44832,1,0.2,0.067415,-1,85,60,0.014106,-1.0,AD,1865,...,0,A,0.6047,A,0.352,A,0.1627,A,0.3027,0.3083
92505,1,0.2,0.776447,-1,77,50,0.847151,-1.0,AC,1265,...,0,A,0.6047,A,0.352,A,0.1627,A,0.3027,0.3083
111006,1,0.9,0.646771,-1,238,40,0.011138,41.912162,AA,1372,...,0,A,0.6047,A,0.352,A,0.1627,A,0.3027,0.3083


In [10]:
# drop labelled columns 'FE_01' to 'FE_05'
data.drop(columns=['FE_01','FE_02','FE_03','FE_04'], inplace=True)

In [11]:
# print shape and head
print(f'data shape: {data.shape}')
data.head()

data shape: (65970, 36)


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,month,FE_01_device_os_emails_prob,FE_02_keep_alive_device_emails_prob,FE_03_source_foreign_request_prob,FE_04_device_os_foreign_request_prob,FE_05_device_os_prob
5351,1,0.2,0.445712,-1,166,30,0.022266,-1.0,AC,2881,...,6.318178,windows,0,0,0,0.6047,0.352,0.1627,0.3027,0.3083
23349,1,0.8,0.189285,-1,140,50,4.834888,-1.0,AC,1945,...,2.919222,windows,0,0,0,0.6047,0.352,0.1627,0.3027,0.3083
44832,1,0.2,0.067415,-1,85,60,0.014106,-1.0,AD,1865,...,4.589128,windows,0,0,0,0.6047,0.352,0.1627,0.3027,0.3083
92505,1,0.2,0.776447,-1,77,50,0.847151,-1.0,AC,1265,...,38.778095,windows,0,0,0,0.6047,0.352,0.1627,0.3027,0.3083
111006,1,0.9,0.646771,-1,238,40,0.011138,41.912162,AA,1372,...,5.06028,windows,0,0,0,0.6047,0.352,0.1627,0.3027,0.3083
