# Task 2B - Find Likelihood to be Bad Actor

## Library and Pre-process

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# Read main dataset
df = pd.read_csv('UofT_nodes.csv')
df_bad_list = pd.read_csv('badactor_foundin_kyc_bt_match.csv')

In [3]:
# Change all to categorial
df['COUNTRY_RISK_INCOME'] = df['COUNTRY_RISK_INCOME'].astype('category').cat.codes
df['COUNTRY_RISK_RESIDENCY'] = df['COUNTRY_RISK_RESIDENCY'].astype('category').cat.codes
df['GENDER'] = df['GENDER'].astype('category').cat.codes

df['RISK'] = df['RISK'].replace(['low'], 0)
df['RISK'] = df['RISK'].replace(['medium'], 1)
df['RISK'] = df['RISK'].replace(['high'], 2)

In [4]:
# Convert Key(Customer ID) to int for join
df.CUSTOMER_ID = df.CUSTOMER_ID.astype(float).astype(int)
df_bad_list.CUSTOMER_ID = df_bad_list.CUSTOMER_ID.astype(float).astype(int)

# Change all date to date format
df.BIRTH_DT = pd.to_datetime(df.BIRTH_DT)
df.CUST_ADD_DT = pd.to_datetime(df.CUST_ADD_DT)

# Fill NaN with value different from the other
df.CUST_ADD_DT.fillna(pd.Timestamp('1800-01-01'), inplace=True)
df.OCPTN_NM.fillna(-1, inplace=True)

# Convert all dates to Unix timestamps for modeling
df['BIRTH_TS'] = df['BIRTH_DT'].apply(lambda x: x.timestamp())
df['CUST_ADD_TS'] = df['CUST_ADD_DT'].apply(lambda x: x.timestamp())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 22 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   BIRTH_DT                1000000 non-null  datetime64[ns]
 1   CUST_ADD_DT             1000000 non-null  datetime64[ns]
 2   OCPTN_NM                1000000 non-null  float64       
 3   RES_CNTRY_CA            1000000 non-null  int64         
 4   CNTRY_OF_INCOME_CA      1000000 non-null  int64         
 5   PEP_FL                  1000000 non-null  float64       
 6   CASH_SUM_IN             1000000 non-null  float64       
 7   CASH_CNT_IN             1000000 non-null  float64       
 8   CASH_SUM_OUT            1000000 non-null  float64       
 9   CASH_CNT_OUT            1000000 non-null  float64       
 10  WIRES_SUM_IN            1000000 non-null  float64       
 11  WIRES_CNT_IN            1000000 non-null  float64       
 12  WIRES_SUM_OUT  

In [5]:
df_bad_list.CUSTOMER_ID = df_bad_list.CUSTOMER_ID.astype(float).astype(int)
df_bad_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              45 non-null     int64  
 1   BIRTH_DT                45 non-null     object 
 2   CUST_ADD_DT             45 non-null     object 
 3   OCPTN_NM                45 non-null     float64
 4   RES_CNTRY_CA            45 non-null     int64  
 5   CNTRY_OF_INCOME_CA      45 non-null     int64  
 6   PEP_FL                  45 non-null     float64
 7   CASH_SUM_IN             45 non-null     float64
 8   CASH_CNT_IN             45 non-null     float64
 9   CASH_SUM_OUT            45 non-null     float64
 10  CASH_CNT_OUT            45 non-null     float64
 11  WIRES_SUM_IN            45 non-null     float64
 12  WIRES_CNT_IN            45 non-null     float64
 13  WIRES_SUM_OUT           45 non-null     float64
 14  WIRES_CNT_OUT           45 non-null     floa

In [6]:
# Create bad actor label
df_bad_list['BAD_STATUS'] = [True]*df_bad_list.shape[0] # Create True labels for real bad actors 
df = df.merge(df_bad_list[['BAD_STATUS', 'CUSTOMER_ID']], on='CUSTOMER_ID', how='left') # Join
df.BAD_STATUS.fillna(False, inplace=True) # Fill the rest with False labels
df.head()

Unnamed: 0,BIRTH_DT,CUST_ADD_DT,OCPTN_NM,RES_CNTRY_CA,CNTRY_OF_INCOME_CA,PEP_FL,CASH_SUM_IN,CASH_CNT_IN,CASH_SUM_OUT,CASH_CNT_OUT,...,WIRES_CNT_OUT,COUNTRY_RISK_INCOME,COUNTRY_RISK_RESIDENCY,RISK,NAME,GENDER,CUSTOMER_ID,BIRTH_TS,CUST_ADD_TS,BAD_STATUS
0,1981-09-01,2007-07-05,89.0,1,1,0.0,2577.785,2.0,2851.663,7.0,...,35.0,1,1,0,"Young, Marie Mildren Coleman",0,935382,368150400.0,1183594000.0,False
1,1994-02-21,2019-05-19,89.0,1,1,0.0,3035.502,2.0,4805.997,15.0,...,152.0,1,1,0,Mark Stupar Lecy,1,2305,761788800.0,1558224000.0,False
2,1962-11-16,2011-08-02,89.0,1,1,0.0,1617.571,1.0,3482.809,11.0,...,20.0,1,1,0,Dean Glasper Wendel Reeves,1,472403,-224899200.0,1312243000.0,False
3,1998-06-20,2001-08-15,89.0,1,1,0.0,3587.042,5.0,1940.943,8.0,...,238.0,1,1,0,"Hulsey, Linda Rauth",0,227994,898300800.0,997833600.0,False
4,1942-01-24,2012-10-28,89.0,1,1,0.0,1725.524,2.0,13197.169,43.0,...,180.0,1,1,0,Carolyn Washington Roberts,0,418265,-881625600.0,1351382000.0,False


In [7]:
# Check whether merged correctly
set(df_bad_list.CUSTOMER_ID) == set(df[df.BAD_STATUS].CUSTOMER_ID)

True

## Variable Selection

In [8]:
# Set variable will be used for training
column_feature = df.columns[2:16].to_list() + df.columns[20:22].to_list()

In [9]:
df[column_feature].dtypes

OCPTN_NM                  float64
RES_CNTRY_CA                int64
CNTRY_OF_INCOME_CA          int64
PEP_FL                    float64
CASH_SUM_IN               float64
CASH_CNT_IN               float64
CASH_SUM_OUT              float64
CASH_CNT_OUT              float64
WIRES_SUM_IN              float64
WIRES_CNT_IN              float64
WIRES_SUM_OUT             float64
WIRES_CNT_OUT             float64
COUNTRY_RISK_INCOME          int8
COUNTRY_RISK_RESIDENCY       int8
BIRTH_TS                  float64
CUST_ADD_TS               float64
dtype: object

In [10]:
# Split
X = df[column_feature]
y = df.BAD_STATUS

# Calculate the correlation between each feature and the target variable
corr = X.corrwith(y)

# Get the selected feature names
k = 10
indices = corr.abs().sort_values(ascending=False)[:k].index
selected_features = X[indices].columns.tolist()

In [11]:
selected_features

['COUNTRY_RISK_INCOME',
 'CNTRY_OF_INCOME_CA',
 'RES_CNTRY_CA',
 'CASH_SUM_IN',
 'COUNTRY_RISK_RESIDENCY',
 'CASH_CNT_IN',
 'WIRES_CNT_OUT',
 'CASH_CNT_OUT',
 'CASH_SUM_OUT',
 'WIRES_SUM_OUT']

## Modelling with Logistic Regression

In [12]:
# Remove feature names
X = X[selected_features].values

# Create a StratifiedKFold object with 5 folds (Avoid imbalanced issue)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
scaler = StandardScaler()

# Initialize an empty list to store the accuracy scores
acc_scores = []
prec_scores = []
rec_scores = []

for train_index, test_index in skf.split(X, y):
    # split the data into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # normalize the training and testing data using the scaler
    X_train_norm = scaler.fit_transform(X_train)
    X_test_norm = scaler.transform(X_test)
    
    # initialize a LogisticRegression classifier
    lr = LogisticRegression(class_weight='balanced', random_state=7)
    
    # train the classifier on the normalized training data
    lr.fit(X_train_norm, y_train)
    
    # evaluate the classifier on the normalized testing data
    y_pred = lr.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precison = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    acc_scores.append(accuracy)
    prec_scores.append(precison)
    rec_scores.append(recall)

print('Average accuracy:', np.mean(acc_scores))
print('Average precision:', np.mean(prec_scores))
print('Average recall:', np.mean(rec_scores))

Average accuracy: 0.9567309999999999
Average precision: 0.9999101076987091
Average recall: 0.9567309999999999


In [13]:
# Get the whole X back
X_all_norm = scaler.transform(df[selected_features].values)
bad_score = lr.predict_proba(X_all_norm) # This is the final likelihood to be a bad actor

In [14]:
bad_score

array([[0.69436671, 0.30563329],
       [0.62090629, 0.37909371],
       [0.68111191, 0.31888809],
       ...,
       [0.41975386, 0.58024614],
       [0.2782785 , 0.7217215 ],
       [0.00386269, 0.99613731]])