In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import pandas_exploration_util.viz.explore as pe
import seaborn as sns
import pandas_exploration_util.stats as ps
from sklearn.model_selection import train_test_split,GridSearchCV
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Load the name-matching table and keep a single row per 'Seq', so the
# left join below yields exactly one output row per account-profile row.
ids = pd.read_csv('name_matching.csv').drop_duplicates(subset='Seq')

# Attach the name-matching columns to every account profile, then turn the
# value -102 into NaN everywhere.
# NOTE(review): -102 is presumably a "missing" sentinel in the raw export — confirm.
data = (
    pd.read_csv('account_profile.csv')
    .merge(ids, on='Seq', how='left')
    .replace(-102, np.nan)
)

In [3]:
# Indicate the missingness of selected features, then impute them.
#
# For each column below a 0/1 '<column>_Missing' indicator is added first
# (1 where the value is null), and only afterwards are the nulls replaced
# with the listed fill value, so the indicator records the original gaps.
# NOTE(review): 100 for FirstName/LastName presumably sits on the scale of a
# name-match score — confirm against the data dictionary.
fill_values = {
    'simChangeFreq': 0,
    'deviceChangeFreq': 0,
    'FirstName': 100,
    'LastName': 100,
}

# Dict order is preserved, so the indicator columns are appended in the
# same order as before (it determines the feature order fed to the model).
for column in fill_values:
    data[column + '_Missing'] = data[column].isnull().astype(int)

# Fill at frame level instead of `data[col].fillna(..., inplace=True)`:
# the in-place call on a column selection is chained assignment, which is
# deprecated and does not modify `data` under pandas Copy-on-Write. The
# global warnings filter in the import cell would hide that completely.
data = data.fillna(value=fill_values)

In [4]:
# Every value that is still missing becomes the literal category
# 'Not_Avaliable' (spelling kept: it ends up in the dummy column names).
# get_dummies is called without `columns`, so pandas picks the columns to
# encode itself; the prefix list must contain one entry per encoded column.
categorical_prefixes = [
    'AccountStatus', 'AccountType', 'AccountClass',
    'CarrierName', 'BrandName', 'deviceMake',
]
data = pd.get_dummies(
    data.fillna('Not_Avaliable'),
    prefix=categorical_prefixes,
    drop_first=True,  # drop the first level of each encoded column
)

In [5]:
# Hold out half of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified on 'IsFraud', so the share of
# fraud rows may differ between the two halves.
data_train, data_test = train_test_split(
    data,
    test_size=0.5,
    random_state=10,
)

In [6]:
# Separate features from the target. 'Seq' (the merge key) and the label
# 'IsFraud' are excluded from the feature matrices.
non_feature_columns = ['Seq', 'IsFraud']

X, Y = data_train.drop(columns=non_feature_columns), data_train['IsFraud']
X_test, Y_test = data_test.drop(columns=non_feature_columns), data_test['IsFraud']

In [7]:
# Fit the scaler on the training features only, then apply the same fitted
# transformation to both splits so no test statistics leak into training.
scaler = RobustScaler().fit(X)
x_train, x_test = scaler.transform(X), scaler.transform(X_test)

In [8]:
# ros = RandomOverSampler(random_state=10)
# x_train,y_train = ros.fit_resample(x_train,y_train)

In [9]:
# Randomly undersample the training data (seeded for repeatability).
#
# The sampler input is derived from `scaler` and `X` instead of reading the
# `x_train` name that this cell itself overwrites. The previous form,
# `fit_resample(x_train, Y)`, failed when the cell was run a second time:
# by then `x_train` was already resampled and no longer matched the length
# of `Y`. The first-run result is unchanged.
rus = RandomUnderSampler(random_state=10)
x_train, y_train = rus.fit_resample(scaler.transform(X), Y)

In [10]:
# model = CatBoostClassifier(n_estimators=1000,class_weights=[1,100])

In [11]:
# parameters = {
#     'learning_rate': [0.1,0.05,0.02],
#     'depth':[6,8,10],
#     'rsm':[0.3,0.5,1],
# }

In [12]:
# GSCV = GridSearchCV(estimator = model, param_grid=parameters,cv = 3,scoring='balanced_accuracy')

In [13]:
# GSCV.fit(x_train,y_train)

In [14]:
# Gradient-boosted classifier with fixed hyper-parameters (the values lie
# inside the grid sketched in the commented-out search cells above).
# NOTE(review): class_weights=[1, 100] is applied on top of the randomly
# undersampled training set — confirm the double re-balancing is intended.
model = CatBoostClassifier(
    n_estimators=2000,
    learning_rate=0.1,
    class_weights=[1, 100],
    depth=10,
    rsm=0.5,
)

In [15]:
# Train on the undersampled training data. With metric_period=200 the
# learning log below is printed every 200 iterations.
model.fit(
    x_train,
    y_train,
    metric_period=200,
)

0:	learn: 0.5399761	total: 215ms	remaining: 7m 9s
200:	learn: 0.0393947	total: 12.1s	remaining: 1m 48s
400:	learn: 0.0261732	total: 36.1s	remaining: 2m 24s
600:	learn: 0.0195823	total: 59.1s	remaining: 2m 17s
800:	learn: 0.0159798	total: 1m 21s	remaining: 2m 1s
1000:	learn: 0.0138598	total: 1m 42s	remaining: 1m 42s
1200:	learn: 0.0123314	total: 2m 3s	remaining: 1m 22s
1400:	learn: 0.0112881	total: 2m 24s	remaining: 1m 1s
1600:	learn: 0.0103109	total: 2m 45s	remaining: 41.3s
1800:	learn: 0.0094738	total: 3m 9s	remaining: 20.9s
1999:	learn: 0.0088812	total: 3m 29s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1a24d45f28>

In [16]:
# Per-class probabilities for every row of the scaled test features; the
# next cell picks one column out of each row.
yp = model.predict_proba(x_test)

In [17]:
# Keep only the second probability of each row (index 1). The labels in
# this notebook are 0 and 2, so this is presumably the probability of
# class 2 (fraud) — confirm against model.classes_.
yp = [class_probs[1] for class_probs in yp]

In [18]:
# Cut-off at the 90th percentile of the test-set scores, so roughly the
# top 10% highest-scoring rows end up flagged.
thrh = np.percentile(a=yp, q=90)

In [19]:
# Turn scores into labels on the same 0/2 coding as 'IsFraud':
# strictly below the cut-off -> 0, everything else -> 2.
yp = np.where(np.asarray(yp) < thrh, 0, 2).tolist()

In [21]:
# Per-class precision / recall / F1 of the thresholded predictions against
# the held-out labels.
report_text = classification_report(y_true=Y_test, y_pred=yp)
print(report_text)

              precision    recall  f1-score   support

           0       0.98      0.90      0.94      2615
           2       0.05      0.21      0.08        67

    accuracy                           0.88      2682
   macro avg       0.51      0.56      0.51      2682
weighted avg       0.95      0.88      0.92      2682



In [22]:
# Store the 0/2 predictions next to the test rows (positional alignment:
# `yp` was produced from the rows of this same split, in order).
data_test = data_test.assign(yp=yp)

In [23]:
# Total amount of all test transactions labelled as fraud (IsFraud == 2).
# This is the baseline the "avoided loss" percentages are measured against.
#
# A boolean mask with .loc replaces the former row-by-row Python loop with
# chained indexing (data_test[col][i]): one vectorized pass, no per-row
# label look-ups. The float summation order may differ from the loop in the
# last bits, which the 2-decimal rounding below absorbs.
is_fraud = data_test['IsFraud'] == 2

# The name `save_money` is kept from the original cell for any later use,
# although at this point it holds the total fraud amount, not a saving.
save_money = data_test.loc[is_fraud, 'x_amount'].sum()
total_loss = round(save_money, 2)
print('Total loss Caused by Fraud Transactions: $'+str(total_loss))

Total loss Caused by Fraud Transactions: $121712.31


In [24]:
# Amount of fraud that the percentile-flagged predictions would have
# caught: rows that are both predicted (yp == 2) and labelled (IsFraud == 2)
# as fraud. Reads `total_loss` from the previous cell.
#
# The boolean mask with .loc replaces the former per-row loop with chained
# indexing (data_test[col][i]).
caught_fraud = (data_test['yp'] == 2) & (data_test['IsFraud'] == 2)
save_money = data_test.loc[caught_fraud, 'x_amount'].sum()
avoided_loss = round(save_money, 2)
print('Total avoid loss: $'+str(avoided_loss))

# Guard the ratio: if the test split holds no fraud amount at all the
# percentage is undefined, so report NaN instead of dividing by zero.
avoided_pct = round((avoided_loss/total_loss)*100,2) if total_loss else float('nan')
print('loss avoid percentage: '+ str(avoided_pct)+'%')

Total avoid loss: $45923.35
loss avoid percentage: 37.73%


In [25]:
# Recompute the raw scores for the second, amount-weighted experiment:
# the probability at index 1 of each predict_proba row.
yp = [class_probs[1] for class_probs in model.predict_proba(x_test)]

In [26]:
# Replace the 'yp' column with the raw scores (positional alignment with
# the test rows).
data_test = data_test.assign(yp=yp)

In [27]:
# Weight each score by the transaction amount, so the ranking reflects
# score x amount instead of the score alone.
#
# The column-wise product replaces a per-row list comprehension that used
# chained indexing (data_test[col][i]); the values are the same.
# NOTE: this overwrites 'yp' in place, so running the cell twice multiplies
# by the amount twice — re-run the two preceding cells first.
data_test['yp'] = data_test['yp'] * data_test['x_amount']

In [28]:
# Cut-off at the 90th percentile of the amount-weighted scores of the test
# rows.
thrh = np.percentile(a=data_test['yp'], q=90)

In [29]:
# Convert the weighted scores to the 0/2 label coding:
# strictly below the cut-off -> 0, everything else -> 2.
data_test['yp'] = np.where(data_test['yp'] < thrh, 0, 2).tolist()

In [30]:
# Number of test rows flagged (yp == 2) under the weighted cut-off.
int((data_test['yp'] == 2).sum())

269

In [31]:
# Amount of fraud caught by the amount-weighted flags: rows that are both
# flagged (yp == 2) and labelled (IsFraud == 2) as fraud. Reads
# `total_loss` from the earlier baseline cell.
#
# The boolean mask with .loc replaces the former per-row loop with chained
# indexing (data_test[col][i]).
caught_fraud = (data_test['yp'] == 2) & (data_test['IsFraud'] == 2)
save_money = data_test.loc[caught_fraud, 'x_amount'].sum()
avoided_loss = round(save_money, 2)
print('Total avoid loss: $'+str(avoided_loss))

# Guard the ratio: if the test split holds no fraud amount at all the
# percentage is undefined, so report NaN instead of dividing by zero.
avoided_pct = round((avoided_loss/total_loss)*100,2) if total_loss else float('nan')
print('loss avoid percentage: '+ str(avoided_pct)+'%')

Total avoid loss: $65877.78
loss avoid percentage: 54.13%
