In [667]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [668]:
# Account-level features, one CSV row per record.
data = pd.read_csv('account_profile.csv')
# Keep a single row per Seq so the left merge on Seq that follows cannot duplicate account rows.
ids = pd.read_csv('name_matching.csv').drop_duplicates(subset='Seq')

In [671]:
# Left join: every row of `data` is kept; rows with no match in `ids` get NaN in the joined columns.
data = data.merge(ids, how='left', on='Seq')

In [672]:
# NOTE(review): -102 appears to be a sentinel for "missing" in the raw data; confirm with the data source.
data = data.replace(to_replace=-102, value=np.nan)

Add indicator columns that flag missing values in the numeric features

In [673]:
# For each listed feature add a `<feature>_Missing` column: 1 where the value is NaN, 0 otherwise.
for column in ['simChangeFreq', 'deviceChangeFreq', 'FirstName', 'LastName']:
    data[column + '_Missing'] = data[column].isnull().astype(int)

Fill NaN with the most frequent values

In [674]:
# Constant used to fill the NaNs of each numeric feature.
fill_values = {
    'FirstName': 100,
    'LastName': 100,
    'simChangeFreq': 0,
    'deviceChangeFreq': 0,
}
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form acts on an intermediate object, which
# pandas deprecates and which leaves `data` unchanged under Copy-on-Write.
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

In [675]:
# Any NaN still left in the frame gets the placeholder string below.
# The spelling of the placeholder is runtime data and is left exactly as it was.
data = data.fillna('Not_Avaliable')

In [678]:
# No `columns=` is passed, so get_dummies encodes every object/category column;
# the prefix list has to line up, in order, with those columns.
dummy_prefixes = ['AccountStatus', 'AccountType', 'AccountClass', 'CarrierName',
                  'BrandName', 'deviceMake']
data = pd.get_dummies(data, prefix=dummy_prefixes, drop_first=True)

In [680]:
# Partition the rows into five x_amount buckets with half-open ranges [low, high).
amount = data['x_amount']
data1 = data[amount.lt(500)]
data2 = data[amount.ge(500) & amount.lt(2000)]
data3 = data[amount.ge(2000) & amount.lt(3000)]
data4 = data[amount.ge(3000) & amount.lt(4000)]
data5 = data[amount.ge(4000)]

In [727]:
def isof(data,n_estimators,max_samples,contamination):
    """Fit an IsolationForest on ``data`` and compare its outliers with ``IsFraud``.

    Every column except ``Seq`` and ``IsFraud`` (and ``yp`` when the frame
    already carries one) is scaled with ``RobustScaler`` and used as a feature.
    The forest is fitted and evaluated on the same rows. A classification
    report against ``IsFraud`` and the number of flagged rows are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Seq`` and ``IsFraud``; all remaining columns must be
        accepted by ``RobustScaler``. The frame is not modified.
    n_estimators, max_samples, contamination
        Passed straight to ``IsolationForest``.
        NOTE(review): the scikit-learn documentation gives the valid range of a
        float ``contamination`` as (0, 0.5]; values outside that range may be
        rejected depending on the installed version -- confirm.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added ``yp`` column: 2 for rows the forest
        flags as outliers, 0 for all other rows.
    """
    # Work on a copy: writing 'yp' into the caller's frame would change it as a
    # side effect and, on a second call, turn the old predictions into a feature.
    df = data.copy()
    X = df.drop(['Seq','IsFraud'],axis=1)
    if 'yp' in X.columns:
        # Predictions from an earlier run must never be used as an input feature.
        X = X.drop('yp',axis=1)
    scaler = RobustScaler()
    x = scaler.fit_transform(X)
    clf = IsolationForest(n_estimators=n_estimators,
                          max_samples=max_samples,
                          contamination=contamination,
                          random_state=10)
    clf.fit(x)
    yp = clf.predict(x)
    # predict() returns -1 for outliers and 1 for inliers; recode to 2 / 0,
    # the same codes the original replace(1,0) / replace(-1,2) produced.
    df['yp'] = np.where(yp == -1, 2, 0)
    print(classification_report(df['IsFraud'],df['yp']))
    print('number of detected='+str(len(df[df['yp']==2])))
    return df
                        

In [682]:
# Sum x_amount over every row labelled as fraud (IsFraud == 2).
# A boolean mask replaces the row-by-row Python loop and its per-row label lookups.
fraud_rows = data['IsFraud'] == 2
save_money = data.loc[fraud_rows, 'x_amount'].sum()
print('Total loss Caused by Fraud Transactions: $'+str(round(save_money,2)))
# Denominator for the "loss avoid percentage" figures printed by later cells.
total_loss = round(save_money,2)

Total loss Caused by Fraud Transactions: $231658.32


In [694]:
# Fraud amount contained in a seeded random 10% sample of the rows.
data_sample = data.sample(frac=0.1,random_state=111)
# Boolean mask instead of a row-by-row Python loop.
sampled_fraud = data_sample['IsFraud'] == 2
save_money = data_sample.loc[sampled_fraud, 'x_amount'].sum()
print('Total avoid loss: $'+str(round(save_money,2)))

print('loss avoid percentage: '+ str(round((round(save_money,2)/total_loss)*100,2))+'%')

Total avoid loss: $25564.54
loss avoid percentage: 11.04%


In [740]:
# Score the full, unbucketed frame.
data0 = isof(data, n_estimators=2000, max_samples=5400, contamination=0.1)

              precision    recall  f1-score   support

           0       0.98      0.91      0.94      5233
           2       0.07      0.31      0.12       130

    accuracy                           0.89      5363
   macro avg       0.53      0.61      0.53      5363
weighted avg       0.96      0.89      0.92      5363

number of detected=537


In [741]:

# Avoided loss = x_amount of the rows that are both flagged (yp == 2) and truly fraud (IsFraud == 2).
# The element-wise mask replaces the row-by-row Python loop.
caught_fraud = (data0['yp'] == 2) & (data0['IsFraud'] == 2)
save_money = data0.loc[caught_fraud, 'x_amount'].sum()
print('Total avoid loss: $'+str(round(save_money,2)))

print('loss avoid percentage: '+ str(round((round(save_money,2)/total_loss)*100,2))+'%')

Total avoid loss: $117945.94
loss avoid percentage: 50.91%


In [730]:
# Bucket 1 (x_amount < 500).
data1_p = isof(data1, n_estimators=2000, max_samples=5400, contamination=0)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1198
           2       0.00      0.00      0.00        14

    accuracy                           0.99      1212
   macro avg       0.49      0.50      0.50      1212
weighted avg       0.98      0.99      0.98      1212

number of detected=0


In [731]:
# Bucket 2 (500 <= x_amount < 2000).
data2_p = isof(data2, n_estimators=2000, max_samples=5400, contamination=0.05)

              precision    recall  f1-score   support

           0       0.98      0.95      0.96      3373
           2       0.06      0.12      0.08        82

    accuracy                           0.93      3455
   macro avg       0.52      0.54      0.52      3455
weighted avg       0.96      0.93      0.94      3455

number of detected=173


In [732]:
# Bucket 3 (2000 <= x_amount < 3000).
data3_p = isof(data3, n_estimators=2000, max_samples=5400, contamination=0.5)

              precision    recall  f1-score   support

           0       0.99      0.52      0.68       272
           2       0.08      0.85      0.14        13

    accuracy                           0.53       285
   macro avg       0.53      0.68      0.41       285
weighted avg       0.94      0.53      0.65       285

number of detected=142


In [733]:
# Bucket 4 (3000 <= x_amount < 4000).
data4_p = isof(data4, n_estimators=2000, max_samples=5400, contamination=0.5)

              precision    recall  f1-score   support

           0       0.99      0.51      0.68       345
           2       0.05      0.90      0.10        10

    accuracy                           0.52       355
   macro avg       0.52      0.71      0.39       355
weighted avg       0.97      0.52      0.66       355

number of detected=177


In [734]:
# Bucket 5 (x_amount >= 4000).
data5_p = isof(data5, n_estimators=2000, max_samples=5400, contamination=0.8)

              precision    recall  f1-score   support

           0       1.00      0.27      0.42        45
           2       0.25      1.00      0.40        11

    accuracy                           0.41        56
   macro avg       0.62      0.63      0.41        56
weighted avg       0.85      0.41      0.42        56

number of detected=44


In [735]:
# Stack the five scored buckets back into one frame, keeping the original row index.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated in
# pandas 1.4 and removed in 2.0, and each chained call copied the growing frame.
data_p = pd.concat([data1_p, data2_p, data3_p, data4_p, data5_p])

In [736]:
# Classification report over the recombined per-bucket predictions.
combined_report = classification_report(data_p['IsFraud'], data_p['yp'])
print(combined_report)

              precision    recall  f1-score   support

           0       0.98      0.91      0.94      5233
           2       0.08      0.32      0.12       130

    accuracy                           0.89      5363
   macro avg       0.53      0.61      0.53      5363
weighted avg       0.96      0.89      0.92      5363



In [737]:
# Avoided loss for the bucketed models: x_amount of the rows that are both
# flagged (yp == 2) and truly fraud (IsFraud == 2).
# The element-wise mask replaces the row-by-row Python loop.
caught_fraud = (data_p['yp'] == 2) & (data_p['IsFraud'] == 2)
save_money = data_p.loc[caught_fraud, 'x_amount'].sum()
print('Total avoid loss: $'+str(round(save_money,2)))

print('loss avoid percentage: '+ str(round((round(save_money,2)/total_loss)*100,2))+'%')

Total avoid loss: $129321.23
loss avoid percentage: 55.82%
