In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the raw transactions from the gzip-compressed CSV into the working frame.
df = pd.read_csv(
    'transactions.gz',
    compression='gzip',
)

In [None]:
# Missing values in the two country-code columns get the explicit placeholder
# 'UNK'; other columns are left untouched.
df.fillna(
    value=dict.fromkeys(('merchantCountryCode', 'acqCountry'), 'UNK'),
    inplace=True,
)

In [None]:
# Remove columns the author identified as empty or correlated with others.
df.drop(
    columns=[
        'Unnamed: 0',
        'merchantCity',
        'merchantState',
        'merchantZip',
        'echoBuffer',
        'posOnPremises',
        'recurringAuthInd',
        'enteredCVV',
        'customerId',
        'availableMoney',
    ],
    inplace=True,
)

### Adding Features

In [None]:

# A single LabelEncoder is re-fitted for every column; only the integer codes
# are kept, so the per-column mappings are not retained afterwards.
le = LabelEncoder()

# Columns that are overwritten in place with their integer codes
# (this includes the target column 'isFraud').
var = [
    'merchantName',
    'merchantCountryCode',
    'posEntryMode',
    'posConditionCode',
    'merchantCategoryCode',
    'transactionType',
    'cardPresent',
    'expirationDateKeyInMatch',
    'isFraud',
]
for col in var:
    encoded = le.fit_transform(df[col])
    df[col] = encoded

# 'acqCountry' keeps its original values; its codes go into a separate column.
df['acqCountry_encode'] = le.fit_transform(df['acqCountry'])

In [None]:
# Parse the three date/time columns into pandas datetime values so the `.dt`
# accessor can be used on them.
for date_col in ('transactionDateTime', 'accountOpenDate', 'dateOfLastAddressChange'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Extract calendar/time components from the datetime columns into new
# '<column>_<part>' feature columns.

# The transaction timestamp is broken down to the second.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[f'transactionDateTime_{part}'] = getattr(df['transactionDateTime'].dt, part)

# The two date columns only contribute year, month and day.
for source_col in ('accountOpenDate', 'dateOfLastAddressChange'):
    for part in ('year', 'month', 'day'):
        df[f'{source_col}_{part}'] = getattr(df[source_col].dt, part)

In [None]:
# Drop the raw date/time columns now that their components have been extracted.
# 'currentExpDate' is removed as well; no features are derived from it in the
# cells shown here.
df.drop(
    columns=[
        'transactionDateTime',
        'currentExpDate',
        'accountOpenDate',
        'dateOfLastAddressChange',
    ],
    inplace=True,
)

### Running Local Model - Population 

In [None]:


# Features are every column except the label and the raw 'acqCountry' values
# (their integer-coded copy 'acqCountry_encode' stays in the feature set).
X = df.drop(columns=['isFraud', 'acqCountry'])
y = df['isFraud']

# IsolationForest is unsupervised: the labels are never shown to the model.
# fit_predict returns -1 for points flagged as outliers and +1 for inliers.
preds = IsolationForest(random_state=42).fit_predict(X)
preds_bool = preds == -1   # True where the model flags an anomaly
y_bool = y.values == 1     # True where the encoded label is 1 (presumably fraud - confirm encoding)

# Score the positive (flagged / label == 1) class only. With average='weighted'
# the support-weighted recall is mathematically identical to accuracy and all
# three metrics are dominated by the majority class, which hides how well the
# positive class is actually detected.
acc = accuracy_score(y_bool, preds_bool)
precision = precision_score(y_bool, preds_bool, average='binary', pos_label=True)
recall = recall_score(y_bool, preds_bool, average='binary', pos_label=True)
f1 = f1_score(y_bool, preds_bool, average='binary', pos_label=True)

print(f'Acc: {acc}, Precision: {precision}, Recall: {recall}, F1: {f1}')

print('*'*20)

### Running Local Model - Single Client

In [None]:

# Account whose transactions are modelled in isolation.
client = 380680241
print(client)

# Select this account's rows once and reuse the selection for features and label.
client_rows = df.loc[df.accountNumber == client]
X_client = client_rows.drop(columns=['isFraud', 'acqCountry'])
y_client = client_rows['isFraud']
print(X_client.shape[0])

# Fail with a clear message instead of an error from inside scikit-learn when
# the account number does not occur in the data.
assert not client_rows.empty, f'no transactions found for account {client}'

# Fit a fresh IsolationForest on this account only; -1 marks flagged outliers.
preds = IsolationForest(random_state=42).fit_predict(X_client)
preds_bool = preds == -1          # True where the model flags an anomaly
y_bool = y_client.values == 1     # True where the encoded label is 1 (presumably fraud - confirm encoding)

# Score the positive class only: with average='weighted' the recall equals the
# accuracy and every metric is dominated by the majority class.
# NOTE: if this account has no positive labels (or nothing is flagged),
# scikit-learn reports the undefined metric as 0 and emits a warning.
acc = accuracy_score(y_bool, preds_bool)
precision = precision_score(y_bool, preds_bool, average='binary', pos_label=True)
recall = recall_score(y_bool, preds_bool, average='binary', pos_label=True)
f1 = f1_score(y_bool, preds_bool, average='binary', pos_label=True)

print(f'Acc: {acc}, Precision: {precision}, Recall: {recall}, F1: {f1}')

print('*'*20)

    