![Money.jpeg](attachment:b75bd037-c330-4f2c-9039-52a7446c3185.jpeg)

# <span style="font-family:cursive;">WHAT IS ANTI MONEY LAUNDERING ? </span>
Anti money laundering (AML) refers to the web of laws, regulations, and procedures aimed at uncovering efforts to disguise illicit funds as legitimate income. Money laundering seeks to conceal crimes ranging from small-time tax evasion and drug trafficking to public corruption and the financing of groups designated as terrorist organizations.


AML legislation was a response to the growth of the financial industry, the lifting of international capital controls and the growing ease of conducting complex chains of financial transactions.

A high-level United Nations panel has estimated annual money laundering flows at $1.6 trillion, accounting for 2.7% of global GDP in 2020.


# Description of Dataset

 * customer_id    : Unique id for customers
 * credit_debit   : Transaction(debit or credit)
 * cpcc           : Country in which the transaction was made
 * risk    : transaction status
 * type : transaction type (cash, cheque, wire)

In [0]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, GridSearchCV,train_test_split, validation_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import  classification_report,confusion_matrix,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score,roc_curve
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('display.float_format', lambda x: '%.2f' %x)

In [0]:
# Load the three sheets of the assignment workbook, join them and clean the result.
DATA_PATH = '/kaggle/input/anti-money-laundering/Data for assignment.xlsx'

# Open the workbook once instead of re-reading the file from disk for every sheet.
with pd.ExcelFile(DATA_PATH) as workbook:
    df_data = workbook.parse(0)  # first sheet, i.e. what read_excel loads by default
    df_country = workbook.parse('country_risk')
    df_product = workbook.parse('product_type')

df_country.columns = ['CPCC', 'risk']
df_product.columns = ['product_type', 'type']

# merge() defaults to an inner join: rows without a matching country / product are dropped.
df1 = df_data.merge(df_country, on='CPCC')
df_ = df1.merge(df_product, on='product_type')
df_.columns = [col.lower() for col in df_.columns]

# Drop the join key that is no longer needed, then every row with a missing value.
df = df_.drop("product_type", axis=1).dropna()
# Keep strictly positive amounts; .copy() so the assignment below targets an independent frame.
df = df.loc[df['amount'] > 0].copy()
# Encode the Y/N flag as 1/0.
df['risk'] = df['risk'].map({"Y": 1, "N": 0})
df.head()


In [0]:
# Helper that prints a quick overview of a dataframe
def check_data(df):
    """Print head, tail, shape, info, columns, describe and null counts of ``df``.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to stdout.
    """
    import io

    separator = "-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-"

    # DataFrame.info() writes its report to a buffer and returns None, so the
    # text has to be captured explicitly; formatting the return value would
    # print "INFO: None".
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)

    print('HEAD {0}'.format(df.head()))
    print(separator)
    print('TAIL {0}'.format(df.tail()))
    print('SHAPE: {0}'.format(df.shape))
    print(separator)
    print('INFO: {0}'.format(info_buffer.getvalue()))
    print(separator)
    print('COLUMNS: {0}'.format(df.columns))
    print(separator)
    print('DESCRIBE {0}'.format(df.describe().T))
    print(separator)
    print("ISNULL {0}".format(df.isnull().sum()))
    print(separator)

In [0]:
check_data(df)

In [0]:
# Classification of users according to their number of operations
custom_list = []
def checking_customers_operations(dataframe, col):
    """Collect the values of ``col`` that occur unusually often.

    A value is collected when its number of occurrences is at least
    Q3 + IQR of the occurrence counts of all values in ``col``.

    Args:
        dataframe: pandas DataFrame holding the operations.
        col: name of the column whose values are counted.

    Returns:
        None. Matching values are appended to the module-level ``custom_list``
        in ``value_counts`` order; calling the function again appends again.
    """
    # Count once and derive the threshold once instead of on every iteration.
    counts = dataframe[col].value_counts()
    upper_quartile = counts.quantile(0.75)
    iqr = upper_quartile - counts.quantile(0.25)
    threshold = upper_quartile + iqr
    custom_list.extend(counts.index[counts >= threshold].tolist())
checking_customers_operations(df, "customer_id")

# Flag every row whose customer was collected in custom_list.
# isin() does one vectorised membership test instead of scanning the Python
# list once per row.
is_frequent_customer = df["customer_id"].isin(custom_list)
df["cus_stat"] = is_frequent_customer.map({True: "dangerous", False: "not_dangerous"})
df["cus_stat"].value_counts()

In [0]:
# Build the list of customers who operate both in a risky country and in one of its neighbours
# (PF(r), WS, TO, CK, FJ, NC ),(TL(r), CX), (YT(r), KM), (RO(r), MD, BG, XS, HU)
list_dangerous_id = []
def checking_customer_id_neigbor(dataframe, country, neig):
    """Append customers seen in both ``country`` and ``neig`` to ``list_dangerous_id``.

    Args:
        dataframe: pandas DataFrame with ``customer_id`` and ``cpcc`` columns.
        country: country code of the risky country.
        neig: country code of the neighbouring country.

    Returns:
        The module-level ``list_dangerous_id`` (the same list object on every
        call). Ids are appended in order of first appearance in ``country``; an
        id matching several country/neighbour pairs is appended once per pair.
    """
    customer_ids = dataframe["customer_id"]
    ids_in_country = customer_ids.loc[dataframe["cpcc"] == country].unique()
    # Build the neighbour lookup once rather than re-filtering the frame for
    # every candidate id.
    ids_in_neighbour = set(customer_ids.loc[dataframe["cpcc"] == neig].unique())
    list_dangerous_id.extend(cid for cid in ids_in_country if cid in ids_in_neighbour)
    return list_dangerous_id

# Risky country -> neighbouring countries that are checked together with it.
risky_country_neighbours = {
    "PF": ("WS", "TO", "CK", "FJ", "NC"),
    "TL": ("CX",),
    "YT": ("KM",),
    "RO": ("MD", "BG", "XS", "HU"),
}
for risky_country, neighbours in risky_country_neighbours.items():
    for neighbour in neighbours:
        checking_customer_id_neigbor(df, risky_country, neighbour)
print(list_dangerous_id)

In [0]:
# Pivot: one row per (customer_id, risk, credit_debit) and one column per
# transaction type, holding the mean amount.
pivot_keys = ["customer_id", 'risk', 'credit_debit']
table = df.pivot_table(index=pivot_keys, columns=["type"],
                       values="amount", aggfunc='mean').reset_index()
table.columns = list(map(str.lower, table.columns))
# A missing key/type combination means no such transaction: use 0.
table = table.fillna(0)
table.head()

In [0]:
# Checking the new dataframe table
check_data(table)

In [0]:
table.describe().T

# FEATURE ENGINEERING

In [0]:
# Customer's operation segmentation
custom_list_table = []
def checking_value_counts(df, col, min_count=3):
    """Collect the values of ``col`` that occur at least ``min_count`` times.

    Args:
        df: pandas DataFrame to inspect. It is now actually used; the previous
            version ignored it and always read the global ``table``.
        col: name of the column whose values are counted.
        min_count: minimum number of occurrences for a value to be collected
            (defaults to 3, the previously hard-coded threshold).

    Returns:
        None. Matching values are appended to the module-level
        ``custom_list_table`` in ``value_counts`` order.
    """
    counts = df[col].value_counts()
    custom_list_table.extend(counts.index[counts >= min_count].tolist())

checking_value_counts(table,"customer_id")
len(custom_list_table)


# Flag the customers collected in custom_list_table; isin() replaces a per-row
# scan of the Python list with one vectorised membership test.
has_many_rows = table["customer_id"].isin(custom_list_table)
table["cus_status"] = has_many_rows.map({True: "dangerous", False: 'not_dangerous'})
table.head()


In [0]:
# New variables built from arithmetic relations between "cash", "wire" and "cheque":
# the overall total, then the sum and the difference of every pair of channels.
table["total_amount"] = table["cash"] + table["wire"] + table["cheque"]
for first, second in (("wire", "cheque"), ("wire", "cash"), ("cheque", "cash")):
    table[f"{first}_{second}_sum"] = table[first] + table[second]
    table[f"{first}_{second}_minus"] = table[first] - table[second]
table.head()



In [0]:
# Bucket total_amount into three levels using quartile-based cut-offs:
#   level1: total >= Q3 + 1.5*IQR
#   level2: Q3 - IQR <= total < Q3 + 1.5*IQR
#   level3: total < Q3 - IQR
total_amounts = table["total_amount"]
third_quartile = total_amounts.quantile(0.75)
IQR_table = third_quartile - total_amounts.quantile(0.25)
upper_cutoff = third_quartile + 1.5*IQR_table
lower_cutoff = third_quartile - IQR_table

is_level1 = total_amounts >= upper_cutoff
is_level2 = (total_amounts < upper_cutoff) & (total_amounts >= lower_cutoff)
is_level3 = total_amounts < lower_cutoff

table.loc[is_level1, "total_amo_status"] = "level1"
table.loc[is_level2, "total_amo_status"] = "level2"
table.loc[is_level3, "total_amo_status"] = "level3"

table.head()

In [0]:
list_dangerous_id

In [0]:
# Combine cus_status with membership in list_dangerous_id into one rating:
#   dangerous     + in list     -> "high"
#   not_dangerous + in list     -> "mid"
#   dangerous     + not in list -> "low"
#   not_dangerous + not in list -> "not"
flagged_dangerous = table['cus_status'] == "dangerous"
flagged_not_dangerous = table['cus_status'] == "not_dangerous"
in_dangerous_list = table['customer_id'].isin(list_dangerous_id)

rating_rules = (
    (flagged_dangerous & in_dangerous_list, "high"),
    (flagged_not_dangerous & in_dangerous_list, "mid"),
    (flagged_dangerous & ~in_dangerous_list, "low"),
    (flagged_not_dangerous & ~in_dangerous_list, "not"),
)
for rule_mask, rating in rating_rules:
    table.loc[rule_mask, "cus_dange_rate"] = rating
table.cus_dange_rate.value_counts()

In [0]:
# Creating separate threshold values for cash, wire, cheque variables and create new variables using them

def variables_using_thresholds(dataframe, col, col_name):
    """Label each row "risk" or "not_risk" depending on ``col``, in place.

    The cut-off is Q3 + IQR of ``col``: values at or above it are written to
    ``col_name`` as "risk", values below it as "not_risk". Nothing is returned.
    """
    first_quartile = dataframe[col].quantile(0.25)
    third_quartile = dataframe[col].quantile(0.75)
    cutoff = third_quartile + (third_quartile - first_quartile)

    at_or_above = dataframe[col] >= cutoff
    dataframe.loc[at_or_above, col_name] = "risk"
    below = dataframe[col] < cutoff
    dataframe.loc[below, col_name] = "not_risk"


# Add a <channel>_status column for each transaction channel.
for channel in ("cash", "cheque", "wire"):
    variables_using_thresholds(table, channel, f"{channel}_status")

In [0]:
table.isnull().sum()

In [0]:
# Split the columns by dtype: object columns are categorical, everything else
# is numeric, except columns whose name contains "customer_id".
cat_cols, num_cols = [], []
for column_name in table.columns:
    if table[column_name].dtypes == "object":
        cat_cols.append(column_name)
    elif "customer_id" not in column_name:
        num_cols.append(column_name)

In [0]:
cat_cols

In [0]:
num_cols

# Visualizations

In [0]:
# One count plot per categorical column on a 2x4 grid.
fig, axs = plt.subplots(nrows=2,ncols=4,figsize=(12,10))
axs = axs.flatten()
for i, col in enumerate(cat_cols):
    sns.countplot(x=col,data=table,ax=axs[i])
    # plt.title() would only title the current (last created) axes, so set the
    # title on the axes that was just drawn.
    axs[i].set_title(col)
# Hide grid cells that received no plot.
for unused_ax in axs[len(cat_cols):]:
    unused_ax.set_visible(False)
fig.tight_layout()
plt.show()

In [0]:
# One box plot per numerical column on a 4x3 grid.
fig, axs = plt.subplots(nrows=4,ncols=3,figsize=(15,12))
axs = axs.flatten()
for i, col in enumerate(num_cols):
    sns.boxplot(y=col,data=table,ax=axs[i])
    # plt.title() would only title the current (last created) axes, so set the
    # title on the axes that was just drawn.
    axs[i].set_title(col)
# Hide grid cells that received no plot.
for unused_ax in axs[len(num_cols):]:
    unused_ax.set_visible(False)
fig.tight_layout()
plt.show()

In [0]:
sns.scatterplot(x="cheque", y="cash", hue="risk", data=table)
plt.show()

In [0]:
sns.scatterplot(x="wire", y="cash", hue='risk', data=table)
plt.show()

In [0]:
# Pairwise relationships between all numerical columns.
sns.pairplot(table[num_cols])
# plt.title() would title a single sub-axes of the grid; a figure-level
# suptitle (placed slightly above the grid) labels the whole plot.
plt.suptitle('All Num Cols', y=1.02)
plt.show()

In [0]:
# total_amount distribution per category level, split by risk, one panel per
# categorical column on a 2x4 grid.
fig, axs = plt.subplots(2, 4, figsize=(15,12))
axs = axs.flatten()
for panel_index, category in enumerate(cat_cols):
    panel = axs[panel_index]
    sns.boxplot(x=category, y='total_amount', hue='risk', data=table, ax=panel)
fig.tight_layout()
plt.show()

In [0]:
# Correlation matrix of the numeric columns only: the table still holds
# object (string) columns here, and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = table.select_dtypes(include="number").corr()
sns.heatmap(corr)

# Encoding

In [0]:
# All categorical variables except credit_debit are treated as hierarchical (ordinal),
# so they are label-encoded; credit_debit is removed from the list here.
# NOTE(review): LabelEncoder assigns codes in sorted label order (e.g. high < low < mid < not),
# which may not match the intended hierarchy — confirm.

cat_cols.remove("credit_debit")
cat_cols


In [0]:
def label_encoder(dataframe, cols):
    """Replace column ``cols`` of ``dataframe`` with integer label codes.

    The column is overwritten in place with the output of a fresh
    ``LabelEncoder``; the same dataframe object is returned.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[cols])
    dataframe[cols] = encoded_values
    return dataframe

for col in cat_cols:
    label_encoder(table,col)
table.head()

In [0]:
# One-hot encode credit_debit; drop_first removes the first level to avoid a redundant column.
# drop_first is a boolean flag: the string "True" only worked because any
# non-empty string is truthy.
table = pd.get_dummies(table, columns=["credit_debit"], drop_first=True)
table.head()

In [0]:
num_cols.remove("risk")
num_cols

In [0]:
# RobustScaler for numerical variables
# NOTE(review): the scaler is fitted on the whole table, i.e. before the
# train/test split performed in the model cells — confirm this is acceptable.

table[num_cols] = RobustScaler().fit_transform(table[num_cols])

table.head()

# Models

### XGBoost

In [0]:

# Features are every column except the target. The target is taken as a 1-D
# Series because the estimators expect y of shape (n_samples,), not a
# one-column DataFrame.
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)

xgb_model = XGBClassifier().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))


In [0]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.2f')
plt.show()

### Random Forest

In [0]:
# Features are every column except the target. The target is taken as a 1-D
# Series: a one-column DataFrame makes RandomForest emit a
# DataConversionWarning (column-vector y).
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)

# Fixed seed so the reported metrics are reproducible between runs.
rf_model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.2f')
plt.show()

### LightGBM

In [0]:
# Features are every column except the target. The target is taken as a 1-D
# Series because the estimators expect y of shape (n_samples,), not a
# one-column DataFrame.
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)

lgb_model = LGBMClassifier().fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
print(classification_report(y_test, y_pred))


In [0]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.2f')
plt.show()

### DecisionTree

In [0]:
# Features are every column except the target. The target is taken as a 1-D
# Series because the estimators expect y of shape (n_samples,), not a
# one-column DataFrame.
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)
# Fixed seed so the reported metrics are reproducible between runs.
dt_model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
print(classification_report(y_test, y_pred))


In [0]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.2f')
plt.show()

In [0]:
# RandomForest Model Tuning

# Features are every column except the target. The target is taken as a 1-D
# Series: a one-column DataFrame makes RandomForest emit a
# DataConversionWarning (column-vector y).
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)


# Fixed seed so the reported metrics are reproducible between runs.
rf_cv_model = RandomForestClassifier(max_depth=10, max_features="sqrt",
                                     min_samples_split=2, random_state=1).fit(X_train, y_train)
y_pred = rf_cv_model.predict(X_test)
print(classification_report(y_test, y_pred))

In [0]:
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='.2f')
plt.show()

In [0]:
# Validation Curve
def val_score_param(model, X, y, param_name, param_range, scoring='roc_auc', cv=5):
    """Plot mean training and validation scores of ``model`` over a parameter range.

    Args:
        model: estimator passed to ``validation_curve``.
        X: feature matrix.
        y: target; a one-column frame is flattened to 1-D.
        param_name: name of the estimator parameter that is varied.
        param_range: values tried for ``param_name``.
        scoring: scoring passed to ``validation_curve``.
        cv: cross-validation setting passed to ``validation_curve``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Estimators expect a 1-D target; flatten a column-vector y.
    y = np.ravel(y)
    train_score, test_score = validation_curve(model, X, y, param_name=param_name,
                                               param_range=param_range,
                                               scoring=scoring, cv=cv)
    # Average over the cross-validation folds (axis 1).
    mean_train_score = np.mean(train_score, axis=1)
    mean_test_score = np.mean(test_score, axis=1)

    plt.plot(param_range, mean_train_score, label='Training Score', color='b')
    plt.plot(param_range, mean_test_score, label='Validation Score', color='r')

    plt.title(f'Validation Curve For {type(model).__name__}')
    # Label the axis with the varied parameter, not with the repr of the range object.
    plt.xlabel(f'{param_name}')
    plt.ylabel(f'{scoring}')
    # Add the legend before tight_layout so it is taken into account by the layout.
    plt.legend(loc='best')
    plt.tight_layout()
    plt.show()
val_score_param(rf_cv_model,X_train,y_train,param_name='max_depth',param_range=range(1,11), scoring='recall', cv=5)

In [0]:
def plot_importance(model, features, num=None, save=False):
    """Plot the feature importances of a fitted model as a horizontal bar chart.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the features, in training order.
        num: number of top features to show; ``None`` (default) shows all.
            The default no longer depends on a global ``X`` being defined when
            the function is created.
        save: when true, write the figure to ``importance.png``.

    Returns:
        None.
    """
    feature_imp = pd.DataFrame({'Value':model.feature_importances_, 'Features':features.columns})
    ranked = feature_imp.sort_values(by='Value',ascending=False)
    if num is not None:
        ranked = ranked[0:num]
    plt.figure(figsize=(6,4))
    sns.set(font_scale=1)
    sns.barplot(x='Value',y='Features',data=ranked)
    plt.title('Features')
    plt.tight_layout()
    # Save before show(): once the figure has been shown, savefig would write
    # an empty canvas.
    if save:
        plt.savefig('importance.png')
    plt.show()
plot_importance(rf_cv_model,X)

In [0]:
# ROC Graphs

def plot_roc_auc_curve(model):
    """Plot the ROC curve of ``model`` on the global ``X_test`` / ``y_test`` split.

    Args:
        model: fitted classifier exposing ``predict_proba``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Probability of the positive class (second column of predict_proba).
    positive_scores = model.predict_proba(X_test)[:,1]
    # Compute the AUC from the same scores as the curve; hard 0/1 predictions
    # would describe a single operating point rather than the plotted curve.
    roc_auc = roc_auc_score(y_test, positive_scores)
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
    plt.figure()
    plt.plot(fpr, tpr, label = "AUC (area = %0.2f)" % roc_auc)
    plt.plot([0,1], [0, 1], 'r--')  # chance line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("ROC Curve")
    # Without a legend the AUC label is never displayed.
    plt.legend(loc="lower right")
    plt.show()
plot_roc_auc_curve(rf_cv_model)

In [0]:
# LightGBM Model Tuning

# Features are every column except the target. The target is taken as a 1-D
# Series because the estimators expect y of shape (n_samples,), not a
# one-column DataFrame.
# NOTE(review): X still contains customer_id, an identifier — confirm it is meant to be a feature.
X = table.drop(columns="risk")
y = table["risk"]

X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.20, random_state=1)
# NOTE(review): in LightGBM, subsample presumably only takes effect together
# with subsample_freq > 0 — confirm against the LightGBM parameter docs.
lgb_cv_model = LGBMClassifier(learning_rate=0.01, max_depth=10, subsample=0.6,
                              n_estimators=1000, min_child_samples=10).fit(X_train, y_train)
y_pred = lgb_cv_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Model Visualization with Shap(LightGBM)

In [0]:
import shap
shap.initjs()

In [0]:
# Standard Shap Values


shap_values = shap.TreeExplainer(lgb_cv_model).shap_values(X_test)


In [0]:
# Summary for model's variables

shap.summary_plot(shap_values, X_test, plot_type="bar")

In [0]:
shap.dependence_plot("cus_status", shap_values[1], X_test)

In [0]:
shap.dependence_plot("total_amount", shap_values[1], X_test)

In [0]:
shap.dependence_plot("cheque", shap_values[1], X_test)

In [0]:
from shap import Explainer, Explanation

explainer = Explainer(lgb_cv_model)
shap_values = explainer(X_test)
np.shape(shap_values.values)

In [0]:
# Features importance with Waterfall Plot
# Builds an Explanation from the slice at index 0 of the last axis of
# shap_values (and the matching base values), then draws the waterfall for
# the row at position idx of X_test.
# NOTE(review): this cell uses index 0 on the last axis while the beeswarm
# cell uses index 1 — confirm which class is meant to be explained.

exp = Explanation(shap_values[:,:,0], shap_values.base_values[:,0], X_test, feature_names=X_test.columns)
idx = 1
shap.plots.waterfall(exp[idx])

In [0]:
# Force Plot

shap.plots.force(exp[idx])

In [0]:
# Beeswarm Plot

shap.plots.beeswarm(shap_values[:,:,1])

In [0]:
# Absolute Mean Shap
# Pass the whole Explanation: the bar plot then aggregates mean(|SHAP|) per
# feature across all rows. exp[idx] would only draw the contributions of one
# single row.
shap.plots.bar(exp)

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#5642C5;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="padding: 10px;
              color:white;">
Thank you for your support and comments.✌️✌🏿
    <br>
You can follow me and my teammates: <br>
Yunus Emre TURKOGLU: <a href="https://www.linkedin.com/in/yunus-emre-turkoglu/" target="_blank"><font color=white>Visit the Linkedin Profile</font></a> <br>
Veysel ALEVCAN: <a href="https://www.linkedin.com/in/veyselalevcan/" target="_blank"><font color=white>Visit the Linkedin Profile</font></a><br>
Burak DERVISOGLU: <a href="https://www.linkedin.com/in/burak-dervi%C5%9Fo%C4%9Flu-6363aa18b/" target="_blank"><font color=white>Visit the Linkedin Profile</font></a>    <br>
Canan TAMTURK KARA: <a href="https://www.linkedin.com/in/canantamturkkara/" target="_blank"><font color=white>Visit the Linkedin Profile</font></a><br>
Sercan AKI:  <a href="https://www.linkedin.com/in/sercan-aki/" target="_blank"><font color=white>Visit the Linkedin Profile</font></a>
    
</p>
</div>
