# Revisiting the FinCEN Files

The FinCEN Files, published by ICIJ, contain transactions that financial institutions flagged as suspicious to United States authorities.

More Information: [View](https://www.icij.org/investigations/fincen-files/explore-the-fincen-files-data/)<br>
Data Source: 
- [FinCEN](https://media.icij.org/uploads/2020/09/download_data_fincen_files.zip)<br>


In [None]:
import typing

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
from rapidfuzz import process, utils
from rich.console import Console
from rich.table import Table

### Preprocessing

In [None]:
# Load the transaction map and prepare it for analysis, in this order:
#   1. missing transaction counts default to 1,
#   2. rows that still contain any missing value are discarded (this runs
#      before the identifier columns are dropped, so gaps in those columns
#      also remove the row),
#   3. both date columns are parsed to datetimes,
#   4. identifier columns are dropped,
#   5. rows are ordered by end date.
tm = (
    pd.read_csv("download_transactions_map.csv")
    .fillna({"number_transactions": 1})
    .dropna()
    .assign(
        begin_date=lambda frame: pd.to_datetime(frame["begin_date"]),
        end_date=lambda frame: pd.to_datetime(frame["end_date"]),
    )
    .drop(
        columns=[
            "id",
            "icij_sar_id",
            "filer_org_name_id",
            "originator_bank_id",
            "beneficiary_bank_id",
        ],
    )
    .sort_values(by="end_date")
)


def compute_zscore(arr: pd.Series) -> np.ndarray:
    """Return the z-score of every value in ``arr``.

    The z-score is ``(value - mean) / standard_deviation`` where the standard
    deviation is the sample standard deviation (divisor ``len(arr) - 1``).

    Args:
        arr: One-dimensional numeric data (a Series, list or array).

    Returns:
        A float array with one z-score per input value, in input order. The
        scores are NaN when the standard deviation is zero or undefined
        (constant input or a single value).
    """
    values = np.asarray(arr, dtype=float)
    # ddof=1 keeps the (n - 1) divisor of the sample standard deviation.
    standard_deviation = values.std(ddof=1)
    return (values - values.mean()) / standard_deviation


# Score every transaction amount, keep only rows whose z-score is below 3
# (i.e. drop unusually large amounts), and renumber the remaining rows.
tm["amount_transactions_zscores"] = compute_zscore(tm["amount_transactions"])
tm = tm.loc[tm["amount_transactions_zscores"].lt(3)].reset_index(drop=True)

# Trim surrounding whitespace from both bank-name columns.
tm["originator_bank"] = tm["originator_bank"].str.strip()
tm["beneficiary_bank"] = tm["beneficiary_bank"].str.strip()

# Consolidate near-duplicate bank names under one canonical spelling.
#
# Every distinct name (originator or beneficiary) is normalised with
# rapidfuzz's default processor and compared against the canonical names
# accepted so far. A name scoring at least 90 against one of them is mapped to
# that canonical name; otherwise the name becomes a new canonical entry.
# Matching only against already-accepted names guarantees that each group of
# similar names collapses onto a single representative. Matching every name
# against all other names instead maps two mutually similar names onto each
# other, which merely swaps them in the data.
bank_dict = dict()
org_list = (
    tm["originator_bank"].unique().tolist() + tm["beneficiary_bank"].unique().tolist()
)
org_list = np.unique(np.array(org_list))
matches = list()  # not used by the code visible here; kept so the name stays defined
processed_orgs = [utils.default_process(org) for org in org_list]

canonical_orgs = []  # original spellings chosen as representatives
canonical_processed = []  # their normalised forms, index-aligned with canonical_orgs
for org, processed_query in zip(org_list, processed_orgs):
    match = None
    if canonical_processed:
        match = process.extractOne(
            processed_query, canonical_processed, processor=None, score_cutoff=90
        )
    if match:
        # match is (choice, score, index); the index points into canonical_orgs.
        bank_dict[org] = canonical_orgs[match[2]]
    else:
        bank_dict[org] = org
        canonical_orgs.append(org)
        canonical_processed.append(processed_query)

bank_list = list(bank_dict.values())
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: the chained in-place form does not update `tm` when pandas
# Copy-on-Write is active.
tm["originator_bank"] = tm["originator_bank"].replace(bank_dict)
tm["beneficiary_bank"] = tm["beneficiary_bank"].replace(bank_dict)

### Helper

In [None]:
def plot_network(
    G: nx.Graph,
    k: float = 1,
    iterations: int = 100,
    title: typing.Optional[str] = None,
) -> None:
    """Draw ``G`` on a new 30x30 inch figure using a spring layout.

    Nodes are labelled and sized proportionally to their degree
    (``degree * 100``).

    Args:
        G: Graph to draw.
        k: Forwarded to ``nx.spring_layout`` as ``k``.
        iterations: Forwarded to ``nx.spring_layout`` as ``iterations``.
        title: Optional figure title; nothing is written when ``None``.
    """
    _, ax = plt.subplots(figsize=(30, 30), dpi=200)
    if title is not None:
        ax.set_title(f"{title}", fontsize=25)
    nx.draw(
        G,
        ax=ax,  # draw on the axes created above rather than the implicit current axes
        with_labels=True,
        node_size=[degree * 100 for _, degree in G.degree],
        pos=nx.spring_layout(G, k=k, iterations=iterations),
        arrows=True,
    )

In [None]:
def printmd(
    string: str, color: typing.Optional[str] = None, fontsize: int = 14
) -> None:
    """Display ``string`` as Markdown wrapped in a styled HTML ``<span>``.

    Args:
        string: Markdown/HTML text to show.
        color: CSS colour for the text. When ``None`` no colour is set, so the
            front end's default text colour applies.
        fontsize: Font size in pixels.
    """
    style = f"font-size:{fontsize}px"
    if color is not None:
        # Only emit a colour declaration when one was requested; formatting
        # None into the style would produce the invalid CSS value "None".
        style = f"color:{color}; {style}"
    display(Markdown(f"<span style='{style}'>{string}</span>"))

In [None]:
# Module-level rich Console instance; none of the cells visible in this
# notebook reference it.
console = Console()


def df_to_table(
    pandas_dataframe: pd.DataFrame,
    rich_table: Table,
    show_index: bool = True,
    index_name: typing.Optional[str] = None,
) -> Table:
    """Copy the contents of a DataFrame into a rich ``Table``.

    One table column is added per DataFrame column, in order: numeric columns
    are right-justified, all others left-justified. Every DataFrame column
    gets a header, so headers always line up with the row cells.

    Args:
        pandas_dataframe: Data to render; every cell is converted with ``str``.
        rich_table: Table that receives the columns and rows (mutated in place).
        show_index: When true, prepend a column holding each row's position
            (0, 1, 2, ...), not the DataFrame's index labels.
        index_name: Header for the position column; empty when not given.

    Returns:
        The same ``rich_table`` object, populated.
    """
    if show_index:
        rich_table.add_column(str(index_name) if index_name else "")

    # Iterating dtypes.items() also works when column labels are duplicated.
    for column, dtype in pandas_dataframe.dtypes.items():
        justify = "right" if pd.api.types.is_numeric_dtype(dtype) else "left"
        rich_table.add_column(str(column), justify=justify)

    for position, value_list in enumerate(pandas_dataframe.values.tolist()):
        row = [str(position)] if show_index else []
        row.extend(str(value) for value in value_list)
        rich_table.add_row(*row)

    return rich_table

### All banks, grouped, ignoring the time series

In [None]:
# Total the transaction counts and amounts for every distinct combination of
# the columns at positions 3-8 of `tm` (selected by position; presumably the
# originator/beneficiary bank, country and ISO columns - verify against the
# CSV layout), then keep only groups whose summed amount exceeds 500,000.
grouped_bank_tm = (
    tm.groupby(list(tm.columns[3:9]), as_index=False)[
        ["number_transactions", "amount_transactions"]
    ]
    .sum()
    .loc[lambda totals: totals["amount_transactions"] > 500000]
    .reset_index(drop=True)
)

In [None]:
# Build an undirected graph with one node per bank and one edge per
# originator/beneficiary pair; the remaining columns of each row are stored as
# edge attributes. (from_pandas_edgelist creates the graph itself, so no empty
# graph needs to be constructed beforehand.)
all_G = nx.from_pandas_edgelist(
    grouped_bank_tm, source="originator_bank", target="beneficiary_bank", edge_attr=True
)
plot_network(all_G)

In [None]:
# Degree analysis on all records
degree_sequence = sorted((d for n, d in all_G.degree()), reverse=True)

fig, ax = plt.subplots(1, 2, figsize=(9, 3), dpi=100)
ax[0].plot(degree_sequence, "r-", marker="o")
ax[0].set_title("Degree Rank")
ax[0].set_ylabel("Degree")
ax[0].set_xlabel("Rank")
ax[1].bar(*np.unique(degree_sequence, return_counts=True))
ax[1].set_title("Degree Histogram")
ax[1].set_xlabel("Degree")
ax[1].set_ylabel("# of Nodes")
plt.tight_layout()
plt.show()

### Weighting banks (nodes) by centrality

In [None]:
# Score every bank by degree centrality and by betweenness centrality, and
# store each result as a dict ordered from the highest score to the lowest.
degree_central = nx.degree_centrality(all_G)
degree_central = {
    bank: degree_central[bank]
    for bank in sorted(degree_central, key=degree_central.get, reverse=True)
}
between_central = nx.betweenness_centrality(all_G, normalized=True, endpoints=True)
between_central = {
    bank: between_central[bank]
    for bank in sorted(between_central, key=between_central.get, reverse=True)
}

# A bank counts as central when it is among the top `n_top` of both rankings.
n_top = 50
top_dcentral_bank_node = [*degree_central][:n_top]
top_bcentral_bank_node = [*between_central][:n_top]

central_bank_node = list(
    set(top_dcentral_bank_node).intersection(top_bcentral_bank_node)
)
ncentral_bank_node = list(set(bank_list) - set(central_bank_node))

# Plot a copy of the full graph with every non-central bank removed.
central_G = all_G.copy()
central_G.remove_nodes_from(ncentral_bank_node)
plot_network(central_G)

### Weighting edges by transaction amount

In [None]:
# Score every edge by betweenness centrality, using the transaction amount as
# the edge weight, and store the scores ordered from highest to lowest.
edge_between_central = nx.edge_betweenness_centrality(
    all_G, weight="amount_transactions"
)
edge_between_central = {
    edge: edge_between_central[edge]
    for edge in sorted(
        edge_between_central, key=edge_between_central.get, reverse=True
    )
}

# Keep the `n_top` highest-scoring edges and the banks at their endpoints.
n_top = 50
top_bcentral_bank_edge = [*edge_between_central][:n_top]
ncentral_bank_edge = list(set(all_G.edges()) - set(top_bcentral_bank_edge))
central_bank_node = [bank for edge in top_bcentral_bank_edge for bank in edge]
ncentral_bank_node = list(set(bank_list) - set(central_bank_node))

# Plot a copy of the full graph stripped of all other edges and banks.
edge_G = all_G.copy()
edge_G.remove_edges_from(ncentral_bank_edge)
edge_G.remove_nodes_from(ncentral_bank_node)
plot_network(edge_G, k=0.3, iterations=30)

### Timeseries transaction network, not grouped

In [None]:
# Plot one network per two-year window, from the earliest begin year to the
# latest end year. A transaction belongs to a window only when it both begins
# and ends inside it.
min_year = tm["begin_date"].min().year
max_year = tm["end_date"].max().year
year_list = np.arange(min_year, max_year + 1, 2)

for year in year_list:
    # Inclusive bounds: with strict comparisons, transactions dated exactly
    # January 1 of the first year or December 31 of the second year would not
    # fall into any window.
    subset_tm = tm[
        (tm["begin_date"] >= f"{year}-01-01") & (tm["end_date"] <= f"{year+1}-12-31")
    ]
    timeseries_G = nx.from_pandas_edgelist(
        subset_tm, source="originator_bank", target="beneficiary_bank", edge_attr=True
    )
    plot_network(timeseries_G, title=f"{year} to {year+1}")

### Overall view of each country

In [None]:
# For every country that appears as an originator or beneficiary country,
# print the total amount and number of transactions sent from it, followed by
# its sending banks and its receiving banks ranked by total amount.
for country in np.unique(
    tm[["originator_bank_country", "beneficiary_bank_country"]].values
):
    sent = tm[tm["originator_bank_country"] == country]
    received = tm[tm["beneficiary_bank_country"] == country]

    total_sent = sent["amount_transactions"].sum()
    n_transactions = sent["number_transactions"].sum()

    frequent_sending_bank = (
        sent.groupby("originator_bank")[["number_transactions", "amount_transactions"]]
        .sum()
        .sort_values(by="amount_transactions", ascending=False)
    )
    # NOTE(review): this table was previously built from `sent`, which left
    # `received` unused and listed the banks receiving money *from* this
    # country. It now lists the banks *in* this country that receive money,
    # mirroring the sending table - confirm this is the intended reading.
    frequent_receiving_bank = (
        received.groupby("beneficiary_bank")[
            ["number_transactions", "amount_transactions"]
        ]
        .sum()
        .sort_values(by="amount_transactions", ascending=False)
    )

    # Each DataFrame is rendered into its own fresh rich table.
    frequent_sending_bank_table = df_to_table(
        frequent_sending_bank.reset_index(),
        Table(show_header=True, header_style="bold black"),
        show_index=False,
    )
    frequent_receiving_bank_table = df_to_table(
        frequent_receiving_bank.reset_index(),
        Table(show_header=True, header_style="bold black"),
        show_index=False,
    )

    printmd(f"Country: ***{country}***", color="black", fontsize=20)
    printmd(f"Total $ Sent: <u>{total_sent}</u>")
    printmd(f"# of Transactions: <u>{n_transactions}</u>")
    print("\nFrequent Sending Bank:")
    display(frequent_sending_bank_table)
    print("\nFrequent Receiving Bank:")
    display(frequent_receiving_bank_table)
    print("\n")