<a href="https://colab.research.google.com/github/henryjhu/Anomaly-Detection-in-Wire-Activities/blob/main/DSC_680_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DSC-680-Z1 Research Practicum** <BR> Machine Learning

## **Project Description**
The research practicum involves on-site experiential learning in a research setting. This setting may be in the private or public sector and may include educational, governmental, non-governmental, or general
research organizations. The experience must provide students the opportunity to collect and analyze data, consider ethical implications of research, and draw empirically grounded conclusions.

<b>Purpose:</b><br>
Carry out both unsupervised and supervised machine learning on the sample data.<br>
<b>University Name:</b> Utica College <br>
<b>Course Name:</b> DSC-680-Z1 Research Practicum <br>
<b>Student Name:</b> Henry J. Hu <br>
<b>Program Director Name:</b> Dr. McCarthy, Michael <br>
<b>Runtime Environment:</b> Google Colab<br>
<b>Programming Language:</b> Python <br>
<b>Sample Data Frame:</b>
A random sample of international wires belonging to 139 customers from 3 continents for the entire year of 2020.<br>
<b> Last Update:</b> July 21st, 2021

## **Mounting Google Drive**

In [None]:
# Attach Google Drive to the Colab runtime under /content/gdrive/.
# force_remount=True is passed so that re-running this cell remounts the drive.
from google.colab import drive

drive.mount(
    '/content/gdrive/',
    force_remount=True,
)

## **Importing Libraries**

In [None]:
# Importing libraries
import io
import pandas as pd
import numpy as np
from numpy import quantile, where, random
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn import model_selection, preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest, VotingClassifier, StackingClassifier
from sklearn.datasets import make_blobs
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score, KFold
import traceback
import time
from datetime import datetime
import pytz

## **Importing Data Into Google Colab**

In [None]:
# Importing data and looking at head.
# The path is absolute so that it matches the '/content/gdrive/' mount point and
# the read does not depend on the notebook's current working directory.
input_data = pd.read_csv("/content/gdrive/MyDrive/sample_df_4M.txt")
input_data.head()

## **Data Segregation**

In [None]:
# Keep only the rows whose CONTINENT_CODE is 'NN' and whose SWIFT_MSG_TYPE is 103.
NN_103_df = input_data.loc[
    input_data['CONTINENT_CODE'].eq('NN') & input_data['SWIFT_MSG_TYPE'].eq(103)
]
NN_103_df.head()  # not the last expression of the cell, so nothing is displayed
NN_103_df.shape  # cell output: (rows, columns) of the subset


In [None]:
# Keep only the rows whose CONTINENT_CODE is 'NN' and whose SWIFT_MSG_TYPE is 202.
NN_202_df = input_data.loc[
    input_data['CONTINENT_CODE'].eq('NN') & input_data['SWIFT_MSG_TYPE'].eq(202)
]
NN_202_df.head()  # not the last expression of the cell, so nothing is displayed
NN_202_df.shape  # cell output: (rows, columns) of the subset

In [None]:
# Keep only the rows whose CONTINENT_CODE is 'EU' and whose SWIFT_MSG_TYPE is 103.
EU_103_df = input_data.loc[
    input_data['CONTINENT_CODE'].eq('EU') & input_data['SWIFT_MSG_TYPE'].eq(103)
]
EU_103_df.head()  # not the last expression of the cell, so nothing is displayed
EU_103_df.shape  # cell output: (rows, columns) of the subset

In [None]:
# Keep only the rows whose CONTINENT_CODE is 'EU' and whose SWIFT_MSG_TYPE is 202.
EU_202_df = input_data.loc[
    input_data['CONTINENT_CODE'].eq('EU') & input_data['SWIFT_MSG_TYPE'].eq(202)
]
EU_202_df.head()  # not the last expression of the cell, so nothing is displayed
EU_202_df.shape  # cell output: (rows, columns) of the subset

In [None]:
# Keep only the rows whose CONTINENT_CODE is 'AS' and whose SWIFT_MSG_TYPE is 103.
# The continent filter must not be 'EU': that would make this frame an exact
# duplicate of EU_103_df.
# NOTE(review): the code 'AS' is inferred from the variable name -- confirm it
# matches the values present in CONTINENT_CODE.
AS_103_df = input_data[(input_data['CONTINENT_CODE']=='AS') & (input_data['SWIFT_MSG_TYPE']==103)]
AS_103_df.head()
AS_103_df.shape

In [None]:
# Keep only the rows whose CONTINENT_CODE is 'AS' and whose SWIFT_MSG_TYPE is 202.
# The continent filter must not be 'EU': that would make this frame an exact
# duplicate of EU_202_df.
# NOTE(review): the code 'AS' is inferred from the variable name -- confirm it
# matches the values present in CONTINENT_CODE.
AS_202_df = input_data[(input_data['CONTINENT_CODE']=='AS') & (input_data['SWIFT_MSG_TYPE']==202)]
AS_202_df.head()
AS_202_df.shape

## **Unsupervised Ensemble Learner**

In [None]:
##################################################################################################################
#
# Purpose: Function to calculate outlier scores for a given input data set.
# Machine learning method: Ensemble learner of Local Outlier Factor and Isolation Forest.
# Score to fraud label rule: An averaged score smaller than score_threshold (default -50) is fraud and
#                            a score greater than or equal to score_threshold is not fraud.
#
##################################################################################################################

def ensemble_fun(df, n_neighbors_n=20, leaf_size=30, pairwise_n=2, n_estimators_n=100,
                 random_state_n=42, contamination_n=0.05, score_threshold=-50):
    """Label each row of *df* as fraud (1.0) or non-fraud (0.0) with a LOF + Isolation Forest ensemble.

    The two features TRXN_MONTH and TRANSACTION_AMOUNT are standardized, each
    estimator is fitted on them, and the per-row scores of the estimators that
    fitted successfully are averaged.  Rows whose averaged score is below
    ``score_threshold`` are labeled as fraud.

    Args:
        df: Data frame containing at least the columns TRXN_MONTH and
            TRANSACTION_AMOUNT.  It is not modified.
        n_neighbors_n: ``n_neighbors`` of the Local Outlier Factor.
        leaf_size: ``leaf_size`` of the Local Outlier Factor.
        pairwise_n: Minkowski ``p`` of the Local Outlier Factor
            (2 for Euclidean distance, 1 for Manhattan distance).
        n_estimators_n: ``n_estimators`` of the Isolation Forest.
        random_state_n: ``random_state`` of the Isolation Forest.
        contamination_n: ``contamination`` passed to both estimators.
        score_threshold: Averaged scores strictly below this value are
            labeled 1.0 (fraud); all other scores are labeled 0.0.

    Returns:
        A new data frame with the columns of *df* (original dtypes kept, index
        reset to 0..n-1) plus a float FRAUD_LABEL column.

    Raises:
        ValueError: If *df* has no rows.
        KeyError: If TRXN_MONTH or TRANSACTION_AMOUNT is missing from *df*.
        RuntimeError: If none of the estimators could be fitted.
    """
    if df.empty:
        raise ValueError('ensemble_fun requires a data frame with at least one row')

    # Include only the relevant independent variables
    features = df[['TRXN_MONTH', 'TRANSACTION_AMOUNT']]

    def log_info(text):
        # Timestamped progress message in America/New_York time
        stamp = datetime.now(pytz.timezone('America/New_York')).replace(microsecond=0)
        print(f'{stamp} : {text}')

    # Center the data around the mean of 0 with unit variance.  The transformed
    # array is what the estimators are fitted on; otherwise the distance-based
    # LOF would be dominated by whichever feature has the larger numeric range.
    from sklearn.preprocessing import StandardScaler
    X = StandardScaler().fit_transform(features)

    # Input data frame size
    n_rows_in, n_features_in = X.shape

    # Estimators taking part in the ensemble, keyed by display name
    estimator_list = {
        # novelty=False because this is outlier detection.
        # pairwise_n = 2 for Euclidean distance and 1 for Manhattan distance.
        'LOF': LocalOutlierFactor(novelty=False, n_neighbors=n_neighbors_n, algorithm='auto',
                                  leaf_size=leaf_size, metric='minkowski', p=pairwise_n,
                                  metric_params=None, contamination=contamination_n),
        'iForest': IsolationForest(n_estimators=n_estimators_n, random_state=random_state_n,
                                   max_samples=n_rows_in, contamination=contamination_n),
    }

    log_info(f'Input data frame size: Rows = {n_rows_in}, Columns = {n_features_in}')

    # Collect one score vector per estimator that fitted successfully.  A failing
    # estimator is logged and left out, so the remaining one still yields a result.
    score_columns = []
    for clf_name, clf in estimator_list.items():
        log_info(f'Fitting {clf_name}')
        try:
            clf.fit(X)
            if clf_name == 'LOF':
                # With novelty=False the scores of the training rows are exposed here
                scores = clf.negative_outlier_factor_
            else:
                scores = clf.decision_function(X)
        except Exception:
            log_info(f'{clf_name} could not be fitted:\n{traceback.format_exc()}')
        else:
            # Replace NaN with 0's (and infinities with large finite numbers)
            score_columns.append(np.nan_to_num(scores))
            log_info(f'{clf_name} is fitted successfully with {len(scores)} scores')

    if not score_columns:
        raise RuntimeError('No estimator could be fitted; there are no scores to average')

    # Averaging scores across the fitted algorithms, one value per input row
    score_by_avg = np.mean(np.column_stack(score_columns), axis=1)

    log_info(f'Minimum Score = {score_by_avg.min()}, Maximum Score = {score_by_avg.max()}')

    # Labeling all scores < score_threshold as fraud (1.0) and the rest as non-fraud (0.0)
    pred_y = (score_by_avg < score_threshold).astype(float)

    fraud_ct = np.count_nonzero(pred_y)
    fraud_pct = fraud_ct / n_rows_in

    log_info(f'Percentage of suspicious transactions: {fraud_pct:.2%}')

    # Append the labels to a copy of the input; column names and dtypes are taken
    # from the input itself instead of a hard-coded list.
    df_f = df.reset_index(drop=True).assign(FRAUD_LABEL=pred_y)

    # Output data frame size
    n_rows_o, n_features_o = df_f.shape
    log_info(f'Output data frame size: Rows = {n_rows_o}, Columns = {n_features_o}')

    return df_f

In [None]:
# Label the NN_103_df subset with the unsupervised ensemble and display the result.
NN_103_score_df = ensemble_fun(
    NN_103_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
NN_103_score_df

In [None]:
# Label the NN_202_df subset with the unsupervised ensemble and display the result.
NN_202_score_df = ensemble_fun(
    NN_202_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
NN_202_score_df

In [None]:
# Label the EU_103_df subset with the unsupervised ensemble and display the result.
EU_103_score_df = ensemble_fun(
    EU_103_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
EU_103_score_df

In [None]:
# Label the EU_202_df subset with the unsupervised ensemble and display the result.
EU_202_score_df = ensemble_fun(
    EU_202_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
EU_202_score_df

In [None]:
# Label the AS_103_df subset with the unsupervised ensemble and display the result.
AS_103_score_df = ensemble_fun(
    AS_103_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
AS_103_score_df

In [None]:
# Label the AS_202_df subset with the unsupervised ensemble and display the result.
AS_202_score_df = ensemble_fun(
    AS_202_df,
    n_neighbors_n=20,
    leaf_size=30,
    pairwise_n=2,
    n_estimators_n=100,
    random_state_n=42,
    contamination_n=0.01,
)
AS_202_score_df