In [87]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType, DateType
import datetime, time, calendar

print(sc)

<pyspark.context.SparkContext object at 0x102a1b470>


In [88]:
spark = (SparkSession
         .builder
         .appName("Catch you merchant")
         .config("spark.sql.warehouse.dir", "/opt/jupyter_workspace/spark-warehouse")
         .getOrCreate())
print(spark)

<pyspark.sql.session.SparkSession object at 0x105135e48>


In [136]:
workspace = "/Users/AUM/Desktop/MerchantInsight/mock_data/"

df = (spark
     .read
     .option("header", "true")
     .option("inferSchema", "true")
     .csv(workspace + "deposit_mock.csv"))

In [137]:
df = df.dropna()

df.show()

+-----+-----------+-------+---------+---------+------+--------+------+------+--------+
|ar_id|fm_to_ar_id|txn_amt|svc_br_no|opm_tp_cd|txn_cd|ptn_yyyy|ptn_mm|ptn_dd|  txn_tm|
+-----+-----------+-------+---------+---------+------+--------+------+------+--------+
|  11a|        13c|    100|      900|       CR|     0|    2016|     8|     3| 9:58:18|
|  11a|        13c|1000000|      900|       DR|     0|    2016|     8|     5|10:58:18|
|  11a|        13c|  10000|      900|       DR|     0|    2016|     9|    10|11:58:18|
|  11a|        13c|  10000|      900|       DR|     0|    2016|     9|    10|11:58:18|
|  11a|        14d| 200000|      901|       DR|     0|    2016|    10|     3|12:58:18|
|  12b|        15c|  30000|      902|       DR|     0|    2016|    12|     3|13:58:18|
|  12b|        16d| 200000|      903|       CR|     0|    2016|     8|    30|14:58:18|
|  12b|        16d|   5000|      904|       CR|     0|    2016|     8|    13|15:58:18|
|  13c|        11b|     10|      904|      

In [138]:
# define useful variable

number_of_months = 10
transfer_code = 0
deposit_code = 1
withdraw_code = 2

In [148]:
# Re-organize the original data set

def no_days_in_month(month, year):
    if month in day_months_31: 
        return 31
    elif month in day_months_30:
        return 30
    else:
        if calendar.isleap(year):
            return 29
        else:
            return 28
        
def day_of_week_code(day_of_week):
    if day_of_week < 4:
        return 0
    elif day_of_week > 4:
        return 2
    else:
        return 1
    
def quarter_code(date, month):
    month_31_days = [1,3,5,7,8,10,12]
    month_30_days = [4,6,9,11]
    if(month in month_31_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,24)):
            return 3
        else:
            return 4
    elif(month in month_30_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,23)):
            return 3
        else:
            return 4
    else: # February
        return date / 4 

def period_code(time):
    hour = int(time[:-6])
    if hour in range(0, 6):
        return 0
    elif hour in range(6, 12):
        return 1
    elif hour in range(12, 18):
        return 2
    else:
        return 3

date = udf(lambda y, m, d : datetime.datetime(y, m ,d), DateType())
day_of_week = udf(lambda date : int(date.weekday()), IntegerType())
day_of_week_code_udf = udf(day_of_week_code, IntegerType())
quarter_code_udf = udf(quarter_code, IntegerType())
period_code_udf = udf(period_code, IntegerType())

df = df.withColumn("date", date(df["ptn_yyyy"], df["ptn_mm"], df["ptn_dd"]))
df = df.withColumn("day_of_week", day_of_week(df["date"]))
df = df.withColumn("day_of_week_code", day_of_week_code_udf(df["day_of_week"]))
df = df.withColumn("quarter_code", quarter_code_udf(df["ptn_dd"], df["ptn_mm"]))
df = df.withColumn("period_code", period_code_udf(df["txn_tm"]))


# df = df.select("ar_id", "fm_to_ar_id", "txn_amt", "svc_br_no", "opm_tp_cd", "txn_cd", "day_of_week_code")
df.show()

+-----+-----------+-------+---------+---------+------+--------+------+------+--------+----------+-----------+-----------+-------------+----------------+------------+------------+-----------+
|ar_id|fm_to_ar_id|txn_amt|svc_br_no|opm_tp_cd|txn_cd|ptn_yyyy|ptn_mm|ptn_dd|  txn_tm|      date|day_of_week|hour_of_day|days_in_month|day_of_week_code|day_in_month|quarter_code|period_code|
+-----+-----------+-------+---------+---------+------+--------+------+------+--------+----------+-----------+-----------+-------------+----------------+------------+------------+-----------+
|  11a|        13c|    100|      900|       CR|     0|    2016|     8|     3| 9:58:18|2016-08-03|          2|          9|           31|               0|          31|           1|          1|
|  11a|        13c|1000000|      900|       DR|     0|    2016|     8|     5|10:58:18|2016-08-05|          4|         10|           31|               1|          31|           1|          1|
|  11a|        13c|  10000|      900|       D

In [144]:
df.select("day_of_week").show()

+-----------+
|day_of_week|
+-----------+
|          2|
|          4|
|          5|
|          5|
|          0|
|          5|
|          1|
|          5|
|          2|
+-----------+



In [93]:
data = df.select("ar_id").distinct().union(df.select("fm_to_ar_id").distinct()).dropna()
data.show()

+-----+
|ar_id|
+-----+
|  13c|
|  11a|
|  12b|
|  16d|
|  15c|
|  14d|
|  13c|
|  11b|
+-----+



In [151]:
# define feature extraction functions

def noDepositBranchVisit(is_unique):
    if (is_unique):
        return df.select("ar_id", "svc_br_no").groupby("ar_id").agg({"svc_br_no" : "count"}).withColumnRenamed("count(svc_br_no)", "noDepositBranchVisit")
    else:
        return df.select("ar_id", "svc_br_no").distinct().groupby("ar_id").agg({"svc_br_no" : "count"}).withColumnRenamed("count(svc_br_no)", "noDepositBranchVisitUnique")
    
def noDepositTransferIn(is_unique):
    if (is_unique):
        return df.filter("opm_tp_cd = 'CR' and txn_cd = 0").groupby(["ar_id", "fm_to_ar_id"]).agg({"*" : "count"}).withColumnRenamed("count(1)", "noDepositTransferInUnique")
    else:
        return df.filter("opm_tp_cd = 'CR' and txn_cd = 0").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "noDepositTransferIn")
    
def noDepositTransferOut(is_unique):
    if (is_unique):
        return df.filter("opm_tp_cd = 'DR' and txn_cd = 0").groupby(["ar_id", "fm_to_ar_id"]).agg({"*" : "count"}).withColumnRenamed("count(1)", "noDepositTransferOutUniqie")
    else:
        return df.filter("opm_tp_cd = 'DR' and txn_cd = 0").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "noDepositTransferOut")

def noDeposit(): 
    return df.filter("opm_tp_cd = 'CR' and txn_cd = 1").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "noDeposit")

def noWithdraw(): 
    return df.filter("opm_tp_cd = 'DR' and txn_cd = 2").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "noWithdraw")

def depositAmount(): 
    return df.filter("opm_tp_cd = 'CR' and txn_cd = 1").groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "depositAmount")

def withdrawAmount(): 
    return df.filter("opm_tp_cd = 'DR' and txn_cd = 2").groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "withdrawAmount")

def transferInAmount():
    return df.filter("opm_tp_cd = 'CR' and txn_cd = 0").groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "transferInAmount")
    
def transferOutAmount():
    return df.filter("opm_tp_cd = 'DR' and txn_cd = 0").groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "transferOutAmount")

In [154]:
# define udf functions for all about ratio
ratioDepositBranchVisit = udf(lambda visit, unique_visit : visit / unique_visit)
ratioTransferIn = udf(lambda transfer_in, transfer_in_unique : transfer_in / transfer_in_unique)
ratioTransferOut = udf(lambda transfer_out, transfer_out_unique : transfer_out / transfer_out_unique)

In [153]:
# noDepositBranchVisit(is_unique=True).show()

# noDepositTransferIn(is_unique=True).show()

# noDepositTransferOut(is_unique=True).show()

transferOutAmount().show()

+-----+-----------------+
|ar_id|transferOutAmount|
+-----+-----------------+
|  11a|          1220000|
|  12b|            30000|
+-----+-----------------+

