In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, DateType
import datetime, math

print(sc)

<pyspark.context.SparkContext object at 0x10112a470>


In [2]:
spark = (SparkSession
         .builder
         .appName("Catch you merchant")
         .config("spark.sql.warehouse.dir", "/opt/jupyter_workspace/spark-warehouse")
         .getOrCreate())
print(spark)

<pyspark.sql.session.SparkSession object at 0x105104e48>


In [3]:
workspace = "/Users/AUM/Desktop/MerchantInsight/mock_data/"

df = (spark
     .read
     .option("header", "true")
     .option("inferSchema", "true")
     .csv(workspace + "deposit_mock.csv"))

In [4]:
df = df.dropna()

df.show()

+-----+-----------+-------+---------+---------+------+--------+------+------+--------+
|ar_id|fm_to_ar_id|txn_amt|svc_br_no|opm_tp_cd|txn_cd|ptn_yyyy|ptn_mm|ptn_dd|  txn_tm|
+-----+-----------+-------+---------+---------+------+--------+------+------+--------+
|    A|          B|     10|      900|       DR|     0|    2017|     1|     3| 5:00:00|
|    B|          A|     10|      900|       CR|     0|    2017|     1|     3| 5:00:00|
|    A|          B|    100|      900|       CR|     0|    2017|     1|     3|11:00:00|
|    B|          A|    100|      900|       DR|     0|    2017|     1|     3|11:00:00|
|    A|          B|     10|      900|       DR|     0|    2017|     1|     3|17:00:00|
|    B|          A|     10|      900|       CR|     0|    2017|     1|     3|17:00:00|
|    A|          B|    100|      900|       CR|     0|    2017|     1|     3|23:00:00|
|    B|          A|    100|      900|       DR|     0|    2017|     1|     3|23:00:00|
|    A|          B|     10|      900|      

In [5]:
# define useful variable

number_of_months = 10
transfer_code = 0
deposit_code = 2
withdraw_code = 1

In [6]:
# Re-organize the original dataset format

## Define function & udf that need to derive columns
def no_days_in_month(month, year):
    if month in day_months_31: 
        return 31
    elif month in day_months_30:
        return 30
    else:
        if calendar.isleap(year):
            return 29
        else:
            return 28
        
def day_of_week_code(day_of_week):
    if day_of_week < 4:
        return 1
    elif day_of_week > 4:
        return 3
    else:
        return 2
    
def quarter_code(date, month):
    month_31_days = [1,3,5,7,8,10,12]
    month_30_days = [4,6,9,11]
    if(month in month_31_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,24)):
            return 3
        else:
            return 4
    elif(month in month_30_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,23)):
            return 3
        else:
            return 4
    else: # February
        return math.ceil(date / 4.0) 

def period_code(time):
    hour = int(time[:-6])
    if hour in range(0, 6):
        return 1
    elif hour in range(6, 12):
        return 2
    elif hour in range(12, 18):
        return 3
    else:
        return 4

date = udf(lambda y, m, d : datetime.datetime(y, m ,d), DateType())
day_of_week = udf(lambda date : int(date.weekday()), IntegerType())
day_of_week_code_udf = udf(day_of_week_code, IntegerType())
quarter_code_udf = udf(quarter_code, IntegerType())
period_code_udf = udf(period_code, IntegerType())

## Compute with original dataset
df = df.withColumn("date", date(df["ptn_yyyy"], df["ptn_mm"], df["ptn_dd"]))
df = df.withColumn("day_of_week", day_of_week(df["date"]))
df = df.withColumn("day_of_week_code", day_of_week_code_udf(df["day_of_week"]))
df = df.withColumn("quarter_code", quarter_code_udf(df["ptn_dd"], df["ptn_mm"]))
df = df.withColumn("period_code", period_code_udf(df["txn_tm"]))

df.show(10)

+-----+-----------+-------+---------+---------+------+--------+------+------+--------+----------+-----------+----------------+------------+-----------+
|ar_id|fm_to_ar_id|txn_amt|svc_br_no|opm_tp_cd|txn_cd|ptn_yyyy|ptn_mm|ptn_dd|  txn_tm|      date|day_of_week|day_of_week_code|quarter_code|period_code|
+-----+-----------+-------+---------+---------+------+--------+------+------+--------+----------+-----------+----------------+------------+-----------+
|    A|          B|     10|      900|       DR|     0|    2017|     1|     3| 5:00:00|2017-01-03|          1|               1|           1|          1|
|    B|          A|     10|      900|       CR|     0|    2017|     1|     3| 5:00:00|2017-01-03|          1|               1|           1|          1|
|    A|          B|    100|      900|       CR|     0|    2017|     1|     3|11:00:00|2017-01-03|          1|               1|           1|          2|
|    B|          A|    100|      900|       DR|     0|    2017|     1|     3|11:00:00|20

In [7]:
# Initial user dataset

## Find oldest & youngest Datetime object to calculate user's number of months.
from pyspark.sql.functions import from_unixtime, unix_timestamp, min, max

str_to_date = udf(lambda date_str : datetime.datetime.strptime(date_str[:-12], "%Y-%m"), DateType()) # ignore time & date
get_user_number_of_month = udf(lambda oldest, youngest : (youngest.year - oldest.year) * 12 + (youngest.month - oldest.month) + 1, IntegerType())

def get_df_with_number_of_month():
    user_df = df.select("ar_id", "date").withColumn("unix_date", unix_timestamp("date")).groupby("ar_id").agg(
                from_unixtime(min("unix_date")).alias("min_date"), 
                from_unixtime(max("unix_date")).alias("max_date"))
    user_df = user_df.withColumn("oldest_month", str_to_date(user_df["min_date"]))
    user_df = user_df.withColumn("youngest_month", str_to_date(user_df["max_date"]))
    user_df = user_df.withColumn("user_number_of_month", get_user_number_of_month(user_df["oldest_month"], user_df["youngest_month"]))
    return user_df

def initial_df():
    user_df = get_df_with_number_of_month()
    return user_df.join(df.filter("opm_tp_cd = 'CR' and txn_cd = 0").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "user_number_of_transfer_in"), "ar_id")\
                    .join(df.filter("opm_tp_cd = 'DR' and txn_cd = 0").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "user_number_of_transfer_out"), "ar_id")\
                    .join(df.filter("opm_tp_cd = 'CR' and txn_cd = 2").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "user_number_of_withdraw"), "ar_id")\
                    .join(df.filter("opm_tp_cd = 'DR' and txn_cd = 1").groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "user_number_of_deposit"), "ar_id")
        
user_df = initial_df()
user_df.show(10)

In [None]:
# Define function for generating period & day_of_week & quarter code

## Quarter code generator (quarter of the month)
def get_df_with_quarter_code(data, col_name, agg_func, agg_col):
    if agg_func == "sum":
        return data.groupby("ar_id").pivot("quarter_code").sum(agg_col)\
                .toDF('ar_id', col_name + 'Q1', col_name + 'Q2', col_name + 'Q3', col_name + 'Q4')
    else :
        return data.groupby("ar_id").pivot("quarter_code").count()\
                .toDF('ar_id', col_name + 'Q1', col_name + 'Q2', col_name + 'Q3', col_name + 'Q4')

## Period code generator (period of the day)
def get_df_with_period_code(data, col_name, agg_func, agg_col):
    if agg_func == "sum":
        return data.groupby("ar_id").pivot("period_code").sum(agg_col)\
                .toDF('ar_id', col_name + 'P1', col_name + 'P2', col_name + 'P3', col_name + 'P4')
    else :
        return data.groupby("ar_id").pivot("period_code").count()\
                .toDF('ar_id', col_name + 'P1', col_name + 'P2', col_name + 'P3', col_name + 'P4')

## Day of week code generator (Mon-Thu, Fri, Sat-Sun)
def get_df_with_day_of_week_code(data, col_name):
    return data.groupby("ar_id").pivot("day_of_week_code").count()\
            .toDF('ar_id', col_name + 'D1', col_name + 'D2', col_name + 'D3')


In [None]:
# Define feature extraction functions

def noDepositBranchVisit(is_unique):
    if is_unique:
        data = df.select("ar_id", "svc_br_no", "quarter_code").distinct()
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoDepositBranchVisitUnique")
        return grouped_data.join(get_df_with_quarter_code(data, "noDepositBranchVisitUnique", "count", "*"), "ar_id")

    else:
        data = df.select("ar_id", "svc_br_no", "quarter_code")
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoDepositBranchVisit")
        return grouped_data.join(get_df_with_quarter_code(data, "noDepositBranchVisit", "count", "*"), "ar_id")

def noDepositTransferIn(is_unique):
    if is_unique:
        data = df.filter("opm_tp_cd = 'CR' and txn_cd = 0").select("ar_id", "period_code", "day_of_week_code", "quarter_code").distinct()
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoTransferInUnique")
        return grouped_data.join(get_df_with_quarter_code(data, "noTransferInUnique", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noTransferInUnique", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noTransferInUnique"), "ar_id")
    else:
        data = df.filter("opm_tp_cd = 'CR' and txn_cd = 0").select("ar_id", "period_code", "day_of_week_code", "quarter_code")
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoTransferIn")
        return grouped_data.join(get_df_with_quarter_code(data, "noTransferIn", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noTransferIn", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noTransferIn"), "ar_id")
    
def noDepositTransferOut(is_unique):
    if is_unique:
        data = df.filter("opm_tp_cd = 'DR' and txn_cd = 0").select("ar_id", "period_code", "day_of_week_code", "quarter_code").distinct()
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoTransferOutUnique")
        return grouped_data.join(get_df_with_quarter_code(data, "noTransferOutUnique", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noTransferOutUnique", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noTransferOutUnique"), "ar_id")
    else:
        data = df.filter("opm_tp_cd = 'DR' and txn_cd = 0").select("ar_id", "period_code", "day_of_week_code", "quarter_code")
        grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoTransferOut")
        return grouped_data.join(get_df_with_quarter_code(data, "noTransferOut", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noTransferOut", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noTransferOut"), "ar_id")
    
def noDeposit(): 
    data = df.filter("opm_tp_cd = 'CR' and txn_cd = 2")
    grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoDeposit")
    return grouped_data.join(get_df_with_quarter_code(data, "noDeposit", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noDeposit", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noDeposit"), "ar_id")
def noWithdraw(): 
    data = df.filter("opm_tp_cd = 'DR' and txn_cd = 1")
    grouped_data = data.groupby("ar_id").agg({"*" : "count"}).withColumnRenamed("count(1)", "allNoWithdraw")
    return grouped_data.join(get_df_with_quarter_code(data, "noWithdraw", "count", "*"), "ar_id")\
                .join(get_df_with_period_code(data, "noWithdraw", "count", "*"), "ar_id")\
                .join(get_df_with_day_of_week_code(data, "noWithdraw"), "ar_id")
def depositAmount(): 
    data = df.filter("opm_tp_cd = 'CR' and txn_cd = 2")
    grouped_data = data.groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "allDepositAmount")
    return grouped_data.join(get_df_with_quarter_code(data, "depositAmount", "sum", "txn_amt"), "ar_id")

def withdrawAmount(): 
    data = df.filter("opm_tp_cd = 'DR' and txn_cd = 1")
    grouped_data = data.groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "allWithdrawAmount")
    return grouped_data.join(get_df_with_quarter_code(data, "withdrawAmount", "sum", "txn_amt"), "ar_id")

def depositTransferInAmount():
    data = df.filter("opm_tp_cd = 'CR' and txn_cd = 0")
    grouped_data = data.groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "allDepositTransferInAmount")
    return grouped_data.join(get_df_with_quarter_code(data, "depositTransferInAmount", "sum", "txn_amt"), "ar_id")
    
def depositTransferOutAmount():
    data = df.filter("opm_tp_cd = 'DR' and txn_cd = 0")
    grouped_data = data.groupby("ar_id").agg({"txn_amt" : "sum"}).withColumnRenamed("sum(txn_amt)", "allDepositTransferOutAmount")
    return grouped_data.join(get_df_with_quarter_code(data, "depositTransferOutAmount", "sum", "txn_amt"), "ar_id")

In [None]:
# define udf functions for ratio issues.
ratio = udf(lambda x, y : x / y)

In [None]:
# Join section

final_df = user_df.join(noDepositBranchVisit(is_unique=False), "ar_id", "left_outer")\
            .join(noDepositBranchVisit(is_unique=True), "ar_id", "left_outer")\
            .join(noDeposit(), "ar_id", "left_outer")\
            .join(noWithdraw(), "ar_id", "left_outer")\
            .join(depositAmount(), "ar_id", "left_outer")\
            .join(withdrawAmount(), "ar_id", "left_outer")\
            .join(noDepositTransferIn(is_unique=False), "ar_id", "left_outer")\
            .join(noDepositTransferIn(is_unique=True), "ar_id", "left_outer")\
            .join(noDepositTransferOut(is_unique=False), "ar_id", "left_outer")\
            .join(noDepositTransferOut(is_unique=True), "ar_id", "left_outer")\
            .join(depositTransferInAmount(), "ar_id", "left_outer")\
            .join(depositTransferOutAmount(), "ar_id", "left_outer")


In [9]:
final_df.withColumn("noDepositBranchVisit", ratio(df["allNoDepositBranchVisit"], df["user_number_of_month"])\
            .withColumn("noDepositBranchVisitUnique", ratio(df["allNoDepositBranchVisitUnique"], df["user_number_of_month"])\
            .withColumn("noDeposit", ratio(df["allNoDeposit"], df["user_number_of_month"])\
            .withColumn("noWithdraw", ratioDepositBranchVisit(df["allNoWithdraw"], df["user_number_of_month"])\
            .withColumn("noTransferIn", ratio(df["allNoTransferIn"], df["user_number_of_month"])\
            .withColumn("noTransferOut", ratio(df["allNoTransferOut"], df["user_number_of_month"])\
            .withColumn("noTransferInUnique", ratio(df["allNoTransferInUnique"], df["user_number_of_month"])\
            .withColumn("noTransferOutUnique", ratio(df["allNoTransferOutUnique"], df["user_number_of_month"])

SyntaxError: unexpected character after line continuation character (<ipython-input-9-3cb6d53f1a7a>, line 1)

In [None]:
for i in range(1, 5):
    print(i)

In [None]:
final_df.toPandas()

In [None]:
# Quarter code example

def get_df_with_quarter_code(data, col_name, agg_func, agg_col):
    if agg_func == "sum":
        return data.groupby("ar_id").pivot("quarter_code").sum(agg_col)\
                .toDF('ar_id', col_name + 'Q1', col_name + 'Q2', col_name + 'Q3', col_name + 'Q4')
    else :
        return data.groupby("ar_id").pivot("quarter_code").count()\
                .toDF('ar_id', col_name + 'Q1', col_name + 'Q2', col_name + 'Q3', col_name + 'Q4')
        
data = df.filter("opm_tp_cd = 'CR' and txn_cd = 0")

# data.groupby(["ar_id", "quarter_code"]).agg({"*" : "count"})\
# .withColumnRenamed("count(1)", "allNoTransferIn").show()

# data.groupby("ar_id").pivot("quarter_code").sum("txn_amt")\
# .toDF('ar_id', 'depositAmountQ1', 'depositAmountQ2', 'depositAmountQ3', 'depositAmountQ4').show()

get_df_with_quarter_code(data, "noTransferIn", "count", "*").show()

# data.groupby("ar_id").agg({"*" : "count"})\
# .withColumnRenamed("count(1)", "allNoTransferIn")\
# .join(data.groupby("ar_id").pivot("quarter_code").count()\
# .toDF('ar_id', 'noTransferInQ1', 'noTransferInQ2', 'noTransferInQ3', 'noTransferInQ4'), "ar_id").show()

In [None]:
# Period code example
            
def get_df_with_period_code(data, col_name, agg_func, agg_col):
    if agg_func == "sum":
        return data.groupby("ar_id").pivot("period_code").sum(agg_col)\
                .toDF('ar_id', col_name + 'P1', col_name + 'P2', col_name + 'P3', col_name + 'P4')
    else :
        return data.groupby("ar_id").pivot("period_code").count()\
                .toDF('ar_id', col_name + 'P1', col_name + 'P2', col_name + 'P3', col_name + 'P4')

data = df.filter("opm_tp_cd = 'CR' and txn_cd = 0")

# data.groupby("ar_id").agg({"*" : "count"})\
# .withColumnRenamed("count(1)", "allNoDeposit").show()

# data.groupby("ar_id", "period_code").agg({"*" : "count"})\
# .withColumnRenamed("count(1)", "noDeposit").show()

# data.groupby("ar_id").pivot("period_code").count()\
# .toDF('ar_id', 'noDepositQ1', 'noDepositQ2', 'noDepositQ3', 'noDepositQ4').show()

get_df_with_period_code(data, "noTransferIn", "count", "*").show()

# data.groupby("ar_id").agg({"*" : "count"})\
# .withColumnRenamed("count(1)", "allNoDeposit")\
# .join(data.groupby("ar_id").pivot("period_code").count()\
# .toDF('ar_id', 'noDepositP1', 'noDepositP2', 'noDepositP3', 'noDepositP4'), "ar_id").show()

In [None]:
# Day of week code example
def get_df_with_day_of_week_code(data, col_name):
    return data.groupby("ar_id").pivot("day_of_week_code").count()\
            .toDF('ar_id', col_name + 'D1', col_name + 'D2', col_name + 'D3')

get_df_with_day_of_week_code(data, "noTransferIn").show()

In [None]:
# Monthly example
## Find oldest & youngest Datetime object
from pyspark.sql.functions import from_unixtime, unix_timestamp, min, max

str_to_date = udf(lambda date_str : datetime.datetime.strptime(date_str[:-12], "%Y-%m"), DateType()) # ignore time & date
get_user_number_of_month = udf(lambda oldest, youngest : (youngest.year - oldest.year) * 12 + (youngest.month - oldest.month) + 1, IntegerType())


def get_user_number_of_month(user_df):
    user_df = df.select("ar_id", "date").withColumn("unix_date", unix_timestamp("date")).groupby("ar_id").agg(
                from_unixtime(min("unix_date")).alias("min_date"), 
                from_unixtime(max("unix_date")).alias("max_date"))

    # user_df.show()

    user_df = user_df.withColumn("oldest_month", str_to_date(user_df["min_date"]))
    user_df = user_df.withColumn("youngest_month", str_to_date(user_df["max_date"]))
    user_df = user_df.withColumn("user_number_of_month", get_user_number_of_month(user_df["oldest_month"], user_df["youngest_month"]))

    # user_df.show()

    return user_df