In [25]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.types import IntegerType, FloatType, DateType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import datetime, time, calendar

In [26]:
# Reuse the already-running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
print(sc)

<pyspark.context.SparkContext object at 0x000002173CBC2C50>


In [27]:
# join 
# ip <-> ar <-> mb
#           <-> dept

# ip <-> ar : "ip_id"
# ip <-> cc_cst : "x"
# cc_cst <-> cc_visa : "x"
# ar <-> mb : "src_ar_id"
# ar <-> dept : "ar_id"

# DerivedFeature
# [PerWeek & PerMonth]
# noTransferInUnique, noTransferOutUnique, noTransferIn, noTransferOut, ratioTransferIn, ratioTransferOut
# avgTransferInAmount, avgTransferOutAmount, avgTransferAmount
# noFeeAmountGroupBy(0,10,25,35)
# noTransferInPeriod(morning,afternoon,evening,night :: 6hr)
# noTransferOutPeriod(morning,afternoon,evening,night :: 6hr)
# noTransferInPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferOutPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# avgTransferInHoliday, avgTransferOutHoliday


# Deposit
# [PerWeek & PerMonth]
# noBranchVisit,noBranchVisitUnique,ratioBranchVisit
# avgTransferInAmount, avgTransferOutAmount, avgTransferAmount
# noDepositAmount, noWithdrawAmount
# avgDepositAmount, avgWithdrawAmount
# noWithdrawPeriod(morning,afternoon,evening,night :: 6hr)
# noDepositPeriod(morning,afternoon,evening,night :: 6hr)
# noDepositPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noWithdrawPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferInUnique, noTransferOutUnique, noTransferIn, noTransferOut, ratioTransferIn, ratioTransferOut
# noTransferInPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferOutPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# ??Balance??
# ??CBS Sub Operation Code??
# ahrzd_usr_id <-- [KMP, EDC, ATM]

# Credit


# Approach
# 1 Detect subordinates (employees) by looking at who receives the largest amounts of money, then verify

# Visualize
# Location

# feature = ["vc_ip.ip_id"]
# category = ["vc_ip.ip_tp_cd","vc_ip.mar_st_cd","vc_ip.ctf_tp_cd","vc_ip.ocp_cd","vc_ip.idv_incm_seg_cd"]
# filteredFeature = ["vc_ip.prvn_f","vc_ip.ip_st_cd","vc_ip.death_f"]

In [28]:
# Build (or reuse) the SparkSession used by every cell below.
# NOTE(review): the warehouse directory is a hard-coded absolute path; confirm it
# exists on the machine running this notebook.
spark = (SparkSession
         .builder
         .appName("MerchantInsight")
         .config("spark.sql.warehouse.dir", "/opt/jupyter_workspace/spark-warehouse")
         .getOrCreate())
print (spark)

<pyspark.sql.session.SparkSession object at 0x000002173E42CAC8>


In [29]:
# Load the mock transfer transactions. The first CSV line is used as the header
# and column types are inferred from the data. The path is relative to the
# notebook's working directory.
mb_trans_data =(spark
      .read
      .option("header", "true")
      .option("inferSchema","true")
      .csv('mock_data/mock-transaction.csv'))

In [30]:
# Preview the loaded transactions.
mb_trans_data.show()

+--------+--------+--------+------+------+------+-------+--------+-------+
|fm_ar_id|to_ar_id|ptn_yyyy|ptn_mm|ptn_dd|ptn_hr|ptn_min| tfr_amt|fee_amt|
+--------+--------+--------+------+------+------+-------+--------+-------+
|     128|     140|    2016|     6|    19|    11|     22|  6201.0|      0|
|     114|     150|    2016|     8|     4|    13|     37| 24021.0|      0|
|     131|     120|    2016|     7|     6|    13|     31| 2361.28|      0|
|     144|     104|    2016|     9|    10|    11|     32| 24410.0|     10|
|     133|     130|    2016|    12|    15|    13|     25| 18157.0|      0|
|     109|     140|    2016|     6|    19|    22|     33| 11466.0|     25|
|     116|     146|    2016|     7|    23|    15|     57| 15670.0|      0|
|     110|     128|    2016|    10|    29|    20|     55| 15124.0|      0|
|     101|     123|    2016|     7|    10|    22|     52| 21758.0|     10|
|     128|     137|    2016|     8|    30|    20|     38| 23290.0|      0|
|     108|     114|    20

In [31]:
#Pre-processing data

def no_days_in_month(month, year):
    """Return the number of days in ``month`` of ``year``.

    Args:
        month: Month number; 1, 3, 5, 7, 8, 10, 12 give 31 and 4, 6, 9, 11 give 30.
        year: Year, only consulted for the leap-year check.

    Returns:
        31 or 30 for the months listed above. Any other value is treated as
        February and yields 29 in a leap year, otherwise 28.
    """
    # These tables are kept local: the module-level names the function used to
    # read are not defined anywhere in the visible notebook, so a call on a
    # fresh kernel raised NameError.
    day_months_31 = (1, 3, 5, 7, 8, 10, 12)
    day_months_30 = (4, 6, 9, 11)
    if month in day_months_31:
        return 31
    if month in day_months_30:
        return 30
    return 29 if calendar.isleap(year) else 28
        
def day_of_week_code(day_of_week):
    """Bucket a weekday index into one of three codes.

    Values below 4 map to 0, values above 4 map to 2, and anything else
    (i.e. exactly 4) maps to 1. The notebook feeds this with
    ``datetime.weekday()`` values, where that split is Mon-Thu / Fri / Sat-Sun.
    """
    if day_of_week < 4:
        return 0
    return 2 if day_of_week > 4 else 1
    
def quarter_code(date, month):
    """Return which quarter of the month (1-4) the day ``date`` falls in.

    Args:
        date: Day of the month.
        month: Month number (1-12).

    Returns:
        An int in 1..4.

        * 31-day months: days 1-8 -> 1, 9-15 -> 2, 16-23 -> 3, otherwise 4.
        * 30-day months: days 1-8 -> 1, 9-15 -> 2, 16-22 -> 3, otherwise 4.
        * Any other month (February): 7-day buckets, days 1-7 -> 1, 8-14 -> 2,
          15-21 -> 3, 22 and later -> 4.
    """
    month_31_days = (1, 3, 5, 7, 8, 10, 12)
    month_30_days = (4, 6, 9, 11)
    if month in month_31_days:
        third_quarter_end = 23
    elif month in month_30_days:
        third_quarter_end = 22
    else:
        # February. The previous ``date / 4`` produced a float (and values
        # outside 1-4), which does not fit the IntegerType UDF built from this
        # function. Integer 7-day buckets are used instead; the cap keeps
        # Feb 29 in quarter 4.
        return min((date - 1) // 7 + 1, 4)

    if date in range(1, 9):
        return 1
    if date in range(9, 16):
        return 2
    if date in range(16, third_quarter_end + 1):
        return 3
    return 4

def period_code(time):
    """Map a time string to a six-hour period code.

    The hour is whatever remains after dropping the last six characters of
    ``time`` (presumably an "HH:MM:SS"-style string - confirm against callers).

    Returns:
        0 for hours 0-5, 1 for 6-11, 2 for 12-17, and 3 for every other hour.
    """
    hour = int(time[:-6])
    # Hours 0..17 split evenly into three six-hour buckets; the rest is bucket 3.
    if 0 <= hour < 18:
        return hour // 6
    return 3

# Spark UDF wrappers around the helpers above, used to derive columns on
# mb_trans_data. ``quarter_code_udf`` used to be registered twice with the same
# definition; it is now registered once.

# Build a date from separate year / month / day columns. A plain
# ``datetime.date`` is constructed because the declared return type is DateType.
date = udf(lambda y, m, d: datetime.date(y, m, d), DateType())
# Weekday index of a date column, as returned by ``date.weekday()``.
day_of_week = udf(lambda date: int(date.weekday()), IntegerType())
day_of_week_code_udf = udf(day_of_week_code, IntegerType())
quarter_code_udf = udf(quarter_code, IntegerType())

In [32]:
# Spot check: day 19 of a 31-day month (December); the recorded output is 3.
quarter_code(19,12)

3

In [33]:
# Check the Python type of ptn_dd on the first collected row (recorded output: int).
type(mb_trans_data.select("ptn_dd").collect()[0].ptn_dd)

int

In [34]:
# Add the derived calendar columns to the transaction frame.
# quarter_code: quarter of the month, from day-of-month and month.
mb_trans_data = mb_trans_data.withColumn("quarter_code",quarter_code_udf(mb_trans_data['ptn_dd'],mb_trans_data['ptn_mm']))
# date: calendar date assembled from the year / month / day partition columns.
mb_trans_data = mb_trans_data.withColumn("date", date(mb_trans_data["ptn_yyyy"], mb_trans_data["ptn_mm"], mb_trans_data["ptn_dd"]))
# day_of_week: weekday index of that date (date.weekday()).
mb_trans_data = mb_trans_data.withColumn("day_of_week", day_of_week(mb_trans_data["date"]))
# day_of_week_code: weekday index bucketed by day_of_week_code().
mb_trans_data = mb_trans_data.withColumn("day_of_week_code", day_of_week_code_udf(mb_trans_data["day_of_week"]))
mb_trans_data.show()

+--------+--------+--------+------+------+------+-------+--------+-------+------------+----------+-----------+----------------+
|fm_ar_id|to_ar_id|ptn_yyyy|ptn_mm|ptn_dd|ptn_hr|ptn_min| tfr_amt|fee_amt|quarter_code|      date|day_of_week|day_of_week_code|
+--------+--------+--------+------+------+------+-------+--------+-------+------------+----------+-----------+----------------+
|     128|     140|    2016|     6|    19|    11|     22|  6201.0|      0|           3|2016-06-19|          6|               2|
|     114|     150|    2016|     8|     4|    13|     37| 24021.0|      0|           1|2016-08-04|          3|               0|
|     131|     120|    2016|     7|     6|    13|     31| 2361.28|      0|           1|2016-07-06|          2|               0|
|     144|     104|    2016|     9|    10|    11|     32| 24410.0|     10|           2|2016-09-10|          5|               2|
|     133|     130|    2016|    12|    15|    13|     25| 18157.0|      0|           2|2016-12-15|      

In [35]:
# Spot check: day 16 of a 31-day month (May); the recorded output is 3.
quarter_code(16,5)

3

In [36]:
# Number of outgoing transfers per (sender account, quarter_code) pair, sorted
# by account and quarter. The commented-out lines are earlier experiments.
# mb_trans_data.select('fm_ar_id').groupby('fm_ar_id').count().show()
mb_trans_data.select('fm_ar_id',mb_trans_data['quarter_code']).groupby(['fm_ar_id','quarter_code']).count().sort(['fm_ar_id','quarter_code']).show()
# mb_trans_data.groupby(mb_trans_data['quarter_code']).count().show()
# F.count(mb_trans_data['quarter_code']==1).alias("noMbTransferOutDuringQ1")

+--------+------------+-----+
|fm_ar_id|quarter_code|count|
+--------+------------+-----+
|     101|           1|    1|
|     101|           2|    3|
|     101|           4|    2|
|     102|           1|    1|
|     102|           2|    3|
|     102|           3|    1|
|     102|           4|    4|
|     103|           2|    1|
|     103|           4|    2|
|     104|           2|    1|
|     104|           3|    2|
|     105|           1|    1|
|     105|           4|    3|
|     106|           1|    1|
|     106|           4|    2|
|     107|           1|    1|
|     107|           4|    5|
|     108|           1|    4|
|     108|           2|    2|
|     108|           3|    3|
+--------+------------+-----+
only showing top 20 rows



In [37]:
# Outgoing transfer counts per sender account, one column per quarter_code.
# crosstab names its first column "<col1>_<col2>", hence the sort key; toDF then
# renames all five columns positionally, so it expects exactly four quarter
# columns in the crosstab result.
# (A previous statement here built a distinct() frame and discarded it; it had
# no effect and has been removed together with a commented-out experiment.)
(mb_trans_data.stat.crosstab("fm_ar_id", "quarter_code")
 .sort('fm_ar_id_quarter_code')
 .toDF('ar_id', 'noMbTransferOutQ1', 'noMbTransferOutQ2', 'noMbTransferOutQ3', 'noMbTransferOutQ4')
 .show())

+-----+-----------------+-----------------+-----------------+-----------------+
|ar_id|noMbTransferOutQ1|noMbTransferOutQ2|noMbTransferOutQ3|noMbTransferOutQ4|
+-----+-----------------+-----------------+-----------------+-----------------+
|  101|                1|                3|                0|                2|
|  102|                1|                3|                1|                4|
|  103|                0|                1|                0|                2|
|  104|                0|                1|                2|                0|
|  105|                1|                0|                0|                3|
|  106|                1|                0|                0|                2|
|  107|                1|                0|                0|                5|
|  108|                4|                2|                3|                1|
|  109|                0|                1|                1|                3|
|  110|                1|               

## Feature Extraction

In [38]:
#Feature Extraction
def getMbFrequency(columnName,newColumnName):
    """Count the rows of ``mb_trans_data`` for each distinct ``columnName`` value.

    Args:
        columnName: Column of ``mb_trans_data`` to group by.
        newColumnName: Name given to the resulting count column.

    Returns:
        A DataFrame with the grouping column and the count column. When
        ``columnName`` is "fm_ar_id" or "to_ar_id" the grouping column is
        renamed to "ar_id".
    """
    counts = (mb_trans_data
              .select(columnName)
              .groupby(columnName)
              .count())
    if columnName in ("fm_ar_id", "to_ar_id"):
        counts = counts.withColumnRenamed(columnName, "ar_id")
    return counts.withColumnRenamed("count", newColumnName)

In [74]:
#Feature Extraction V2
def getMbFrequencyPerQuarter(columnName,newColumnName):
    """Cross-tabulate ``columnName`` against ``quarter_code`` on ``mb_trans_data``.

    Args:
        columnName: Column of ``mb_trans_data`` to tabulate (one output row per value).
        newColumnName: Prefix for the four count columns, which are named
            ``<prefix>Q1`` .. ``<prefix>Q4``.

    Returns:
        A five-column DataFrame. Columns are renamed positionally, so the
        crosstab is expected to yield exactly four quarter columns. When
        ``columnName`` is "fm_ar_id" or "to_ar_id" the key column is renamed
        to "ar_id".
    """
    quarterColumns = [newColumnName + 'Q' + str(quarter) for quarter in range(1, 5)]
    crossTab = mb_trans_data.stat.crosstab(columnName, "quarter_code")
    perQuarter = crossTab.toDF(columnName, *quarterColumns)
    if columnName in ("fm_ar_id", "to_ar_id"):
        return perQuarter.withColumnRenamed(columnName, "ar_id")
    return perQuarter

def getMbFrequencyUniquePerQuarter(columnName,newColumnName):
    """Like ``getMbFrequencyPerQuarter`` but over distinct sender/receiver pairs.

    ``mb_trans_data`` is first reduced to distinct
    (fm_ar_id, to_ar_id, quarter_code) rows, so repeated transfers between the
    same two accounts in the same quarter are counted once.

    Args:
        columnName: Column to tabulate; it must be one of the three columns
            kept by the distinct projection.
        newColumnName: Prefix for the four count columns (``<prefix>Q1`` .. ``<prefix>Q4``).

    Returns:
        A five-column DataFrame, renamed positionally. When ``columnName`` is
        "fm_ar_id" or "to_ar_id" the key column is renamed to "ar_id".
    """
    quarterColumns = [newColumnName + 'Q' + str(quarter) for quarter in range(1, 5)]
    uniquePairs = mb_trans_data.select('fm_ar_id', 'to_ar_id', 'quarter_code').distinct()
    perQuarter = (uniquePairs.stat
                  .crosstab(columnName, "quarter_code")
                  .toDF(columnName, *quarterColumns))
    if columnName in ("fm_ar_id", "to_ar_id"):
        return perQuarter.withColumnRenamed(columnName, "ar_id")
    return perQuarter

#get Ratio below

def getMbFeeFrequencyPerQuarter(columnName,newColumnName):
    """Average monthly count of free and fee-paying transfers per ``columnName`` value.

    Despite the name, the result is normalised per month, not per quarter: the
    counts are divided by the number of distinct (ptn_yyyy, ptn_mm) pairs found
    in ``mb_trans_data``.

    Args:
        columnName: Column of ``mb_trans_data`` to tabulate (e.g. "fm_ar_id").
        newColumnName: Prefix for the output columns.

    Returns:
        A DataFrame with the key column, ``<prefix>FreePerMonth`` and
        ``<prefix>NotFreePerMonth``. When ``columnName`` is "fm_ar_id" or
        "to_ar_id" the key column is renamed to "ar_id".
    """
    noMonth = mb_trans_data.select("ptn_yyyy","ptn_mm").distinct().count()
    # pay_fee is True when a non-zero fee was charged. Only the requested key
    # column is projected; the projection used to hard-code "fm_ar_id", which
    # made the crosstab fail for any other columnName such as "to_ar_id".
    mb_fee = (mb_trans_data
              .withColumn("pay_fee", mb_trans_data["fee_amt"] != 0)
              .select(columnName, "pay_fee"))
    # toDF renames positionally: this assumes the crosstab yields exactly two
    # value columns with the "no fee" column first - TODO confirm for data where
    # every transfer is free or every transfer carries a fee.
    freqFee = mb_fee.stat.crosstab(columnName,"pay_fee")\
        .toDF(columnName,newColumnName+'Free',newColumnName+'NotFree')
    freqFee = freqFee.withColumn(newColumnName+'FreePerMonth',freqFee[newColumnName+'Free']/noMonth)
    freqFee = freqFee.withColumn(newColumnName+'NotFreePerMonth',freqFee[newColumnName+'NotFree']/noMonth)
    freqFee = freqFee.select(columnName,newColumnName+'FreePerMonth',newColumnName+'NotFreePerMonth')
    if columnName in ("fm_ar_id", "to_ar_id"):
        freqFee = freqFee.withColumnRenamed(columnName,"ar_id")
    return freqFee

def getAvgTransferAmountPerQuarter(columnName,newColumnName):
    """Average transfer amount per ``columnName`` value and quarter.

    The earlier draft grouped by a non-existent "ar_id" column, discarded the
    aggregate, referenced an undefined ``freqIn`` and returned nothing. This
    version follows the shape of the other per-quarter helpers.
    NOTE(review): averaging (rather than summing) ``tfr_amt`` is taken from the
    function name - confirm this is the intended feature.

    Args:
        columnName: Column of ``mb_trans_data`` to group by (e.g. "fm_ar_id").
        newColumnName: Prefix for the four output columns, which are named
            ``<prefix>Q1`` .. ``<prefix>Q4``.

    Returns:
        A DataFrame with the key column and one average-amount column per
        quarter_code 1-4 (null where there are no transfers in that quarter).
        When ``columnName`` is "fm_ar_id" or "to_ar_id" the key column is
        renamed to "ar_id".
    """
    quarters = [1, 2, 3, 4]
    # Listing the pivot values explicitly guarantees all four columns exist
    # even if a quarter never occurs in the data.
    avgAmount = (mb_trans_data
                 .groupby(columnName)
                 .pivot("quarter_code", quarters)
                 .agg(F.avg("tfr_amt")))
    for quarter in quarters:
        avgAmount = avgAmount.withColumnRenamed(str(quarter), newColumnName + 'Q' + str(quarter))
    if columnName in ("fm_ar_id", "to_ar_id"):
        avgAmount = avgAmount.withColumnRenamed(columnName, "ar_id")
    return avgAmount
        

In [40]:
# Total transferred amount per (sender account, quarter_code), sorted by both keys.
mb_trans_data.groupby(["fm_ar_id","quarter_code"]).agg({"tfr_amt" : "sum"}).sort(['fm_ar_id','quarter_code']).withColumnRenamed("sum(tfr_amt)", "transferAmount").show()

+--------+------------+--------------+
|fm_ar_id|quarter_code|transferAmount|
+--------+------------+--------------+
|     101|           1|       10779.0|
|     101|           2|       55389.0|
|     101|           4|       21351.0|
|     102|           1|        4610.0|
|     102|           2|       42225.0|
|     102|           3|       15650.0|
|     102|           4|       53230.0|
|     103|           2|       17396.0|
|     103|           4|       25857.0|
|     104|           2|        6114.0|
|     104|           3|        3488.0|
|     105|           1|       15854.0|
|     105|           4|       25937.0|
|     106|           1|       23069.0|
|     106|           4|       17855.0|
|     107|           1|       15521.0|
|     107|           4|       37583.0|
|     108|           1|       39724.0|
|     108|           2|       32657.0|
|     108|           3|       57412.5|
+--------+------------+--------------+
only showing top 20 rows



In [75]:
# Base frame: one row per account that appears as a sender.
df = mb_trans_data.select("fm_ar_id").distinct().withColumnRenamed("fm_ar_id", "ar_id")

# Left-join each per-quarter count feature onto the base frame. The order is
# kept fixed so the resulting schema is stable.
train_data = df
for featureFn, sourceColumn, featureName in [
    (getMbFrequencyPerQuarter, "fm_ar_id", "noMbTransferOut"),
    (getMbFrequencyPerQuarter, "to_ar_id", "noMbTransferIn"),
    (getMbFrequencyUniquePerQuarter, "fm_ar_id", "noMbTransferOutUnique"),
    (getMbFrequencyUniquePerQuarter, "to_ar_id", "noMbTransferInUnique"),
]:
    train_data = train_data.join(featureFn(sourceColumn, featureName), "ar_id", "left_outer")

# Ratio of the "unique" count to the total count, per quarter and direction.
for quarter in ("1", "2", "3", "4"):
    train_data = train_data.withColumn(
        "ratioTransferOutQ" + quarter,
        train_data["noMbTransferOutUniqueQ" + quarter] / train_data["noMbTransferOutQ" + quarter])
    train_data = train_data.withColumn(
        "ratioTransferInQ" + quarter,
        train_data["noMbTransferInUniqueQ" + quarter] / train_data["noMbTransferInQ" + quarter])

# Monthly free / fee-paying transfer frequencies for senders.
train_data = train_data.join(getMbFeeFrequencyPerQuarter("fm_ar_id", "noMbFee"), "ar_id", "left_outer")
train_data.printSchema()

root
 |-- ar_id: integer (nullable = true)
 |-- noMbTransferOutQ1: long (nullable = true)
 |-- noMbTransferOutQ2: long (nullable = true)
 |-- noMbTransferOutQ3: long (nullable = true)
 |-- noMbTransferOutQ4: long (nullable = true)
 |-- noMbTransferInQ1: long (nullable = true)
 |-- noMbTransferInQ2: long (nullable = true)
 |-- noMbTransferInQ3: long (nullable = true)
 |-- noMbTransferInQ4: long (nullable = true)
 |-- noMbTransferOutUniqueQ1: long (nullable = true)
 |-- noMbTransferOutUniqueQ2: long (nullable = true)
 |-- noMbTransferOutUniqueQ3: long (nullable = true)
 |-- noMbTransferOutUniqueQ4: long (nullable = true)
 |-- noMbTransferInUniqueQ1: long (nullable = true)
 |-- noMbTransferInUniqueQ2: long (nullable = true)
 |-- noMbTransferInUniqueQ3: long (nullable = true)
 |-- noMbTransferInUniqueQ4: long (nullable = true)
 |-- ratioTransferOutQ1: double (nullable = true)
 |-- ratioTransferInQ1: double (nullable = true)
 |-- ratioTransferOutQ2: double (nullable = true)
 |-- ratioTransf

In [72]:
# Manual check of the per-month fee frequency: free-transfer count divided by 7.
# NOTE(review): this cell reads "noMbFeeFree" / "noMbFeeNotFree", which
# getMbFeeFrequencyPerQuarter as defined in this notebook drops in its final
# select - this cell was executed (In [72]) before that definition (In [74]),
# so it may not survive Restart & Run All. The literal 7 matches the distinct
# (ptn_yyyy, ptn_mm) count shown by the month-count cell.
train_data.select("ar_id","noMbFeeFree","noMbFeeNotFree")\
.withColumn("noMbFeeFreePerMonth",train_data["noMbFeeFree"]/7).show()

+-----+-----------+--------------+-------------------+
|ar_id|noMbFeeFree|noMbFeeNotFree|noMbFeeFreePerMonth|
+-----+-----------+--------------+-------------------+
|  148|          5|             2| 0.7142857142857143|
|  137|          4|             0| 0.5714285714285714|
|  133|          6|             4| 0.8571428571428571|
|  108|          6|             4| 0.8571428571428571|
|  101|          2|             4| 0.2857142857142857|
|  126|          4|             1| 0.5714285714285714|
|  115|          3|             0|0.42857142857142855|
|  103|          1|             2|0.14285714285714285|
|  128|          7|             4|                1.0|
|  122|          1|             1|0.14285714285714285|
|  140|          2|             1| 0.2857142857142857|
|  132|          4|             3| 0.5714285714285714|
|  146|          3|             2|0.42857142857142855|
|  142|          4|             3| 0.5714285714285714|
|  139|          5|             2| 0.7142857142857143|
|  120|   

In [73]:
# Number of distinct (year, month) pairs present in the data (recorded output: 7).
mb_trans_data.select("ptn_yyyy","ptn_mm").distinct().count()

7