In [75]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.types import IntegerType, FloatType, DateType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
import datetime, time, calendar

In [2]:
sc = SparkContext.getOrCreate()
print(sc)

<pyspark.context.SparkContext object at 0x000002402944F860>


In [3]:
# join 
# ip <-> ar <-> mb
#           <-> dept

# ip <-> ar : "ip_id"
# ip <-> cc_cst : "x"
# cc_cst <-> cc_visa : "x"
# ar <-> mb : "src_ar_id"
# ar <-> dept : "ar_id"

# DerivedFeature
# [PerWeek & PerMonth]
# noTransferInUnique, noTransferOutUnique, noTransferIn, noTransferOut, ratioTransferIn, ratioTransferOut
# avgTransferInAmount, avgTransferOutAmount, avgTransferAmount
# noFeeAmountGroupBy(0,10,25,35)
# noTransferInPeriod(morning,afternoon,evening,night :: 6hr)
# noTransferOutPeriod(morning,afternoon,evening,night :: 6hr)
# noTransferInPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferOutPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# avgTransferInHoliday, avgTransferOutHoliday


# Deposit
# [PerWeek & PerMonth]
# noBranchVisit,noBranchVisitUnique,ratioBranchVisit
# avgTransferInAmount, avgTransferOutAmount, avgTransferAmount
# noDepositAmount, noWithdrawAmount
# avgDepositAmount, avgWithdrawAmount
# noWithdrawPeriod(morning,afternoon,evening,night :: 6hr)
# noDepositPeriod(morning,afternoon,evening,night :: 6hr)
# noDepositPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noWithdrawPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferInUnique, noTransferOutUnique, noTransferIn, noTransferOut, ratioTransferIn, ratioTransferOut
# noTransferInPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# noTransferOutPerDay(Mon-Thu, Fri, Sat-Sun [avg]) <-- PerMonth
# ??Balance??
# ??CBS Sub Operation Code??
# ahrzd_usr_id <-- [KMP, EDC, ATM]

# Credit


# Approach
# 1 Detech ลูกน้องโดยดูจำนวนเงินที่รับมากที่สุดแล้วเช็ค

# Visualize
# Location

# feature = ["vc_ip.ip_id"]
# category = ["vc_ip.ip_tp_cd","vc_ip.mar_st_cd","vc_ip.ctf_tp_cd","vc_ip.ocp_cd","vc_ip.idv_incm_seg_cd"]
# filteredFeature = ["vc_ip.prvn_f","vc_ip.ip_st_cd","vc_ip.death_f"]

In [4]:
spark = (SparkSession
         .builder
         .appName("MerchantInsight")
         .config("spark.sql.warehouse.dir", "/opt/jupyter_workspace/spark-warehouse")
         .getOrCreate())
print (spark)

<pyspark.sql.session.SparkSession object at 0x000002402944F358>


In [106]:
mb_trans_data =(spark
      .read
      .option("header", "true")
      .option("inferSchema","true")
      .csv('mock_data/mock-transaction.csv'))

In [105]:
mb_trans_data.show()

+--------+--------+--------+------+------+------+-------+
|fm_ar_id|to_ar_id|ptn_yyyy|ptn_mm|ptn_dd|ptn_hr|ptn_min|
+--------+--------+--------+------+------+------+-------+
|     128|     140|    2016|     6|    19|    11|     22|
|     114|     150|    2016|     8|     4|    13|     37|
|     131|     120|    2016|     7|     6|    13|     31|
|     144|     104|    2016|     9|    10|    11|     32|
|     133|     130|    2016|    12|    15|    13|     25|
|     109|     140|    2016|     6|    19|    22|     33|
|     116|     146|    2016|     7|    23|    15|     57|
|     110|     128|    2016|    10|    29|    20|     55|
|     101|     123|    2016|     7|    10|    22|     52|
|     128|     137|    2016|     8|    30|    20|     38|
|     108|     114|    2016|    12|    19|    20|     49|
|     103|     142|    2016|    12|    31|    18|      5|
|     135|     123|    2016|     8|     3|    14|     52|
|     118|     141|    2016|     7|    27|    15|     19|
|     138|    

In [107]:
#Pre-processing data

def no_days_in_month(month, year):
    if month in day_months_31: 
        return 31
    elif month in day_months_30:
        return 30
    else:
        if calendar.isleap(year):
            return 29
        else:
            return 28
        
def day_of_week_code(day_of_week):
    if day_of_week < 4:
        return 0
    elif day_of_week > 4:
        return 2
    else:
        return 1
    
def quarter_code(date, month):
    month_31_days = [1,3,5,7,8,10,12]
    month_30_days = [4,6,9,11]
    if(month in month_31_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,24)):
            return 3
        else:
            return 4
    elif(month in month_30_days):
        if(date in range(1,9)):
            return 1
        elif(date in range(9,16)):
            return 2
        elif(date in range(16,23)):
            return 3
        else:
            return 4
    else: # February
        return date / 4 

def period_code(time):
    hour = int(time[:-6])
    if hour in range(0, 6):
        return 0
    elif hour in range(6, 12):
        return 1
    elif hour in range(12, 18):
        return 2
    else:
        return 3

quarter_code_udf = udf(quarter_code,IntegerType())
date = udf(lambda y, m, d : datetime.datetime(y, m ,d), DateType())
day_of_week = udf(lambda date : int(date.weekday()), IntegerType())
day_of_week_code_udf = udf(day_of_week_code, IntegerType())
quarter_code_udf = udf(quarter_code, IntegerType())

In [108]:
quarter_code(19,12)

3

In [109]:
type(mb_trans_data.select("ptn_dd").collect()[0].ptn_dd)

int

In [110]:
mb_trans_data = mb_trans_data.withColumn("quarter_code",quarter_code_udf(mb_trans_data['ptn_dd'],mb_trans_data['ptn_mm']))
mb_trans_data = mb_trans_data.withColumn("date", date(mb_trans_data["ptn_yyyy"], mb_trans_data["ptn_mm"], mb_trans_data["ptn_dd"]))
mb_trans_data = mb_trans_data.withColumn("day_of_week", day_of_week(mb_trans_data["date"]))
mb_trans_data = mb_trans_data.withColumn("day_of_week_code", day_of_week_code_udf(mb_trans_data["day_of_week"]))
mb_trans_data.show()

+--------+--------+--------+------+------+------+-------+------------+----------+-----------+----------------+
|fm_ar_id|to_ar_id|ptn_yyyy|ptn_mm|ptn_dd|ptn_hr|ptn_min|quarter_code|      date|day_of_week|day_of_week_code|
+--------+--------+--------+------+------+------+-------+------------+----------+-----------+----------------+
|     128|     140|    2016|     6|    19|    11|     22|           3|2016-06-19|          6|               2|
|     114|     150|    2016|     8|     4|    13|     37|           1|2016-08-04|          3|               0|
|     131|     120|    2016|     7|     6|    13|     31|           1|2016-07-06|          2|               0|
|     144|     104|    2016|     9|    10|    11|     32|           2|2016-09-10|          5|               2|
|     133|     130|    2016|    12|    15|    13|     25|           2|2016-12-15|          3|               0|
|     109|     140|    2016|     6|    19|    22|     33|           3|2016-06-19|          6|               2|
|

In [19]:
quarter_code(16,5)

3

In [111]:
# mb_trans_data.select('fm_ar_id').groupby('fm_ar_id').count().show()
mb_trans_data.select('fm_ar_id',mb_trans_data['quarter_code']).groupby(['fm_ar_id','quarter_code']).count().sort(['fm_ar_id','quarter_code']).show()
# mb_trans_data.groupby(mb_trans_data['quarter_code']).count().show()
# F.count(mb_trans_data['quarter_code']==1).alias("noMbTransferOutDuringQ1")

+--------+------------+-----+
|fm_ar_id|quarter_code|count|
+--------+------------+-----+
|     101|           1|    1|
|     101|           2|    3|
|     101|           4|    2|
|     102|           1|    1|
|     102|           2|    3|
|     102|           3|    1|
|     102|           4|    4|
|     103|           2|    1|
|     103|           4|    2|
|     104|           2|    1|
|     104|           3|    2|
|     105|           1|    1|
|     105|           4|    3|
|     106|           1|    1|
|     106|           4|    2|
|     107|           1|    1|
|     107|           4|    5|
|     108|           1|    4|
|     108|           2|    2|
|     108|           3|    3|
+--------+------------+-----+
only showing top 20 rows



In [118]:
# mb_trans_data.select('fm_ar_id','quarter_code').groupby(['fm_ar_id','quarter_code']).agg(F.count(mb_trans_data['quarter_code']==1).alias("noMbTransferOutDuringQ1")).sort('fm_ar_id').show()
mb_trans_data.select('fm_ar_id','to_ar_id','quarter_code').distinct()
mb_trans_data.stat.crosstab("fm_ar_id","quarter_code").sort('fm_ar_id_quarter_code')\
.toDF('ar_id','noMbTransferOutQ1','noMbTransferOutQ2','noMbTransferOutQ3','noMbTransferOutQ4').show()

+-----+-----------------+-----------------+-----------------+-----------------+
|ar_id|noMbTransferOutQ1|noMbTransferOutQ2|noMbTransferOutQ3|noMbTransferOutQ4|
+-----+-----------------+-----------------+-----------------+-----------------+
|  101|                1|                3|                0|                2|
|  102|                1|                3|                1|                4|
|  103|                0|                1|                0|                2|
|  104|                0|                1|                2|                0|
|  105|                1|                0|                0|                3|
|  106|                1|                0|                0|                2|
|  107|                1|                0|                0|                5|
|  108|                4|                2|                3|                1|
|  109|                0|                1|                1|                3|
|  110|                1|               

## Features Extraction

In [22]:
#Feature Extraction
def getMbFrequency(columnName,newColumnName):
    freqIn = mb_trans_data.select(columnName).groupby(columnName).count()
    if((columnName == "fm_ar_id") | (columnName == "to_ar_id")):
        freqIn = freqIn.withColumnRenamed(columnName,"ar_id")
    freqIn = freqIn.withColumnRenamed("count",newColumnName)
    return freqIn

In [60]:
#Feature Extraction V2
def getMbFrequencyPerQuarter(columnName,newColumnName):
    freqIn = mb_trans_data.stat.crosstab(columnName,"quarter_code")\
    .toDF(columnName,newColumnName+'Q1',newColumnName+'Q2',newColumnName+'Q3',newColumnName+'Q4')
    if((columnName == "fm_ar_id") | (columnName == "to_ar_id")):
        freqIn = freqIn.withColumnRenamed(columnName,"ar_id")
    return freqIn

def getMbFrequencyUniquePerQuarter(columnName,newColumnName):
    mb_trans_data_unique = mb_trans_data.select('fm_ar_id','to_ar_id','quarter_code').distinct()
    freqIn = mb_trans_data_unique.stat.crosstab(columnName,"quarter_code")\
    .toDF(columnName,newColumnName+'Q1',newColumnName+'Q2',newColumnName+'Q3',newColumnName+'Q4')
    if((columnName == "fm_ar_id") | (columnName == "to_ar_id")):
        freqIn = freqIn.withColumnRenamed(columnName,"ar_id")
    return freqIn

In [68]:
df = mb_trans_data.select("fm_ar_id").distinct().withColumnRenamed("fm_ar_id","ar_id")
train_data = df.join(getMbFrequencyPerQuarter("fm_ar_id","noMbTransferOut"),"ar_id","left_outer")
train_data = train_data.join(getMbFrequencyPerQuarter("to_ar_id"," noMbTransferIn"),"ar_id","left_outer")
train_data = train_data.join(getMbFrequencyUniquePerQuarter("fm_ar_id"," noMbTransferOutUnique"),"ar_id","left_outer")
train_data = train_data.join(getMbFrequencyUniquePerQuarter("to_ar_id"," noMbTransferInUnique"),"ar_id","left_outer")
train_data.printSchema()

root
 |-- ar_id: integer (nullable = true)
 |-- noMbTransferOutQ1: long (nullable = true)
 |-- noMbTransferOutQ2: long (nullable = true)
 |-- noMbTransferOutQ3: long (nullable = true)
 |-- noMbTransferOutQ4: long (nullable = true)
 |--  noMbTransferInQ1: long (nullable = true)
 |--  noMbTransferInQ2: long (nullable = true)
 |--  noMbTransferInQ3: long (nullable = true)
 |--  noMbTransferInQ4: long (nullable = true)
 |--  noMbTransferOutUniqueQ1: long (nullable = true)
 |--  noMbTransferOutUniqueQ2: long (nullable = true)
 |--  noMbTransferOutUniqueQ3: long (nullable = true)
 |--  noMbTransferOutUniqueQ4: long (nullable = true)
 |--  noMbTransferInUniqueQ1: long (nullable = true)
 |--  noMbTransferInUniqueQ2: long (nullable = true)
 |--  noMbTransferInUniqueQ3: long (nullable = true)
 |--  noMbTransferInUniqueQ4: long (nullable = true)

