# EDA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import datetime

In [2]:
# Header names for the two raw CSV exports read below. Both are passed to
# pd.read_csv via `names=`, so the order here must match the column order
# in the files.

columns_play_log = (
    'Sequence', 'UserID', 'GameID', 'Level',
    'WinNo', 'DrawNo', 'LostNo',
    'WinAmt', 'LostAmt',
    'Date', 'Currency_Type1', 'Currency_Type2',
)
columns_transactions = (
    'UserID', 'Amount', 'Chips', 'Date', 'Channel',
)

In [3]:
# Read the raw play log. The file is parsed as header-less (header=None) and
# the column names come from columns_play_log.

play_log_df = pd.read_csv(
    'data/PlayerDetails.csv',
    header=None,
    names=columns_play_log,
)
play_log_df.shape

(1768640, 12)

In [4]:
# Remove the columns that are not used later and reduce the raw timestamp
# string to a calendar date.
#
# The cell is written to be safely re-runnable:
#   * drop(..., errors='ignore') does not raise if the columns are already gone
#     (the previous in-place drop raised KeyError on a second execution);
#   * the Date conversion is skipped when the column is already datetime64
#     (the `.str` accessor fails on a datetime column).
play_log_df = play_log_df.drop(
    columns=['Sequence', 'Currency_Type1', 'Currency_Type2'],
    errors='ignore',
)
if not pd.api.types.is_datetime64_any_dtype(play_log_df['Date']):
    # Only the first 10 characters of the raw value are kept before parsing
    # (the date part, judging by the displayed output).
    play_log_df['Date'] = pd.to_datetime(play_log_df['Date'].str[:10])
play_log_df.head()

Unnamed: 0,UserID,GameID,Level,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Date
0,5894561,9009,1000,0,0,1,0,24964,2020-05-09
1,6047531,9009,20,2,0,0,515,0,2020-05-09
2,4972744,9009,100,1,0,3,1748,7800,2020-05-09
3,5608136,9009,1000,9,0,15,128680,137000,2020-05-09
4,4725768,9009,100,10,0,34,18272,27100,2020-05-09


In [15]:
# Number of missing values in every column of the play log.
# NOTE(review): the saved output of this cell lists Match_Length, Games_Played
# and Active_Time, which are only created in later cells - it was last run out
# of order. On a fresh Restart & Run All those three rows will not appear here.
play_log_df.isnull().sum()

UserID          0
GameID          0
Level           0
WinNo           0
DrawNo          0
LostNo          0
WinAmt          0
LostAmt         0
Date            0
Match_Length    0
Games_Played    0
Active_Time     0
dtype: int64

In [5]:
# Lookup table GameID -> Match_Length (seconds per match), as provided by the
# game owner. Entries are kept in the order they were supplied.

match_length_by_game = [
    (9001, 80),   # BINH
    (9002, 80),   # POKDENG
    (9005, 20),   # BAUCUA
    (9004, 80),   # SHOW
    (9003, 20),   # XOCDIA
    (9006, 80),   # SIKU
    (9008, 20),   # SLOT_20_LINE
    (1008, 20),   # SLOT_100_LINE
    (9009, 120),  # TIENLEN_CAM
]

Game_Code_Length = pd.DataFrame(match_length_by_game, columns=['GameID', 'Match_Length'])
Game_Code_Length

Unnamed: 0,GameID,Match_Length
0,9001,80
1,9002,80
2,9005,20
3,9004,80
4,9003,20
5,9006,80
6,9008,20
7,1008,20
8,9009,120


In [6]:
# Attach Match_Length (seconds per match) to every play-log row so the active
# play time can be computed in the next step.
#
# * Any Match_Length column left over from a previous execution is dropped
#   first; otherwise re-running this cell would yield Match_Length_x /
#   Match_Length_y and the later play_log_df['Match_Length'] lookup would fail.
# * validate='many_to_one' makes pandas raise if Game_Code_Length ever contains
#   a duplicated GameID, which would silently multiply play-log rows.
# * how='left' keeps every play-log row; a GameID missing from the lookup gets
#   NaN in Match_Length.

play_log_df = (
    play_log_df
    .drop(columns='Match_Length', errors='ignore')
    .merge(Game_Code_Length, on='GameID', how='left', validate='many_to_one')
)
play_log_df.head()

Unnamed: 0,UserID,GameID,Level,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Date,Match_Length
0,5894561,9009,1000,0,0,1,0,24964,2020-05-09,120
1,6047531,9009,20,2,0,0,515,0,2020-05-09,120
2,4972744,9009,100,1,0,3,1748,7800,2020-05-09,120
3,5608136,9009,1000,9,0,15,128680,137000,2020-05-09,120
4,4725768,9009,100,10,0,34,18272,27100,2020-05-09,120


In [7]:
# Matches per log row = wins + draws + losses.
# Active play time per row = matches * Match_Length (seconds per match).

matches_in_row = play_log_df['WinNo'].add(play_log_df['DrawNo']).add(play_log_df['LostNo'])
play_log_df['Games_Played'] = matches_in_row
play_log_df['Active_Time'] = play_log_df['Games_Played'].mul(play_log_df['Match_Length'])
play_log_df

Unnamed: 0,UserID,GameID,Level,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Date,Match_Length,Games_Played,Active_Time
0,5894561,9009,1000,0,0,1,0,24964,2020-05-09,120,1,120
1,6047531,9009,20,2,0,0,515,0,2020-05-09,120,2,240
2,4972744,9009,100,1,0,3,1748,7800,2020-05-09,120,4,480
3,5608136,9009,1000,9,0,15,128680,137000,2020-05-09,120,24,2880
4,4725768,9009,100,10,0,34,18272,27100,2020-05-09,120,44,5280
...,...,...,...,...,...,...,...,...,...,...,...,...
1768635,4969001,9009,20,2,0,0,1435,0,2020-05-10,120,2,240
1768636,812253,9009,5000,0,0,3,0,335000,2020-05-10,120,3,360
1768637,6037451,9009,20,0,0,1,0,294,2020-05-10,120,1,120
1768638,4449836,9009,5000,0,0,1,0,135000,2020-05-10,120,1,120


In [8]:
# Build the study cohort: t0 is the earliest date in the play log, play_log_t0
# holds the rows dated exactly t0, and play_log_t0_user lists each distinct
# UserID that appears in those rows.

t0 = play_log_df['Date'].min()
play_log_t0 = play_log_df.loc[play_log_df['Date'].eq(t0)]
play_log_t0_user = pd.DataFrame({'UserID': play_log_t0['UserID'].unique()})
play_log_t0_user

Unnamed: 0,UserID
0,4775802
1,5929933
2,5728479
3,5725542
4,5942114
...,...
56158,5155914
56159,5930800
56160,4858147
56161,5345776


In [9]:
# Observation window: all play-log rows dated from t0 up to and including
# t0 + 3 days, i.e. the first four calendar days. Despite the `t0_t4` name the
# last day kept is t0 + 3 (the displayed maximum date confirms this).
observation_end = t0 + datetime.timedelta(days=3)
play_log_t0_t4 = play_log_df.loc[play_log_df['Date'].le(observation_end)]
play_log_t0_t4['Date'].max()

Timestamp('2020-05-04 00:00:00')

In [10]:
# Per-user totals over the observation window (play_log_t0_t4): number of
# wins / draws / losses, amounts won / lost, matches played and active time.
#
# Each column is selected *before* summing. The previous form,
# groupby(...).sum()[col], summed every column of the frame seven times over,
# including the datetime `Date` column, which raises TypeError on pandas >= 2.0
# (non-numeric columns are no longer silently dropped).
# Every result is a one-column DataFrame indexed by UserID, as the merge in
# the next cell expects.
per_user = play_log_t0_t4.groupby('UserID')
WinNo = per_user['WinNo'].sum().to_frame()
DrawNo = per_user['DrawNo'].sum().to_frame()
LostNo = per_user['LostNo'].sum().to_frame()
WinAmt = per_user['WinAmt'].sum().to_frame()
LostAmt = per_user['LostAmt'].sum().to_frame()
Games_Played = per_user['Games_Played'].sum().to_frame()
Active_Time = per_user['Active_Time'].sum().to_frame()

In [11]:
# Combine the seven per-user totals into one frame by left-merging each of
# them onto WinNo via the UserID index level, then set the column labels.
play_log_summary = WinNo
for per_user_metric in (DrawNo, LostNo, WinAmt, LostAmt, Games_Played, Active_Time):
    play_log_summary = play_log_summary.merge(per_user_metric, on='UserID', how='left')
play_log_summary.columns = [
    'WinNo', 'DrawNo', 'LostNo',
    'WinAmt', 'LostAmt',
    'Games_Played', 'Active_Time',
]
play_log_summary

Unnamed: 0_level_0,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Games_Played,Active_Time
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1980,0,0,3,0,417197,3,60
3452,7,0,12,6800050,8971800,19,1160
3700,73,0,72,18404347,21637393,145,11480
4079,10,0,13,10639687,17156250,23,1840
4732,3,0,12,10726,52726,15,1200
...,...,...,...,...,...,...,...
5986120,4,0,22,8400,18000,26,520
5986122,2,0,1,476,876,3,240
5986131,3,0,6,7500,4590,9,180
5986132,1,0,2,100,130,3,60


In [12]:
# Move UserID out of the index into an ordinary column so it can be used as a
# merge key against the cohort frame.
play_log_summary = play_log_summary.reset_index('UserID')

In [13]:
# Display the per-user summary now that UserID is a regular column.
play_log_summary

Unnamed: 0,UserID,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Games_Played,Active_Time
0,1980,0,0,3,0,417197,3,60
1,3452,7,0,12,6800050,8971800,19,1160
2,3700,73,0,72,18404347,21637393,145,11480
3,4079,10,0,13,10639687,17156250,23,1840
4,4732,3,0,12,10726,52726,15,1200
...,...,...,...,...,...,...,...,...
118805,5986120,4,0,22,8400,18000,26,520
118806,5986122,2,0,1,476,876,3,240
118807,5986131,3,0,6,7500,4590,9,180
118808,5986132,1,0,2,100,130,3,60


In [14]:
# Keep only the day-0 cohort: left-merge the per-user summary onto the list of
# users seen on t0, so there is one row per cohort user.
final_play_log_t0 = pd.merge(
    play_log_t0_user,
    play_log_summary,
    how='left',
    on='UserID',
)
final_play_log_t0

Unnamed: 0,UserID,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Games_Played,Active_Time
0,4775802,121,0,242,1900393,3498025,363,37620
1,5929933,54,0,83,16227,17191,137,15460
2,5728479,22,0,24,26826,38826,46,4760
3,5725542,239,0,442,57603571,68753371,681,81620
4,5942114,67,0,98,852087,745800,165,19400
...,...,...,...,...,...,...,...,...
56158,5155914,19,0,36,35493,23512,55,4400
56159,5930800,1,0,2,2,556,3,240
56160,4858147,4,0,6,7865,18147,10,800
56161,5345776,6,0,8,2830000,5842555,14,1120


In [26]:
# Follow-up day: play-log rows dated exactly t0 + 4 days, i.e. the first day
# after the t0 .. t0+3 observation window.
followup_day = t0 + datetime.timedelta(days=4)
play_log_t5 = play_log_df.loc[play_log_df['Date'].eq(followup_day)]
play_log_t5

Unnamed: 0,UserID,GameID,Level,WinNo,DrawNo,LostNo,WinAmt,LostAmt,Date,Match_Length,Games_Played,Active_Time
6555,5997581,9009,20,3,0,1,1715,120,2020-05-05,120,4,480
6556,5616548,9009,50,1,0,2,1140,2674,2020-05-05,120,3,360
6557,5999708,9009,50,3,0,2,571,150,2020-05-05,120,5,600
6558,5649304,9009,10000,2,0,7,190000,880000,2020-05-05,120,9,1080
6559,5999584,9009,1000,0,0,2,0,49500,2020-05-05,120,2,240
...,...,...,...,...,...,...,...,...,...,...,...,...
1468406,5774751,9008,100,1,0,2,50,180,2020-05-05,20,3,60
1468407,6000107,9008,1000,1,0,2,100,1000,2020-05-05,20,3,60
1468408,6000110,9008,100,2,0,0,960,0,2020-05-05,20,2,40
1468409,6000108,9008,10000,2,0,1,43000,10000,2020-05-05,20,3,60


In [28]:
# Label the cohort: Churn = 1 when the user has NO play-log row on the
# follow-up day (play_log_t5), and 0 when the user came back.
#
# The previous expression stored the raw `isin` result, so users who returned
# were labelled 1 - the opposite of what the column name says.
# The membership test uses this frame's own UserID column, so it does not
# depend on the index lining up with a different frame.
# NOTE(review): any downstream cell that already compensated for the old
# inverted label must be adjusted.
returned_on_followup = final_play_log_t0['UserID'].isin(play_log_t5['UserID'])
final_play_log_t0 = final_play_log_t0.assign(Churn=(~returned_on_followup).astype(int))

In [48]:
# Distinct Level values occurring in the observation window, in order of
# first appearance.
pd.unique(play_log_t0_t4['Level'])

array([      20,      100,     5000,     1000,    20000,      500,
             50,    10000,   100000,       10,    50000,   200000,
              2,      200,  1000000,   500000, 10000000,  2000000,
        5000000])

In [36]:
# Total WinAmt per user on day t0.
# WinAmt is selected before summing: summing the whole frame first also tries
# to sum the datetime `Date` column, which raises TypeError on pandas >= 2.0,
# and needlessly aggregates every other column.
play_log_t0.groupby('UserID')['WinAmt'].sum()

UserID
3452       6757000
3700       6771107
4732         10726
4891          1331
10151            0
            ...   
5944464          0
5944465          0
5944468          0
5944472          0
5944478          0
Name: WinAmt, Length: 56163, dtype: int64

In [18]:
# Sanity check of the date arithmetic: earliest play-log date plus five days.
earliest_play_date = play_log_df['Date'].min()
earliest_play_date + datetime.timedelta(days=5)

Timestamp('2020-05-06 00:00:00')

In [8]:
# Read the raw transactions export. The file is parsed as header-less
# (header=None) and the column names come from columns_transactions.
transactions_df = pd.read_csv(
    'data/TransactionDetail.csv',
    header=None,
    names=columns_transactions,
)

In [9]:
# Keep only the first 10 characters of the Date string. The column stays a
# string here; it is not converted to datetime.
transactions_df['Date'] = transactions_df['Date'].str.slice(stop=10)

In [10]:
# Restrict the transactions to 2020-05-01 .. 2020-05-13, both ends inclusive.
# Date is still a string column, so the bounds are compared as strings.
in_study_period = transactions_df['Date'].between('2020-05-01', '2020-05-13')
transactions_df = transactions_df.loc[in_study_period]
transactions_df.head()

Unnamed: 0,UserID,Amount,Chips,Date,Channel
283749,5827178,1.0,1200000,2020-05-01,DCB_Cellcard_Coda
283750,5710971,0.5,600000,2020-05-01,DCB_Smart_Axiata
283751,3358689,2.0,4100000,2020-05-01,Wing
283752,5595485,0.5,600000,2020-05-01,DCB_Smart_Axiata
283753,2267177,1.0,1200000,2020-05-01,DCB_Smart_Axiata


In [11]:
# Total transaction Amount per (Date, UserID), as a one-column DataFrame.
# Amount is selected before summing, so the non-numeric Channel column and
# the unused Chips column are not aggregated and then thrown away.
transactions_df.groupby(['Date', 'UserID'])['Amount'].sum().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Amount
Date,UserID,Unnamed: 2_level_1
2020-05-01,3700,1.0
2020-05-01,12775,0.5
2020-05-01,808632,0.5
2020-05-01,811137,0.2
2020-05-01,811332,0.2
...,...,...
2020-05-13,6103669,0.2
2020-05-13,6103694,0.2
2020-05-13,6103741,0.5
2020-05-13,6104060,0.5


In [12]:
# Total transaction Amount per Date, as a one-column DataFrame.
# Amount is selected before summing, so the non-numeric Channel column and
# the UserID / Chips columns are not aggregated and then thrown away.
transactions_df.groupby('Date')['Amount'].sum().to_frame()

Unnamed: 0_level_0,Amount
Date,Unnamed: 1_level_1
2020-05-01,4001.6
2020-05-02,4265.7
2020-05-03,4364.9
2020-05-04,4131.3
2020-05-05,4381.8
2020-05-06,5322.5
2020-05-07,3921.1
2020-05-08,4734.0
2020-05-09,4730.2
2020-05-10,4447.3
