# EDA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import datetime

Data can be accessed from this Google Drive link. The "play log" contains the records of users' active sessions. There are 9 mini-games inside the app "Lengbear", a social casino game made and published exclusively for Cambodia. The transaction log contains the records of users' in-app purchases.

In [2]:
# Column names supplied by the data owner. They are passed as `names=` when the
# raw CSV exports are read further down.

# Play log columns.
columns_play_log = (
    'Sequence', 'UserID', 'GameID', 'Level',
    'WinNo', 'DrawNo', 'LostNo',
    'WinAmt', 'LostAmt',
    'Date', 'Currency_Type1', 'Currency_Type2',
)

# Transaction log columns.
columns_transactions = ('UserID', 'Amount', 'Chips', 'Date', 'Channel')

In [3]:
# Lookup table GameID -> Match_Length (data provided by the game owner).
#
# Owner's note, for reference:
#   BINH            9001: 80 seconds
#   POKDENG         9002: 80 seconds
#   BAUCUA          9005: 20 seconds
#   SHOW            9004: 80 seconds
#   XOCDIA          9003: 20 seconds
#   SIKU            9006: 80 seconds
#   SLOT_20_LINE    9008: 20 seconds
#   SLOT_100_LINE   1008: 20 seconds
#   TIENLEN_CAM     9009: 120 seconds
#
# NOTE(review): the note above says 80 seconds for BINH, POKDENG, SHOW and SIKU,
# but the table below stores 75 for those four games - confirm which is intended.

Game_Code_Length = pd.DataFrame(
    [
        (9001, 75),    # BINH
        (9002, 75),    # POKDENG
        (9005, 20),    # BAUCUA
        (9004, 75),    # SHOW
        (9003, 20),    # XOCDIA
        (9006, 75),    # SIKU
        (9008, 20),    # SLOT_20_LINE
        (1008, 20),    # SLOT_100_LINE
        (9009, 120),   # TIENLEN_CAM
    ],
    columns=['GameID', 'Match_Length'],
)
Game_Code_Length

Unnamed: 0,GameID,Match_Length
0,9001,75
1,9002,75
2,9005,20
3,9004,75
4,9003,20
5,9006,75
6,9008,20
7,1008,20
8,9009,120


In [4]:
# Read the raw play log with the owner-supplied column names (no row of the file
# is treated as a header), then count the missing values in each column.

play_log_df = pd.read_csv('data/PlayerLog.csv', header=None, names=columns_play_log)
play_log_df.isnull().sum()

Sequence          0
UserID            0
GameID            0
Level             0
WinNo             0
DrawNo            0
LostNo            0
WinAmt            0
LostAmt           0
Date              0
Currency_Type1    0
Currency_Type2    0
dtype: int64

In [5]:
# Read the raw transaction log with the owner-supplied column names (no row of
# the file is treated as a header), then count the missing values in each column.

transactions_df = pd.read_csv('data/TransactionLog.csv', header=None, names=columns_transactions)
transactions_df.isnull().sum()

UserID     0
Amount     0
Chips      0
Date       0
Channel    0
dtype: int64

In [None]:
# Keep only the first 10 characters of each transaction's 'Date' value (the
# calendar-date part) and parse them to datetime64.
#
# `.astype(str)` makes the cell safe to re-run: once 'Date' has been converted
# to datetime64 the `.str` accessor is no longer available and the original
# expression raised AttributeError on a second execution.

transactions_df['Date']=pd.to_datetime(transactions_df['Date'].astype(str).str[:10])

In [None]:
# Remove redundant columns (suggested by data owner) and reduce 'Date' to a
# calendar date parsed as datetime64.
#
# Written so the cell can be executed more than once:
#   * `errors='ignore'` - the columns are already gone on a re-run, where the
#     original in-place drop raised KeyError;
#   * `.astype(str)`    - 'Date' is already datetime64 on a re-run, where the
#     `.str` accessor alone raised AttributeError.

play_log_df=play_log_df.drop(columns=['Sequence','Currency_Type1','Currency_Type2'], errors='ignore')
play_log_df['Date']=pd.to_datetime(play_log_df['Date'].astype(str).str[:10])
play_log_df.head()

In [None]:
# Attach each row's Match_Length (looked up by GameID) so the active session
# time of a user can be computed.
#
# * Any existing 'Match_Length' column is dropped first, so re-running the cell
#   does not produce 'Match_Length_x' / 'Match_Length_y' duplicates.
# * `validate='many_to_one'` makes the merge fail loudly if Game_Code_Length
#   ever contains a GameID twice, instead of silently duplicating play-log rows.
# * The join is a left join: rows whose GameID is not in Game_Code_Length are
#   kept and get NaN as Match_Length.

play_log_df=(
    play_log_df
    .drop(columns=['Match_Length'], errors='ignore')
    .merge(Game_Code_Length, on='GameID', how='left', validate='many_to_one')
)
play_log_df.head()

In [None]:
# Games_Played: wins + draws + losses recorded in each log row.
# Active_Time:  Games_Played multiplied by the Match_Length of the row's game.

play_log_df['Games_Played'] = (
    play_log_df['WinNo']
    .add(play_log_df['DrawNo'])
    .add(play_log_df['LostNo'])
)
play_log_df['Active_Time'] = play_log_df['Games_Played'].mul(play_log_df['Match_Length'])
play_log_df

In [None]:
# Candidate start dates: the earliest date in the play log and the eight days after it.

t0 = play_log_df['Date'].min()
t_value = [t0 + datetime.timedelta(days=offset) for offset in range(9)]

print(f't0 can start from: {t_value[0]}, to: {t_value[-1]}, pass t_value[number] to function get_data to get the desired player data')

In [None]:
def get_data(t0, play_log=None, transactions=None, t0_users_only=False):
    """Build the per-user feature table and churn label for the window starting at t0.

    The observation window is t0 .. t4 (t4 = t0 + 3 days, both ends inclusive)
    and the churn date is tchurn = t0 + 4 days.

    Parameters
    ----------
    t0 : datetime-like
        First day of the observation window; must be comparable with the
        'Date' columns and support `+ datetime.timedelta`.
    play_log : pandas.DataFrame, optional
        Play log with columns 'UserID', 'Date', 'WinNo', 'DrawNo', 'LostNo',
        'WinAmt', 'LostAmt', 'Games_Played' and 'Active_Time'. Defaults to the
        notebook-level `play_log_df`.
    transactions : pandas.DataFrame, optional
        Transaction log with columns 'UserID', 'Date' and 'Amount'. Defaults to
        the notebook-level `transactions_df`.
    t0_users_only : bool, default False
        When True, keep only users that have at least one play-log row dated
        exactly t0. When False, every user with a play-log row inside the
        window is kept.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns, in this order: 'UserID', 'WinNo',
        'DrawNo', 'LostNo', 'WinAmt', 'LostAmt', 'Games_Played', 'Active_Time',
        'Amount', 'WinningRate', 'DrawRate', 'LosingRate', 'Churn'.
        'Amount' is 0 for users without a transaction in the window. The three
        rate columns are NaN for a user whose 'Games_Played' total is 0.
        'Churn' is 1 when the user has no play-log row dated tchurn, else 0.
        'Churn' is always the last column.
    """
    # Fall back to the notebook-level frames so existing `get_data(t0)` calls keep working.
    if play_log is None:
        play_log=play_log_df
    if transactions is None:
        transactions=transactions_df

    # Derive t4 and tchurn from t0
    t4=t0+datetime.timedelta(days=3)
    tchurn=t0+datetime.timedelta(days=4)
    print(f't0={t0} \nt4={t4} \ntchurn={tchurn}')

    # Slice out the play log rows from t0 -> t4
    play_log_t0_t4=play_log.loc[(play_log['Date']>=t0) & (play_log['Date']<=t4)]

    # Optionally restrict the window to the users that were active on t0 itself
    if t0_users_only:
        t0_users=play_log.loc[play_log['Date']==t0, 'UserID'].unique()
        play_log_t0_t4=play_log_t0_t4.loc[play_log_t0_t4['UserID'].isin(t0_users)]

    # Per-user totals over the window. The columns are selected BEFORE summing so
    # that non-numeric columns such as 'Date' are never passed to sum(), and one
    # groupby replaces seven separate groupby + merge steps.
    summed_columns=['WinNo', 'DrawNo', 'LostNo', 'WinAmt', 'LostAmt', 'Games_Played', 'Active_Time']
    play_log_summary=play_log_t0_t4.groupby('UserID', as_index=False)[summed_columns].sum()

    # Sum in-game purchases per user during t0 -> t4 ('UserID' stays a regular column)
    transactions_t0_t4=transactions.loc[(transactions['Date']>=t0) & (transactions['Date']<=t4)]
    amount_by_user=transactions_t0_t4.groupby('UserID', as_index=False)['Amount'].sum()

    # Merge transaction totals into the play log summary; users without purchases get Amount=0
    play_log_summary=play_log_summary.merge(amount_by_user, on='UserID', how='left')
    play_log_summary=play_log_summary.fillna(0)

    # Winning / drawing / losing rates per user (0/0 yields NaN for users with no games)
    play_log_summary['WinningRate']=play_log_summary['WinNo']/play_log_summary['Games_Played']
    play_log_summary['DrawRate']=play_log_summary['DrawNo']/play_log_summary['Games_Played']
    play_log_summary['LosingRate']=play_log_summary['LostNo']/play_log_summary['Games_Played']

    # Target. NOTE(review): the original referenced an undefined `play_log_tchurn`;
    # it is defined here as the rows dated exactly tchurn, by analogy with how the
    # t0 rows were selected - confirm this matches the intended churn definition.
    play_log_tchurn=play_log.loc[play_log['Date']==tchurn]

    # The membership test is negated so that Churn=1 means "did NOT play on tchurn";
    # without the negation the label would be 1 for users who came back.
    played_on_tchurn=play_log_summary['UserID'].isin(play_log_tchurn['UserID'])
    final_play_log=play_log_summary.assign(Churn=(~played_on_tchurn).astype(int))

    return final_play_log

In [None]:
# Keep the table returned by get_data: the cells below read it as `final_play_log`,
# which was never defined when the result of this call was discarded.
final_play_log=get_data(t_value[8])
final_play_log

In [None]:
# Feature matrix: every column of final_play_log except the last one (the
# target) and the 'UserID' identifier; any NaN left in the features becomes 0.
X = (
    final_play_log
    .iloc[:, :-1]
    .drop(columns=['UserID'])
    .fillna(0)
)

In [None]:
# Target vector: the last column of final_play_log (get_data appends 'Churn' last).
y=final_play_log.iloc[:,-1]

In [None]:
# Class balance of the target: number of rows per label value.
y.value_counts()

In [None]:
# Hold out 20% of the rows as the test set and keep 80% for training.
# The split is stratified on y and uses a fixed seed.

X_remainder, X_test, y_remainder, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# 1 + 2. Instantiate the logistic regression and fit it on the training portion
# in a single chained call.
churn_logistic_regression = LogisticRegression(max_iter=10000, random_state=42).fit(X_remainder, y_remainder)

# Show the fitted estimator as the cell output.
churn_logistic_regression

In [None]:
# 3. Score model on the training and test portions (one print call, one line each).
print(
    f'Score on train: {churn_logistic_regression.score(X_remainder, y_remainder)}',
    f'Score on test: {churn_logistic_regression.score(X_test, y_test)}',
    sep='\n',
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed, fitted on the training portion.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_remainder, y_remainder)

# Scores on the training portion, then on the test portion (one per line).
print(DT.score(X_remainder, y_remainder), DT.score(X_test, y_test), sep='\n')

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares regression fitted on the training portion.
churn_linear_regression = LinearRegression().fit(X_remainder, y_remainder)

# Show the fitted estimator as the cell output.
churn_linear_regression

In [None]:
# Scores of the linear regression on the training and test portions.
# NOTE(review): for LinearRegression, `.score` is the R^2 coefficient of
# determination, so these numbers are not comparable with the classifiers'
# accuracy scores printed above.
print(
    f'Score on train: {churn_linear_regression.score(X_remainder, y_remainder)}',
    f'Score on test: {churn_linear_regression.score(X_test, y_test)}',
    sep='\n',
)