In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

# Show up to 999 columns/rows when a DataFrame is displayed.
# The fully qualified "display.*" keys are used because the short forms
# ("max_columns", "max_rows") are resolved by pattern matching and raise
# OptionError on pandas versions where "styler.render.max_*" also exists.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

In [6]:
# Load the four input tables, in this order, with pandas' Python parsing engine.
# NOTE(review): labeled_accounts is loaded here but not referenced in the
# cells shown in this notebook section.
Player_information, Player_action, Group_activity, labeled_accounts = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        'Player_info_added_perday.csv',
        'Player_action_added_perday.csv',
        'Group_activity.csv',
        'labeled_accounts.csv',
    )
)

# Merge tables // get train and validation sets (only the labeled accounts)

In [7]:
# Outer-join the three per-account tables on actor_account, so an account
# that appears in any one of them is kept in the result.
final = (
    Player_information
    .merge(Player_action, how='outer', on='actor_account')
    .merge(Group_activity, how='outer', on='actor_account')
)

In [8]:
# Replace every missing value in the merged frame with 0, then show the
# frame's dimensions and its first rows.
final = final.fillna(value=0)
print(final.shape)
final.head(n=5)

(21973, 31)


Unnamed: 0,actor_account,login_count,logout_count,login_day_count,play_time,avg_money,ip_count,max_level,playtime_per_day,sit_count,exp_get_amout,item_get_count,exp_repair_count,money_get_count,abyss,use_portal_count,killed_by_pc,killed_by_npc,teleport_count,reborn_count,question_count,login_total_day,sit_count_perday,item_get_count_perday,exp_repair_count_perday,money_get_count_perday,use_portal_count_perday,teleport_count_perday,total_party_time,guild_join_count,average_party_time
0,20000000,25,24,1,216416,-31.818904,25,36,24046.22222,489,77894680,2712,0,499,0,0,11,83,194,72,86,9,54.333333,301.333333,0.0,55.444444,0.0,21.555556,47052.0,1.0,2767.764706
1,20000005,259,259,1,637926,124.849465,259,0,42528.4,270,389149580,31149,0,4686,0,0,47,30,376,51,62,15,18.0,2076.6,0.0,312.4,0.0,25.066667,8862.0,0.0,1477.0
2,20000011,13,12,1,451598,0.537808,13,22,75266.33333,66,7602992,31446,0,281,0,0,5,14,57,18,6,6,11.0,5241.0,0.0,46.833333,0.0,9.5,0.0,0.0,0.0
3,20000016,38,37,1,401885,43.509962,38,42,28706.07143,153,80119427,61579,0,1596,0,0,32,75,343,75,5893,14,10.928571,4398.5,0.0,114.0,0.0,24.5,69458.0,0.0,3655.684211
4,20000025,222,221,1,424827,82.621717,222,0,28321.8,552,322403224,29319,0,2415,0,8,214,111,776,241,131,15,36.8,1954.6,0.0,161.0,0.533333,51.733333,37560.0,0.0,4173.333333


In [9]:
# Working copy of the merged frame, so `final` itself is never modified.
# The EDA cell below re-creates model_data from `final` as well.
model_data = final.copy(deep=True)

## Simple EDA

In [10]:
# Start from a fresh frame derived from `final` (drop() returns a new object,
# so `final` is left untouched). login_count and logout_count are virtually
# the same, so logout_count is removed.
model_data = final.drop(columns=['logout_count'])

# Report how many cells are missing. isna() is required here: an equality
# test against np.nan never matches, because NaN is not equal to itself.
print('NaN cells:', int(model_data.isna().sum().sum()))

# There are inf values. Replace +inf (and any NaN) in each numeric column
# with that column's largest finite value. Series.max() skips NaN, whereas
# the builtin max() returns a result that depends on where the NaN sits in
# the column (and yields NaN when it comes first).
for column in model_data.select_dtypes(include=[np.number]).columns:
    finite_or_nan = model_data[column].replace(np.inf, np.nan)
    model_data[column] = finite_or_nan.fillna(finite_or_nan.max())

In [11]:
# ip_count and login_count appear to carry the same values, so one of them
# is dropped. Print the distinct ip_count / login_count ratios so the
# evidence is visible in the cell output (a bare expression that is not the
# last line of a cell is silently discarded). Rows with login_count == 0
# show up as nan or inf in the ratio.
print(np.unique(model_data['ip_count'] / model_data['login_count']))
del model_data['ip_count']

In [12]:
# login_day_count is almost constant: print its distinct values and the
# number of non-zero rows (21959 of 21973 rows in the recorded run), then
# drop the column.
print(
    np.unique(model_data['login_day_count']),
    np.count_nonzero(model_data['login_day_count']),
)
model_data = model_data.drop(columns=['login_day_count'])

[0. 1.] 21959


# Create new columns

In [14]:
# sit_count per unit of play_time (the author's proxy for how patient the
# player is). Ratios that come out as +inf or NaN, e.g. from a zero
# play_time, are replaced by 0.
m = model_data['sit_count'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['sit_count/play_time'] = m

In [15]:
# killed_by_pc per unit of play_time. Author's hypothesis: the less often an
# account is killed by a player, the more likely it is a bot.
# Ratios that come out as +inf or NaN are replaced by 0.
m = model_data['killed_by_pc'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['killed_by_pc/play_time'] = m

In [16]:
# killed_by_npc per unit of play_time (same idea as the killed_by_pc ratio).
# Ratios that come out as +inf or NaN are replaced by 0.
m = model_data['killed_by_npc'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['killed_by_npc/play_time'] = m

In [17]:
# total_party_time relative to play_time (the author's proxy for how
# organized the player is). Ratios that come out as +inf or NaN are
# replaced by 0. The column key keeps its 'total-party_time' spelling
# because the feature-selection cell refers to it by exactly that name.
m = model_data['total_party_time'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['total-party_time/play_time'] = m

In [28]:
# teleport_count per unit of play_time (the author's proxy for how impatient
# the player is). Ratios that come out as +inf or NaN are replaced by 0.
m = model_data['teleport_count'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['teleport_count/play_time'] = m

In [18]:
# play_time per login (the author's proxy for how often the player takes a
# real rest). Ratios that come out as +inf or NaN, e.g. from a zero
# login_count, are replaced by 0.
m = model_data['play_time'].div(model_data['login_count'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['play_time/login_count'] = m

In [19]:
# exp_get_amout per unit of play_time (the author's proxy for how efficient
# the player is). Ratios that come out as +inf or NaN are replaced by 0.
# The source column is spelled 'exp_get_amout'; the new column uses 'amount'.
m = model_data['exp_get_amout'].div(model_data['play_time'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['exp_get_amount/play_time'] = m

In [20]:
# playtime_per_day divided by login_total_day. Ratios that come out as +inf
# or NaN are replaced by 0.
# NOTE(review): the original label was "How efficient the player is", which
# looks copy-pasted from the neighbouring cells - confirm the intended meaning.
m = model_data['playtime_per_day'].div(model_data['login_total_day'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['playtime_per_day/login_total_day'] = m

In [21]:
# max_level per login (labelled by the author as a measure of how efficient
# the player is). Ratios that come out as +inf or NaN, e.g. from a zero
# login_count, are replaced by 0.
m = model_data['max_level'].div(model_data['login_count'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['max_level/login_count'] = m

In [22]:
# reborn_count per login. Ratios that come out as +inf or NaN are replaced
# by 0.
# NOTE(review): the original label was "How efficient the player is" -
# confirm that this is the intended reading of reborn_count / login_count.
m = model_data['reborn_count'].div(model_data['login_count'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['reborn_count/login_count'] = m

In [23]:
# sit_count relative to max_level. Author's hypothesis: humans do not sit a
# lot once their level is high. Ratios that come out as +inf or NaN (rows
# with max_level == 0) are replaced by 0.
m = model_data['sit_count'].div(model_data['max_level'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['sit_count/max_level'] = m

In [24]:
# play_time per question_count. Author's hypothesis: bots do not finish
# quests. Ratios that come out as +inf or NaN are replaced by 0.
# NOTE(review): an account with question_count == 0 gets the value 0, the
# same end of the scale as an account that needs almost no play time per
# quest - confirm this encoding is what the model should see.
m = model_data['play_time'].div(model_data['question_count'])
m = m.mask((m == np.inf) | m.isna(), 0)
model_data['play_time/question_count'] = m

In [25]:
# Preview the first five rows, including the newly added ratio columns.
model_data.head(n=5)

Unnamed: 0,actor_account,login_count,play_time,avg_money,max_level,playtime_per_day,sit_count,exp_get_amout,item_get_count,exp_repair_count,money_get_count,abyss,use_portal_count,killed_by_pc,killed_by_npc,teleport_count,reborn_count,question_count,login_total_day,sit_count_perday,item_get_count_perday,exp_repair_count_perday,money_get_count_perday,use_portal_count_perday,teleport_count_perday,total_party_time,guild_join_count,average_party_time,sit_count/play_time,killed_by_pc/play_time,killed_by_npc/play_time,total-party_time/play_time,play_time/login_count,exp_get_amount/play_time,playtime_per_day/login_total_day,max_level/login_count,reborn_count/login_count,sit_count/max_level,play_time/question_count
0,20000000.0,25.0,216416.0,-31.818904,36.0,24046.22222,489.0,77894680.0,2712.0,0.0,499.0,0.0,0.0,11.0,83.0,194.0,72.0,86.0,9.0,54.333333,301.333333,0.0,55.444444,0.0,21.555556,47052.0,1.0,2767.764706,0.00226,5.1e-05,0.000384,0.217415,8656.64,359.930319,2671.802469,1.44,2.88,13.583333,2516.465116
1,20000005.0,259.0,637926.0,124.849465,0.0,42528.4,270.0,389149580.0,31149.0,0.0,4686.0,0.0,0.0,47.0,30.0,376.0,51.0,62.0,15.0,18.0,2076.6,0.0,312.4,0.0,25.066667,8862.0,0.0,1477.0,0.000423,7.4e-05,4.7e-05,0.013892,2463.034749,610.023075,2835.226667,0.0,0.196911,0.0,10289.129032
2,20000011.0,13.0,451598.0,0.537808,22.0,75266.33333,66.0,7602992.0,31446.0,0.0,281.0,0.0,0.0,5.0,14.0,57.0,18.0,6.0,6.0,11.0,5241.0,0.0,46.833333,0.0,9.5,0.0,0.0,0.0,0.000146,1.1e-05,3.1e-05,0.0,34738.307692,16.835752,12544.388888,1.692308,1.384615,3.0,75266.333333
3,20000016.0,38.0,401885.0,43.509962,42.0,28706.07143,153.0,80119427.0,61579.0,0.0,1596.0,0.0,0.0,32.0,75.0,343.0,75.0,5893.0,14.0,10.928571,4398.5,0.0,114.0,0.0,24.5,69458.0,0.0,3655.684211,0.000381,8e-05,0.000187,0.172831,10575.921053,199.359088,2050.433674,1.105263,1.973684,3.642857,68.197013
4,20000025.0,222.0,424827.0,82.621717,0.0,28321.8,552.0,322403224.0,29319.0,0.0,2415.0,0.0,8.0,214.0,111.0,776.0,241.0,131.0,15.0,36.8,1954.6,0.0,161.0,0.533333,51.733333,37560.0,0.0,4173.333333,0.001299,0.000504,0.000261,0.088412,1913.635135,758.90474,1888.12,0.0,1.085586,0.0,3242.954198


# Creating Final Training Data

In [26]:
# Start with an empty DataFrame instance. Without the parentheses the name
# would be bound to the DataFrame class itself rather than to a frame.
final_data = pd.DataFrame()

In [29]:
# Feature columns kept after inspecting each column by hand.
# NOTE(review): 'login_total_day' is listed twice, so the resulting frame
# (and the exported CSV) contains that column twice. It is kept as-is here
# because removing it changes the column count seen by downstream consumers;
# confirm whether a different column was intended.
final_data = model_data.loc[:, [
    # raw counters
    'login_count',
    'play_time',
    'max_level',
    "playtime_per_day",
    'abyss',
    'sit_count',
    'exp_get_amout',
    'money_get_count',
    'teleport_count',
    'killed_by_pc',
    'killed_by_npc',
    'reborn_count',
    'login_total_day',
    # per-day aggregates
    'sit_count_perday',
    'login_total_day',
    'item_get_count_perday',
    'money_get_count_perday',
    'use_portal_count_perday',
    'teleport_count_perday',
    'total_party_time',
    # engineered ratios
    'sit_count/play_time',
    'killed_by_pc/play_time',
    'killed_by_npc/play_time',
    'total-party_time/play_time',
    'teleport_count/play_time',
    'play_time/login_count',
    'exp_get_amount/play_time',
    'playtime_per_day/login_total_day',
    'max_level/login_count',
    'reborn_count/login_count',
    'sit_count/max_level',
    'play_time/question_count',
]]

In [30]:
# Write the selected features to disk. The DataFrame index is written too
# (to_csv default), as the first, unnamed CSV column.
final_data.to_csv(path_or_buf='dataset_for_creating_labels_for_submission.csv')