# 最初のデータ探索・加工

In [1]:
import os
import sys
sys.path.append(os.path.abspath(".."))

import os
import re

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## Load Data

In [2]:
def load_dataset(data_dir_path='../data/raw/'):
    print('Reading train.csv file....')
    train = pd.read_csv(data_dir_path + 'train.csv')
    print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

    print('Reading test.csv file....')
    test = pd.read_csv(data_dir_path + 'test.csv')
    print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv(data_dir_path + 'train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    print('Reading specs.csv file....')
    specs = pd.read_csv(data_dir_path + 'specs.csv')
    print('Specs.csv file have {} rows and {} columns'.format(specs.shape[0], specs.shape[1]))

    print('Reading sample_submission.csv file....')
    sample_submission = pd.read_csv(data_dir_path + 'sample_submission.csv')
    print('Sample_submission.csv file have {} rows and {} columns'.format(
        sample_submission.shape[0], sample_submission.shape[1]))

    return train, test, train_labels, specs, sample_submission

In [3]:
raw_train, raw_test, train_labels, specs, sample_submission = load_dataset()

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns
Reading specs.csv file....
Specs.csv file have 386 rows and 3 columns
Reading sample_submission.csv file....
Sample_submission.csv file have 1000 rows and 2 columns


In [4]:
print(raw_train.shape)
raw_train.head()

(11341042, 11)


Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [5]:
raw_test[((raw_test['type']=='Assessment') & (raw_test['event_code']==4100))]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
303,d122731b,8b38fc0d2fd315dc,2019-09-11T18:56:34.661Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",00abaee7,22,4100,22737,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
1561,392e14df,009c890ce6c4f3e3,2019-09-22T21:23:17.737Z,"{""buckets"":[0,0,0],""correct"":false,""buckets_pl...",01242218,15,4100,11474,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
1569,392e14df,009c890ce6c4f3e3,2019-09-22T21:23:26.033Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",01242218,22,4100,19775,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
1961,d122731b,e8e62de939f916bc,2019-09-22T21:39:06.838Z,"{""correct"":true,""left"":[{""id"":""gem07"",""weight""...",01242218,9,4100,8504,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
2333,93b353f2,ab61cae5e3215355,2019-09-22T21:50:51.897Z,"{""correct"":false,""pillars"":[],""event_count"":16...",01242218,16,4100,13935,Chest Sorter (Assessment),Assessment,CRYSTALCAVES
...,...,...,...,...,...,...,...,...,...,...,...
1156280,392e14df,c116d9e6f8cf85c3,2019-09-10T18:55:19.532Z,"{""buckets"":[2,3,1],""correct"":false,""buckets_pl...",ffe00ca8,24,4100,13351,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
1156309,392e14df,c116d9e6f8cf85c3,2019-09-10T18:55:36.816Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",ffe00ca8,53,4100,30620,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
1156329,392e14df,70336ec581799feb,2019-09-10T18:56:10.372Z,"{""buckets"":[3,2,1],""correct"":false,""buckets_pl...",ffe00ca8,5,4100,4701,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
1156352,392e14df,70336ec581799feb,2019-09-10T18:56:26.743Z,"{""buckets"":[0,0,0],""correct"":true,""buckets_pla...",ffe00ca8,27,4100,21068,Cauldron Filler (Assessment),Assessment,MAGMAPEAK


## Transform Data

In [116]:
# 各種イベントデータのリストを取得
all_event_code = np.union1d(raw_train['event_code'].unique(), raw_test['event_code'].unique())
all_title = np.union1d(raw_train['title'].unique(), raw_test['title'].unique())

# Assesmentが含まれないinstallation_idは学習に使えないので除外
keep_ids = raw_train.loc[raw_train['type'] == 'Assessment', 'installation_id'].unique()
train = raw_train[raw_train.installation_id.isin(keep_ids)].reset_index(drop=True)

In [117]:
# temp = train[
#     ((train['title']!='Bird Measurer (Assessment)') & (train['event_code']==4100) & (train['type']=='Assessment')) |
#     ((train['title']=='Bird Measurer (Assessment)') & (train['event_code']==4110) & (train['type']=='Assessment'))]

# temp.drop_duplicates(subset=['game_session', 'installation_id'], keep='first')

In [118]:
print(train.shape)
train.head()

(8294138, 11)


Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,34ba1a28d02ba8ba,2019-08-06T04:57:18.904Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,4b57c9a59474a1b9,2019-08-06T04:57:45.301Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,2b9d5af79bcdb79f,2019-08-06T04:58:14.538Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,2b9d5af79bcdb79f,2019-08-06T04:58:14.615Z,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1325467d,2b9d5af79bcdb79f,2019-08-06T04:58:16.680Z,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [287]:
def calc_avg_accuracy_group(accuracy_group_count):
    total_score = sum([k*v for k, v in accuracy_group_count.items()])
    total_trial_count = sum(accuracy_group_count.values())
    avg_accuracy_group = total_score / total_trial_count if total_trial_count != 0 else 0
    return avg_accuracy_group 


def parse_assessment_session(session, title):
    attempt_code = 4100 if title!='Bird Measurer (Assessment)' else 4110
    attempt_event = session[session['event_code']==attempt_code]
    
    num_correct = attempt_event['event_data'].str.contains('"correct":true').astype(int).sum()
    num_incorrect = attempt_event['event_data'].str.contains('"correct":false').astype(int).sum()
    
    accuracy = num_correct / (num_correct+num_incorrect)
    
    def calc_accuracy_group(accuracy):
        if accuracy == 1.0:
            return 3
        elif accuracy == 1.0 / 2:
            return 2
        elif accuracy <= 1.0 / 3 and accuracy > 0.0:
            return 1
        else:
            return 0
    
    accuracy_group = calc_accuracy_group(accuracy)
    return num_correct, num_incorrect, accuracy, accuracy_group


def parse_session_data(data, is_test_data=False):
    # divise_data is data for each installation_id.
    train_data = []
    # 特定のinstallation_idごとのsessionデータを取得
    for i_id, sessions in tqdm(data.groupby('installation_id', sort=False)):
        total_session_data = {}
        # installation_idごとに特徴量を初期化
        total_session_data['installation_id'] = i_id
        session_type_count = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}
        session_title_count = {title: 0 for title in all_title}
        session_world_count = {'NONE': 0, 'MAGMAPEAK': 0, 'CRYSTALCAVES': 0, 'TREETOPCITY': 0}
        # event_code, type, world, titleごとの各種カウント
        event_code_count =  {code: 0 for code in all_event_code}
        # type, titleごとのセッション時間
        time_spent_each_type = {'time_spent_Clip': 0, 'time_spent_Activity': 0, 'time_spent_Assessment': 0, 'time_spent_Game': 0} # session_type != 'Assessment'としていた。。？
        time_spent_each_title = {f'time_spent_{title}': 0 for title in all_title} # session_type != 'Assessment'としていた。。？
        # カウントデータから算出
        total_event_count = 0 # 合計イベント回数
        total_type_count = 0 # 合計タイプ個数
        total_time_spent = 0 # 合計セッション時間
        # 最後のセッションタイプ
        last_session_type = None 
        last_session_time_msec = None
        # Assesment から得られるデータ
        accuracy_group_count = {f'accuracy_group_{i}':0 for i in range(4)} # 過去の評価値
        total_num_incorrect = 0
        total_num_correct = 0

        for session_id, session in sessions.groupby('game_session', sort=False):
            # セッション情報からデータを取得
            total_session_data['game_session'] = session_id
            session_type = session['type'].unique()[0]
            session_title = session['title'].unique()[0]
            session_world = session['world'].unique()[0]
            event_count_dict = session['event_code'].value_counts().to_dict()
            session_game_time = session['game_time'].iloc[-1]
            # 累積セッションデータを更新
            session_type_count[session_type] += 1
            session_title_count[session_title] += 1
            session_world_count[session_world] += 1
            time_spent_each_type[f'time_spent_{session_type}'] += session_game_time
            time_spent_each_title[f'time_spent_{session_title}'] += session_game_time
            for k, v in event_count_dict.items():
                event_code_count[k] += v

            total_event_count = sum(event_code_count.values())
            total_type_count = sum(time_spent_each_type.values())
            total_time_spent = sum(time_spent_each_title.values())

            if session_type == 'Assessment':
                # get_accuracy_groupの算出
                num_correct, num_incorrect, accuracy, accuracy_group = parse_assessment_session(session, session_title)

                # ラベルデータを追加し、学習データとして保存
                # １度も評価されていないAssessmentデータを除く
                if num_correct+num_incorrect > 0 or is_test_data:
                    train_data.append(dict(total_session_data, **{'accuracy_group': accuracy_group}))

                accuracy_group_count[f'accuracy_group_{accuracy_group}'] += 1
                total_num_incorrect += num_incorrect
                total_num_correct += num_correct

                last_session_type = session_type
                last_session_time_msec = session_game_time

                total_session_data.update({'total_num_incorrect': total_num_incorrect})
                total_session_data.update({'total_num_correct': total_num_correct})
                total_session_data.update({'last_session_type': last_session_type})
                total_session_data.update({'last_session_time_msec': last_session_time_msec})


            # セッションデータの解析
            total_session_data.update(session_type_count)
            total_session_data.update(session_title_count)
            total_session_data.update(session_world_count)
            total_session_data.update(event_code_count)
            total_session_data.update(time_spent_each_type)
            total_session_data.update(time_spent_each_title)

    df = pd.DataFrame(train_data)
    # テストデータの場合、最後の１Assesmentセッションのみを予測データとして扱う。
    if is_test_data:
        return df.groupby(['installation_id']).tail(2).drop_duplicates(['installation_id'], keep='first')
    return df

In [251]:
%%time
df = parse_session_data(train, is_test_data=False)

In [245]:
print(df.shape)
df.head()

(17690, 149)


Unnamed: 0,installation_id,game_session,Clip,Activity,Assessment,Game,12 Monkeys,Air Show,All Star Sorting,Balancing Act,...,time_spent_Tree Top City - Level 1,time_spent_Tree Top City - Level 2,time_spent_Tree Top City - Level 3,time_spent_Watering Hole (Activity),time_spent_Welcome to Lost Lagoon!,accuracy_group,total_num_incorrect,total_num_correct,last_session_type,last_session_time_msec
0,0006a69f,901acc108f55a5a1,11.0,3.0,0.0,4.0,1.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,,,,
1,0006a69f,77b8ee947eb84b4e,14.0,4.0,1.0,6.0,1.0,1.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,Assessment,39803.0
2,0006a69f,6bdf9623adc94d89,14.0,4.0,2.0,6.0,1.0,1.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3,11.0,1.0,Assessment,92799.0
3,0006a69f,9501794defd84e4d,24.0,9.0,4.0,10.0,2.0,1.0,4.0,0.0,...,0.0,0.0,0.0,80243.0,0.0,2,11.0,2.0,Assessment,8789.0
4,0006a69f,a9ef3ecb3d1acc6a,28.0,10.0,5.0,13.0,2.0,2.0,4.0,0.0,...,0.0,0.0,0.0,80243.0,0.0,3,12.0,3.0,Assessment,31843.0


In [None]:
# 解析後の特徴量エンジニアリング。
'''
df.groupby(['installation_id'])['avg_game_time'].transform('mean')
df.groupby(['installation_id'])['session_title'].transform('nunique')
df.groupby(['installation_id'])['total_event_count'].transform('mean')
df.groupby(['installation_id'])['total_session_count'].transform('mean')
df['avg_accuracy'] = df['total_num_correct'] / (df['total_num_correct']+df['total_num_incorrect'])
'''

In [288]:
%%time
test_df = parse_session_data(raw_test, is_test_data=True)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

  from ipykernel import kernelapp as app



CPU times: user 47.5 s, sys: 722 ms, total: 48.2 s
Wall time: 48.6 s


In [289]:
print(test_df.shape)
test_df.head()

(1000, 149)


Unnamed: 0,installation_id,game_session,Clip,Activity,Assessment,Game,12 Monkeys,Air Show,All Star Sorting,Balancing Act,...,time_spent_Tree Top City - Level 1,time_spent_Tree Top City - Level 2,time_spent_Tree Top City - Level 3,time_spent_Watering Hole (Activity),time_spent_Welcome to Lost Lagoon!,accuracy_group,total_num_incorrect,total_num_correct,last_session_type,last_session_time_msec
0,00abaee7,8b38fc0d2fd315dc,13.0,1.0,0.0,2.0,2.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,3,,,,
6,01242218,597a8839a5a3468d,25.0,11.0,4.0,11.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,73643.0,0.0,1,5.0,3.0,Assessment,38746.0
8,017c5718,4b165a330a0bdd6c,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,,,,
9,01a44906,be0b655ad1fee30c,10.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,,,,
10,01bc6cb6,46e8bbed71df7520,17.0,1.0,0.0,6.0,0.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0,,,,
