In [1]:
import numpy as np
from tqdm import tqdm
import json
import pandas as pd
import os
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
#import lightgbm as lgb

In [2]:
# Load the raw tables and report the shape of each one.
print('Reading train.csv file....')
train = pd.read_csv('data/train.csv')
print('Training.csv file have {} rows and {} columns'.format(*train.shape))

# Optional table, currently not loaded:
# print('Reading test.csv file....')
# test = pd.read_csv('data/test.csv')
# print('Test.csv file have {} rows and {} columns'.format(*test.shape))

print('Reading train_labels.csv file....')
train_labels = pd.read_csv('data/train_labels.csv')
print('Train_labels.csv file have {} rows and {} columns'.format(*train_labels.shape))

# Optional tables, currently not loaded:
# print('Reading specs.csv file....')
# specs = pd.read_csv('data/specs.csv')
# print('Specs.csv file have {} rows and {} columns'.format(*specs.shape))

# print('Reading sample_submission.csv file....')
# sample_submission = pd.read_csv('data/sample_submission.csv')
# print('Sample_submission.csv file have {} rows and {} columns'.format(*sample_submission.shape))

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns


In [3]:
# Preview the first rows of the raw event log loaded from data/train.csv.
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [114]:
# Map each title to the event code used to select its attempt events:
# 4100 for every title, except Bird Measurer which uses 4110.
activities_map = dict(
    zip(
        train['title'].unique(),
        np.full(train['title'].nunique(), 4100, dtype='int'),
    )
)
activities_map.update({'Bird Measurer (Assessment)': 4110})

In [161]:
def feature_engineering(user_sample):
    """Build one feature row per assessment session found in ``user_sample``.

    Sessions are visited in order of first appearance (``groupby`` with
    ``sort=False``). Clip, Activity and Game sessions only feed the running
    counters; every Assessment session emits a row holding its own statistics
    plus the counters accumulated *before* that session.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Event rows with at least the columns ``installation_id``,
        ``game_session``, ``event_count``, ``type``, ``title``, ``world``,
        ``event_data`` and ``game_time``. The caller passes the events of a
        single ``installation_id``; the cumulative features assume the rows
        are in chronological order -- TODO confirm against the data source.

    Returns
    -------
    pandas.DataFrame
        One row per Assessment session that has at least one event. Empty
        when ``user_sample`` contains no such session.

    Raises
    ------
    ValueError
        If a session's ``type`` is not one of ``Assessment``, ``Clip``,
        ``Activity`` or ``Game``.
    """
    output = []
    cum_assessments, cum_activity, cum_clips, cum_games = 0, 0, 0, 0
    cum_corrects, cum_incorrects = 0, 0
    cum_dur_assessment = 0

    # Iterate through each session of one installation_id.
    for session_name, session in user_sample.groupby('game_session', sort=False):

        # Session-level descriptors, read from the first row of the session.
        features = {}
        features['installation_id'] = session['installation_id'].iloc[0]
        features['game_session'] = session['game_session'].iloc[0]
        # event_counter includes all event codes and all types
        features['event_counter'] = session['event_count'].iloc[-1]
        features['type'] = session['type'].iloc[0]
        features['title'] = session['title'].iloc[0]
        features['world'] = session['world'].iloc[0]

        # All events of the session are used. An earlier filter on the
        # attempt event codes (4100 / 4110) was computed and then immediately
        # overwritten, so it never had any effect and has been removed.
        # NOTE(review): confirm whether correct/incorrect counts should be
        # restricted to the attempt event codes instead.
        all_attempts = session
        n_events = len(all_attempts['event_data'])

        if features['type'] == 'Assessment':

            # Events whose payload contains 'true', plus the running total
            # from earlier assessment sessions.
            features['cum_corrects'] = cum_corrects
            features['correct'] = all_attempts['event_data'].str.contains(
                'true').sum()
            cum_corrects += features['correct']

            # Events whose payload contains 'false', plus the running total
            # from earlier assessment sessions.
            features['cum_incorrect'] = cum_incorrects
            features['incorrect'] = all_attempts['event_data'].str.contains(
                'false').sum()
            cum_incorrects += features['incorrect']

            # Number of events in this session; because all event codes are
            # considered, this equals event_counter.
            features['cum_assessments'] = cum_assessments
            features['assessments'] = n_events
            cum_assessments += features['assessments']

            # Share of events flagged as correct (None when there are none).
            features['acc'] = features['correct'] / (features['assessments'])\
                if features['assessments'] != 0 else None

            # Bucket the accuracy: 0 -> 0, 1 -> 3, 0.5 -> 2, anything else -> 1.
            if features['acc'] == 0:
                features['acc_group'] = 0
            elif features['acc'] == 1:
                features['acc_group'] = 3
            elif features['acc'] == 0.5:
                features['acc_group'] = 2
            else:
                features['acc_group'] = 1

            # Events seen in clips / activities / games before this assessment.
            features['cum_clips'] = cum_clips
            features['cum_activity'] = cum_activity
            features['cum_games'] = cum_games

            # Gaps between consecutive game_time values within the session.
            features['cum_dur_assessment'] = cum_dur_assessment
            durations = all_attempts['game_time'].diff().dropna()
            nonzero_durations = durations.loc[durations != 0]
            features['assessment_zero_dur'] = int((durations == 0).sum())
            features['assessment_dur_sum'] = durations.sum()
            features['assessment_dur_std'] = durations.std()
            features['assessment_dur_std_nonzero'] = nonzero_durations.std()
            features['assessment_dur_mean_nonzero'] = nonzero_durations.mean()
            cum_dur_assessment += features['assessment_dur_sum']

        elif features['type'] == 'Clip':
            # Running total of clip events.
            cum_clips += n_events
        elif features['type'] == 'Activity':
            # Running total of activity events.
            cum_activity += n_events
        elif features['type'] == 'Game':
            # Running total of game events.
            cum_games += n_events
        else:
            raise ValueError('{} is not within the predefined features'.format(features['type']))

        # Only assessment sessions set 'assessments', so only they are emitted.
        if features.get('assessments', 0) > 0:
            output.append(features)
    return pd.DataFrame(output)

In [162]:
# Spot-check the feature builder on a single installation_id.
# `g` is defined explicitly here so this cell runs on a fresh kernel instead
# of depending on a loop variable left behind by another cell.
groups = train.groupby('installation_id', sort=False)
g = groups.get_group('0006a69f')
feature_engineering(g)

Unnamed: 0,acc,acc_group,assessment_dur_mean_nonzero,assessment_dur_std,assessment_dur_std_nonzero,assessment_dur_sum,assessment_zero_dur,assessments,correct,cum_activity,...,cum_dur_assessment,cum_games,cum_incorrect,event_counter,game_session,incorrect,installation_id,title,type,world
0,0.125,1,1105.638889,1110.335782,1151.535749,39803.0,11,48,6,352,...,0.0,284,0,48,901acc108f55a5a1,1,0006a69f,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1,0.034483,1,1288.875,1002.020175,963.217772,92799.0,14,87,3,556,...,39803.0,525,1,87,77b8ee947eb84b4e,30,0006a69f,Bird Measurer (Assessment),Assessment,TREETOPCITY
2,0.114286,1,1219.409091,1051.951188,1090.479303,26827.0,12,35,4,556,...,132602.0,525,31,35,6bdf9623adc94d89,0,0006a69f,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
3,0.0,0,1255.571429,1118.301314,1446.355058,8789.0,9,17,0,1103,...,159429.0,845,31,17,e7e7db2a241eadcc,0,0006a69f,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
4,0.119048,1,1061.433333,1079.503094,1137.829049,31843.0,11,42,5,1103,...,168218.0,845,31,42,9501794defd84e4d,3,0006a69f,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
5,0.25,1,1581.217391,1342.134149,1334.839757,36368.0,8,32,8,1218,...,200061.0,1111,34,32,a9ef3ecb3d1acc6a,0,0006a69f,Bird Measurer (Assessment),Assessment,TREETOPCITY


In [158]:
# Build the training feature table: run the feature builder once per
# installation_id, collect the per-user frames, and combine them in one step.
# (Appending to a DataFrame inside the loop copies the whole table on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.)
groups = train.groupby('installation_id', sort=False)
user_frames = []
# The group key is the installation_id; the loop variable names are unchanged.
for session_name, g in groups:
    user_features = feature_engineering(g)
    # Users without any assessment session yield an empty frame; skip those.
    if not user_features.empty:
        user_frames.append(user_features)
# Like the original append, concat keeps each per-user frame's own index.
df_train = pd.concat(user_frames) if user_frames else pd.DataFrame()

In [157]:
# Check the size (rows, columns) of the assembled feature table.
df_train.shape

(17690, 23)

In [156]:
# for idx in train_labels.index:
#     game_session = train_labels.iloc[idx]['game_session']
#     installation_id = train_labels.iloc[idx]['installation_id']
#     acc = df_train.loc[(df_train['game_session'] == game_session) &
#                 (df_train['installation_id'] == installation_id)]['acc']
#     acc_2 = train_labels.iloc[idx]['accuracy']
#     acc = np.float(acc)
#     if  acc != acc_2:
#         print(idx, acc, acc_2)