In [1]:
import pandas as pd
import numpy as np
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.
import time

from sklearn.cross_validation import LabelKFold



In [2]:
# Read the raw CSV inputs: the people table plus the train and test
# activity tables.
ppl = pd.read_csv('../input/people.csv')
TrainActivs = pd.read_csv('../input/act_train.csv')
TestActivs = pd.read_csv('../input/act_test.csv')

# Stack the train rows on top of the test rows so that every later
# transformation is applied to both sets in one pass.
act = pd.concat([TrainActivs, TestActivs])

In [3]:
# Both tables carry a 'date' column; give each its own name
# ('pdate' for people, 'adate' for activities).
ppl = ppl.rename(columns={'date': 'pdate'})
act = act.rename(columns={'date': 'adate'})

In [4]:
# Convert the two date columns to pandas datetimes.
for frame, date_col in ((act, 'adate'), (ppl, 'pdate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [5]:
# Map the activity category labels 'type 1' .. 'type 7' to the integers 1 .. 7.
cat_rename = {'type {0}'.format(n): n for n in range(1, 8)}

act['activity_category'] = act['activity_category'].replace(cat_rename)

In [6]:
# Lookup table from each distinct 'group_1' label to the integer parsed
# from the text after its first six characters.
groups_unique = ppl['group_1'].unique()

g_remap = [[label, int(label[6:])] for label in groups_unique]

df_g_remap = pd.DataFrame(g_remap)
df_g_remap.columns = ['group_1', '_group_1']

In [7]:
# Attach the integer group code to each person; the left join keeps every
# row of ppl.
ppl = ppl.merge(df_g_remap, how='left', on='group_1')

In [8]:
# Discard the string 'group_1' column and let the integer '_group_1'
# column take over its name.
ppl = ppl.drop('group_1', axis=1).rename(columns={'_group_1': 'group_1'})

In [9]:
# For each typed people characteristic, add a '_<name>' column holding the
# integer parsed from the label (text after the first five characters)
# minus one.
keyset_typed = ['char_1', 'char_2', 'char_3', 'char_4', 'char_5',
                'char_6', 'char_7', 'char_8', 'char_9']

for key in keyset_typed:
    key_unique = ppl[key].unique()

    # One [label, code] pair per distinct label in this column.
    k_remap = [[label, int(label[5:]) - 1] for label in key_unique]

    df_k_remap = pd.DataFrame(k_remap)
    df_k_remap.columns = [key, '_{0}'.format(key)]

    ppl = ppl.merge(df_k_remap, how='left', on=key)
    

In [10]:
# Replace each original string column with its integer-encoded '_<name>'
# counterpart, which then takes over the original name.
for key in keyset_typed:
    encoded = '_{0}'.format(key)

    # Nothing to swap if the encoded column is not there (for example the
    # cell has already run). Skipping here also prevents a re-run from
    # dropping the already-encoded column that now carries the plain name.
    if encoded not in ppl.columns:
        continue

    # errors='ignore' tolerates a missing source column without hiding
    # unrelated failures the way a bare `except:` would.
    ppl.drop(key, axis=1, inplace=True, errors='ignore')

    ppl.rename(columns={encoded: '{0}'.format(key)}, inplace=True)

In [11]:
# Prefix every people column whose name contains 'char' with 'p'
# (e.g. 'char_1' -> 'pchar_1'); all other columns keep their names.
char_renames = {col: 'p{0}'.format(col) for col in ppl.keys() if 'char' in col}
ppl.rename(columns=char_renames, inplace=True)
        

In [12]:
# Show the people column labels after the renames above.
ppl.columns

Index(['people_id', 'pdate', 'pchar_10', 'pchar_11', 'pchar_12', 'pchar_13',
       'pchar_14', 'pchar_15', 'pchar_16', 'pchar_17', 'pchar_18', 'pchar_19',
       'pchar_20', 'pchar_21', 'pchar_22', 'pchar_23', 'pchar_24', 'pchar_25',
       'pchar_26', 'pchar_27', 'pchar_28', 'pchar_29', 'pchar_30', 'pchar_31',
       'pchar_32', 'pchar_33', 'pchar_34', 'pchar_35', 'pchar_36', 'pchar_37',
       'pchar_38', 'group_1', 'pchar_1', 'pchar_2', 'pchar_3', 'pchar_4',
       'pchar_5', 'pchar_6', 'pchar_7', 'pchar_8', 'pchar_9'],
      dtype='object')

In [13]:
# Encode the typed activity characteristics as integers: a label's text
# after its first five characters is parsed and reduced by one. Missing
# values are first filled with 'type 0', so they end up as -1.
keyset_typed = ['char_1', 'char_10', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9']
for key in keyset_typed:
    # Assign the filled column back rather than calling
    # `act[key].fillna(..., inplace=True)`. That chained in-place form
    # operates on a temporary Series and does not update `act` under pandas
    # Copy-on-Write, which would leave NaNs that break the `[5:]` slice below.
    act[key] = act[key].fillna('type 0')
    key_unique = act[key].unique()

    # One [label, code] pair per distinct label in this column.
    k_remap = [[label, int(label[5:]) - 1] for label in key_unique]

    df_k_remap = pd.DataFrame(k_remap)
    df_k_remap.columns = ['{0}'.format(key), '_{0}'.format(key)]

    # Left join adds the '_<name>' code column while keeping every activity row.
    act = pd.merge(act, df_k_remap, on=key, how='left')


In [14]:
# Drop the raw string columns, then relabel each encoded '_<name>' column
# as 'a<name>' (e.g. '_char_1' -> 'achar_1').
encoded_to_final = {'_{0}'.format(key): 'a{0}'.format(key) for key in keyset_typed}
act = act.drop(keyset_typed, axis=1).rename(columns=encoded_to_final)

In [15]:
# Show the activity column labels after the renames above.
act.columns

Index(['activity_category', 'activity_id', 'adate', 'outcome', 'people_id',
       'achar_1', 'achar_10', 'achar_2', 'achar_3', 'achar_4', 'achar_5',
       'achar_6', 'achar_7', 'achar_8', 'achar_9'],
      dtype='object')

In [16]:
# Lookup table from each distinct 'people_id' string to an integer id
# parsed from the text after its first four characters.
pid_unique = ppl.people_id.unique()

pid_remap = []
for pid in pid_unique:
    suffix = pid[4:]
    try:
        pid_remap.append([pid, int(suffix)])
    except ValueError:
        # Some ids are written in scientific notation (e.g. '1e+05'), which
        # int() rejects; parse them as float first. Only ValueError is
        # caught so that unrelated errors are not silently retried.
        pid_remap.append([pid, int(float(suffix))])

df_p_remap = pd.DataFrame(pid_remap)
df_p_remap.columns = ['people_id', '_people_id']

In [17]:
# Swap the string 'people_id' in the people table for its integer form:
# join the lookup, drop the string column, rename the integer column.
ppl = (
    ppl.merge(df_p_remap, on='people_id')
       .drop('people_id', axis=1)
       .rename(columns={'_people_id': 'people_id'})
)

In [18]:
# Same swap for the activity table: join the lookup built from the people
# ids, drop the string column, rename the integer column.
act = (
    act.merge(df_p_remap, on='people_id')
       .drop('people_id', axis=1)
       .rename(columns={'_people_id': 'people_id'})
)

In [19]:
# Attach each person's attributes to their activities; the left join keeps
# every activity row.
output = act.merge(ppl, how='left', on='people_id')

In [21]:
# Show the column labels of the merged frame.
output.columns

Index(['activity_category', 'activity_id', 'adate', 'outcome', 'achar_1',
       'achar_10', 'achar_2', 'achar_3', 'achar_4', 'achar_5', 'achar_6',
       'achar_7', 'achar_8', 'achar_9', 'people_id', 'pdate', 'pchar_10',
       'pchar_11', 'pchar_12', 'pchar_13', 'pchar_14', 'pchar_15', 'pchar_16',
       'pchar_17', 'pchar_18', 'pchar_19', 'pchar_20', 'pchar_21', 'pchar_22',
       'pchar_23', 'pchar_24', 'pchar_25', 'pchar_26', 'pchar_27', 'pchar_28',
       'pchar_29', 'pchar_30', 'pchar_31', 'pchar_32', 'pchar_33', 'pchar_34',
       'pchar_35', 'pchar_36', 'pchar_37', 'pchar_38', 'group_1', 'pchar_1',
       'pchar_2', 'pchar_3', 'pchar_4', 'pchar_5', 'pchar_6', 'pchar_7',
       'pchar_8', 'pchar_9'],
      dtype='object')

In [24]:
# Print one line per merged column: name, dtype, minimum and maximum.
# (A dangling `if 'achar' in group:` with no body and an undefined name
# used to follow the print and made the cell a SyntaxError; it is removed.)
for k in output.keys():
    print(k, output[k].dtype, np.min(output[k]), np.max(output[k]))

activity_category int64 1 7
activity_id object act1_1 act2_9e+05
adate datetime64[ns] 2022-07-17 00:00:00 2023-08-31 00:00:00
outcome float64 0.0 1.0
achar_1 int64 -1 51
achar_10 int64 -1 9250
achar_2 int64 -1 31
achar_3 int64 -1 10
achar_4 int64 -1 6
achar_5 int64 -1 6
achar_6 int64 -1 4
achar_7 int64 -1 7
achar_8 int64 -1 17
achar_9 int64 -1 18
people_id int64 2 398238
pdate datetime64[ns] 2020-05-18 00:00:00 2023-08-31 00:00:00
pchar_10 bool False True
pchar_11 bool False True
pchar_12 bool False True
pchar_13 bool False True
pchar_14 bool False True
pchar_15 bool False True
pchar_16 bool False True
pchar_17 bool False True
pchar_18 bool False True
pchar_19 bool False True
pchar_20 bool False True
pchar_21 bool False True
pchar_22 bool False True
pchar_23 bool False True
pchar_24 bool False True
pchar_25 bool False True
pchar_26 bool False True
pchar_27 bool False True
pchar_28 bool False True
pchar_29 bool False True
pchar_30 bool False True
pchar_31 bool False True
pchar_32 bool F

In [33]:
# Cast selected columns to narrower integer dtypes and print each converted
# column's name, dtype, minimum and maximum. Columns that match no rule are
# left untouched and not printed.
exact_dtypes = {
    'achar_10': np.int16,
    'people_id': np.int32,
    'group_1': np.uint16,
}

for k in output.keys():
    if k in exact_dtypes:
        narrow = exact_dtypes[k]
    elif 'achar' in k:
        narrow = np.int8
    elif 'pchar' in k:
        narrow = np.uint8
    else:
        continue

    output[k] = output[k].astype(narrow)
    print(k, output[k].dtype, np.min(output[k]), np.max(output[k]))


achar_1 int8 -1 51
achar_10 int16 -1 9250
achar_2 int8 -1 31
achar_3 int8 -1 10
achar_4 int8 -1 6
achar_5 int8 -1 6
achar_6 int8 -1 4
achar_7 int8 -1 7
achar_8 int8 -1 17
achar_9 int8 -1 18
people_id int32 2 398238
pchar_10 uint8 0 1
pchar_11 uint8 0 1
pchar_12 uint8 0 1
pchar_13 uint8 0 1
pchar_14 uint8 0 1
pchar_15 uint8 0 1
pchar_16 uint8 0 1
pchar_17 uint8 0 1
pchar_18 uint8 0 1
pchar_19 uint8 0 1
pchar_20 uint8 0 1
pchar_21 uint8 0 1
pchar_22 uint8 0 1
pchar_23 uint8 0 1
pchar_24 uint8 0 1
pchar_25 uint8 0 1
pchar_26 uint8 0 1
pchar_27 uint8 0 1
pchar_28 uint8 0 1
pchar_29 uint8 0 1
pchar_30 uint8 0 1
pchar_31 uint8 0 1
pchar_32 uint8 0 1
pchar_33 uint8 0 1
pchar_34 uint8 0 1
pchar_35 uint8 0 1
pchar_36 uint8 0 1
pchar_37 uint8 0 1
pchar_38 uint8 0 100
group_1 uint16 1 51462
pchar_1 uint8 0 1
pchar_2 uint8 0 2
pchar_3 uint8 0 43
pchar_4 uint8 0 24
pchar_5 uint8 0 8
pchar_6 uint8 0 6
pchar_7 uint8 0 24
pchar_8 uint8 0 7
pchar_9 uint8 0 8


In [34]:
# Persist the merged, downcast frame as a pickle in the working directory.
# gzip this afterwards, goes from ~295MB to 33.8MB
output.to_pickle('merged7.pkl')