In [18]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import pickle

In [19]:
dir_in = 'data_ori'
dir_out = 'data/'

In [20]:
datadir = 'data_ori'
gatrain = pd.read_csv(os.path.join(dir_in,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(dir_in,'gender_age_test.csv'),
                      index_col = 'device_id')
# Get rid of duplicate device ids in phone
events = pd.read_csv(os.path.join(dir_in,'events.csv'), index_col='event_id')
appevents = pd.read_csv(os.path.join(dir_in,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active','is_installed'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(dir_in,'app_labels.csv'))

In [21]:

events['timestamp'] = pd.to_datetime(events['timestamp'])
events['timestamp_h'] = events['timestamp'].dt.hour
events['timestamp_hq'] = ((events['timestamp_h'] - 1) // 6).clip_lower(0)
events['timestamp_d'] = events['timestamp'].dt.day
events.loc[events['timestamp_d']==30,'timestamp_d'] = 0
events['timestamp_hd'] = (100*(events['timestamp_h']/24 + events['timestamp_d'])).astype(int)
events['timestamp_hqd'] = (100*(events['timestamp_hq']/4 + events['timestamp_d'])).astype(int)
events['timestamp_during_day'] = events['timestamp_hq'].isin([1,2])
events['timestamp_during_weekend'] = events['timestamp_d'].isin([30,1,7,8])

In [22]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

## Get number of events

In [23]:
n_events = events.groupby('device_id').size()
print n_events.shape
n_events.head(5)

(60865,)


device_id
-9222956879900151005    65
-9222661944218806987     8
-9222399302879214035    10
-9221825537663503111    99
-9221767098072603291     8
dtype: int64

In [24]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)


In [25]:
appevents = appevents.merge(events[['device_id','timestamp_during_weekend','timestamp_during_day']], how='left',left_on='event_id',right_index=True)

In [26]:
appevents.sample(10)

Unnamed: 0,event_id,app_id,is_installed,is_active,app,device_id,timestamp_during_weekend,timestamp_during_day
31050734,3103563,3300927415001585151,1,True,12968,-1791909839271332628,False,True
31458442,3143623,6489166276122938763,1,True,16142,-8893144492511709840,False,True
9370805,941060,-2766653348096505346,1,False,6320,-5215508516534020464,False,False
23158791,2317287,-5924787280098922359,1,True,3163,-7196716156557321514,False,True
27606408,2766629,3433289601737013244,1,True,13094,4813805637689074071,False,False
22760406,2280173,7034929179771260790,1,False,16778,-4483235488687461351,False,False
7995007,803742,6666573790395438502,1,False,16360,-6059695611888480748,False,False
15231841,1522260,2761480706282992376,1,True,12487,-8741653364202949668,False,False
3993093,401238,-6008771162903226529,1,False,3079,3792988047488383506,False,False
4684344,469558,3683147815759994238,1,False,13325,-1430923301939426783,False,True


## App_label anytime dataframe

In [27]:
deviceapps = (appevents.groupby(['device_id','app'])['is_installed','is_active'].sum()
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
print(deviceapps.shape)
deviceapps.head()

(2369025, 6)


Unnamed: 0,device_id,app,is_installed,is_active,trainrow,testrow
0,-9222956879900151005,548,18,4.0,21594.0,
1,-9222956879900151005,1096,18,0.0,21594.0,
2,-9222956879900151005,1248,26,15.0,21594.0,
3,-9222956879900151005,1545,12,2.0,21594.0,
4,-9222956879900151005,1664,18,0.0,21594.0,


In [39]:
devicelabels = (deviceapps[['device_id','app','is_installed','is_active']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['is_installed','is_active'].sum()
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
print(devicelabels.shape)
devicelabels.head()

(4244113, 6)


Unnamed: 0,device_id,label,is_installed,is_active,trainrow,testrow
0,-9222956879900151005,117,16,16.0,21594.0,
1,-9222956879900151005,120,17,17.0,21594.0,
2,-9222956879900151005,126,33,31.0,21594.0,
3,-9222956879900151005,138,59,51.0,21594.0,
4,-9222956879900151005,147,37,34.0,21594.0,


## App_label week dataframe

In [40]:
appevents_week = appevents.copy()
appevents_week.loc[appevents_week.timestamp_during_weekend==True, ['is_installed', 'is_active']]=0


In [41]:
appevents_week.sample(10)

Unnamed: 0,event_id,app_id,is_installed,is_active,app,device_id,timestamp_during_weekend,timestamp_during_day
30707619,3070919,6666573792468194779,0,False,16378,-1819279698299020233,True,True
4734920,475773,3879845647105392250,1,False,13502,7549700249834724229,False,False
16167119,1618957,-5617926790284095062,1,False,3478,-911096199051486977,False,False
26274020,2631760,-196592189248911642,1,False,9454,5872729599536773570,False,False
8696186,870027,6666573788609354393,1,False,16327,-4395718767198095317,False,True
31630756,3163352,-1633938282180439957,1,True,7519,-2488711984835721560,False,True
2152419,216155,-3938663771964020509,1,True,5259,-8224953330746523978,False,False
19444929,1947456,8948670408023620661,1,True,18924,-839850009592711757,False,False
22175718,2222981,1183530713303335378,1,False,10962,1936735806494236513,False,False
19785159,1980841,7317602140724054002,1,False,17229,-7500839935858713046,False,False


In [42]:
deviceapps_week = (appevents_week.groupby(['device_id','app'])['is_installed','is_active'].sum()
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps_week.head(10)

Unnamed: 0,device_id,app,is_installed,is_active,trainrow,testrow
0,-9222956879900151005,548,8,0.0,21594.0,
1,-9222956879900151005,1096,8,0.0,21594.0,
2,-9222956879900151005,1248,12,8.0,21594.0,
3,-9222956879900151005,1545,2,2.0,21594.0,
4,-9222956879900151005,1664,8,0.0,21594.0,
5,-9222956879900151005,1848,13,13.0,21594.0,
6,-9222956879900151005,2236,13,9.0,21594.0,
7,-9222956879900151005,2350,11,11.0,21594.0,
8,-9222956879900151005,2626,10,10.0,21594.0,
9,-9222956879900151005,3384,11,11.0,21594.0,


In [43]:
devicelabels_week = (deviceapps_week[['device_id','app','is_installed','is_active']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['is_installed','is_active'].sum()
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
print(devicelabels_week.shape)
devicelabels_week.sample(10)

(4244113, 6)


Unnamed: 0,device_id,label,is_installed,is_active,trainrow,testrow
647312,-6434881097208547451,308,3,0.0,43961.0,
1906667,-991529057862645066,276,51,33.0,,28695.0
269324,-8041871886807791855,396,24,1.0,,27316.0
3865835,7577528289938767097,417,1,0.0,,26350.0
3165902,4529298113351424933,326,36,5.0,46659.0,
1109834,-4466370067347561779,103,2,0.0,,93710.0
3460096,5789938595176888221,370,2,0.0,,62319.0
2307327,790610582133613479,370,1,0.0,41862.0,
2544275,1817479374539106969,228,1,0.0,,22714.0
3612241,6465984976445028528,296,0,0.0,14026.0,


## App_label day dataframe

In [44]:
appevents_day = appevents.copy()
appevents_day.loc[appevents_day.timestamp_during_day==False, ['is_installed', 'is_active']]=0

In [45]:
appevents_day.sample(10)

Unnamed: 0,event_id,app_id,is_installed,is_active,app,device_id,timestamp_during_weekend,timestamp_during_day
1022481,98228,8948670408023620661,1,True,18924,-5436207345265059976,False,True
26991272,2702689,-645224643319180577,1,False,8945,5291390687847238213,False,True
1676455,167601,6607018907660377991,1,False,16240,-3134967474941078529,False,True
18264609,1827689,628020936226491308,1,True,10420,-7263132807612494195,False,True
21881040,2192752,-3081664261975634860,0,False,6050,6635987146780613271,False,False
26348513,2637588,4194978734950248697,0,False,13776,-6250374472097445840,False,False
28955066,2899447,8693964245073640147,0,False,18686,-7848011077749053046,True,False
23375612,2336894,-461381632058619300,1,True,9177,8586743452220399186,False,True
16650372,1665435,6284164581582112235,1,True,15784,-380094437049880651,False,True
17411488,1742898,-5305696816021977482,1,False,3774,1211103345711218580,False,True


In [46]:
deviceapps_day = (appevents_day.groupby(['device_id','app'])['is_installed','is_active'].sum()
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())
deviceapps_day.head()

Unnamed: 0,device_id,app,is_installed,is_active,trainrow,testrow
0,-9222956879900151005,548,16,4.0,21594.0,
1,-9222956879900151005,1096,16,0.0,21594.0,
2,-9222956879900151005,1248,20,11.0,21594.0,
3,-9222956879900151005,1545,10,0.0,21594.0,
4,-9222956879900151005,1664,16,0.0,21594.0,


In [47]:
devicelabels_day = (deviceapps_day[['device_id','app','is_installed','is_active']]
                .merge(applabels[['app','label']])
                .groupby(['device_id','label'])['is_installed','is_active'].sum()
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
print(devicelabels_day.shape)
devicelabels_day.head(10)

(4244113, 6)


Unnamed: 0,device_id,label,is_installed,is_active,trainrow,testrow
0,-9222956879900151005,117,12,12.0,21594.0,
1,-9222956879900151005,120,10,10.0,21594.0,
2,-9222956879900151005,126,24,22.0,21594.0,
3,-9222956879900151005,138,41,33.0,21594.0,
4,-9222956879900151005,147,24,21.0,21594.0,
5,-9222956879900151005,170,6,6.0,21594.0,
6,-9222956879900151005,181,16,1.0,21594.0,
7,-9222956879900151005,190,16,1.0,21594.0,
8,-9222956879900151005,207,148,70.0,21594.0,
9,-9222956879900151005,208,201,92.0,21594.0,


In [48]:
unique_labels_installed = devicelabels.groupby('device_id').label.nunique()
print(unique_labels_installed.shape)
unique_labels_installed.head(5)

(60822,)


device_id
-9222956879900151005    56
-9222661944218806987    33
-9222399302879214035    93
-9221825537663503111    58
-9221767098072603291    55
Name: label, dtype: int64

## Save features

In [59]:
feature_file = 'features_label_app_count.csv'
df = pd.DataFrame(index=devicelabels.device_id.unique())
df['n_u_labels_installed_apps'] = devicelabels.groupby('device_id').label.nunique()
df['n_u_labels_active_apps'] = devicelabels[devicelabels.is_active>0].groupby('device_id').label.nunique()
df['r_u_labels_active_apps'] = df['n_u_labels_active_apps'].div(df['n_u_labels_installed_apps'])

df['n_u_labels_installed_apps_day'] = devicelabels_day[devicelabels_day.is_installed>0].groupby('device_id').label.nunique()
df['n_u_labels_active_apps_day'] = devicelabels_day[devicelabels_day.is_active>0].groupby('device_id').label.nunique()
df['r_u_labels_active_apps_day'] = df['n_u_labels_active_apps'].div(df['n_u_labels_installed_apps'])

df['n_u_labels_installed_apps_week'] = devicelabels_week[devicelabels_week.is_installed>0].groupby('device_id').label.nunique()
df['n_u_labels_active_apps_week'] = devicelabels_week[devicelabels_week.is_active>0].groupby('device_id').label.nunique()
df['r_u_labels_active_apps_week'] = df['n_u_labels_active_apps'].div(df['n_u_labels_installed_apps'])

df.fillna(0, inplace=True)

In [60]:
df.head(10)

Unnamed: 0,n_u_labels_installed_apps,n_u_labels_active_apps,r_u_labels_active_apps,n_u_labels_installed_apps_day,n_u_labels_active_apps_day,r_u_labels_active_apps_day,n_u_labels_installed_apps_week,n_u_labels_active_apps_week,r_u_labels_active_apps_week
-9222956879900151005,56,56.0,1.0,56.0,56.0,1.0,56.0,53.0,1.0
-9222661944218806987,33,33.0,1.0,33.0,33.0,1.0,33.0,28.0,1.0
-9222399302879214035,93,28.0,0.301075,93.0,22.0,0.301075,93.0,28.0,0.301075
-9221825537663503111,58,48.0,0.827586,57.0,47.0,0.827586,57.0,44.0,0.827586
-9221767098072603291,55,46.0,0.836364,55.0,45.0,0.836364,55.0,42.0,0.836364
-9221079146476055829,44,44.0,1.0,44.0,42.0,1.0,44.0,44.0,1.0
-9221026417907250887,61,58.0,0.95082,61.0,58.0,0.95082,61.0,58.0,0.95082
-9220830859283101130,24,10.0,0.416667,0.0,0.0,0.416667,24.0,10.0,0.416667
-9220452176650064280,57,26.0,0.45614,55.0,22.0,0.45614,57.0,25.0,0.45614
-9220329415676028483,98,8.0,0.081633,98.0,8.0,0.081633,98.0,8.0,0.081633


In [73]:
gatest.loc[-9220329415676028483,:]

testrow    83094
Name: -9220329415676028483, dtype: int64

In [74]:
df.to_csv(dir_out+feature_file, index_label='device_id')

### Installed any time

In [75]:
feature_file = 'features_label_app_installed'

In [76]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), 
                      shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), 
                      shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [77]:
Xtr_app[2].todense()[:,:]

matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
          0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,
          0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
          0.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          1.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  1.,
          1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  1.,  0.,

In [78]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_app, f, pickle.HIGHEST_PROTOCOL)

### Label installed proportionate

In [79]:
feature_file = 'features_label_app_installed_prop'

In [80]:
devicelabels.set_index('device_id', inplace=True)
devicelabels['n_labels'] = devicelabels.groupby(devicelabels.index).label.nunique()
devicelabels.reset_index(inplace=True)
devicelabels.head()

Unnamed: 0,device_id,label,is_installed,is_active,trainrow,testrow,n_labels
0,-9222956879900151005,117,16,16.0,21594.0,,56
1,-9222956879900151005,120,17,17.0,21594.0,,56
2,-9222956879900151005,126,33,31.0,21594.0,,56
3,-9222956879900151005,138,59,51.0,21594.0,,56
4,-9222956879900151005,147,37,34.0,21594.0,,56


In [81]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((d['is_installed']/d['n_labels'], (d.trainrow, d.label)), 
                      shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_app = csr_matrix((d['is_installed']/d['n_labels'], (d.testrow, d.label)), 
                      shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [82]:
Xte_app[83094].todense()[:,100:200]

matrix([[ 0.48979592,  0.24489796,  0.        ,  0.12244898,  0.        ,
          0.        ,  0.        ,  0.        ,  0.12244898,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.12244898,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.48979592,  0.        ,  0.        ,  0.        ,
          0.        ,  0.12244898,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.36734694,  0.        ,
          0.        ,  0.        ,  0.57142857,  0.        ,  0.24489796,
          0.        ,  0.        ,  0.24489796,  0.        ,  0.        ,
          0.6122449 ,  0.        ,  0.        ,  0.        ,  0.12244898,
          0.        ,  0.        ,  0.36734694,  0.        ,  0.12244898,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.12244898,  0.

In [83]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_app, f, pickle.HIGHEST_PROTOCOL)

### Active any time

In [84]:
feature_file = 'features_label_app_active'


In [85]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix(((d['is_active']>0).astype(int), (d.trainrow, d.label)), 
                          shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_act_app = csr_matrix(((d['is_active']>0).astype(int), (d.testrow, d.label)), 
                        shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [86]:
Xte_act_app[83094].todense()[:,100:200]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [87]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### Relative active anytime

In [88]:
feature_file = 'features_label_app_active_rel'


In [89]:
d = devicelabels.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((d['is_active']/d['is_installed'], (d.trainrow, d.label)), 
                          shape=(gatrain.shape[0],nlabels))
d = devicelabels.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((d['is_active']/d['is_installed'], (d.testrow, d.label)), 
                        shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [90]:
Xte_act_app[83094].todense()[:,100:200]

matrix([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.71428571,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.

In [91]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### App labels that are more active during week

In [92]:
feature_file = 'features_label_app_more_active_week'


In [93]:
d = devicelabels_week.dropna(subset=['trainrow'])
d2 = devicelabels.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((( (d['is_active']/d['is_installed']) > (d2['is_active']/d2['is_installed']) ).astype(int), 
                          (d.trainrow, d.label)), 
                          shape=(gatrain.shape[0],nlabels))
d = devicelabels_week.dropna(subset=['testrow'])
d2 = devicelabels.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((( (d['is_active']/d['is_installed']) > (d2['is_active']/d2['is_installed']) ).astype(int), 
                          (d.testrow, d.label)), 
                        shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [94]:
Xte_act_app[83094].todense()[:,100:200]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [95]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### App labels that are more active during day
Bigger signal than apps more active during evening

In [96]:
feature_file = 'features_label_app_more_active_day'


In [97]:
d = devicelabels_day.dropna(subset=['trainrow'])
d2 = devicelabels.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((( (d['is_active']/d['is_installed']) > (d2['is_active']/d2['is_installed']) ).astype(int), 
                          (d.trainrow, d.label)), 
                          shape=(gatrain.shape[0],nlabels))
d = devicelabels_day.dropna(subset=['testrow'])
d2 = devicelabels.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((( (d['is_active']/d['is_installed']) > (d2['is_active']/d2['is_installed']) ).astype(int), 
                          (d.testrow, d.label)), 
                        shape=(gatest.shape[0],nlabels))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 492), test shape (112071, 492)


In [98]:
sum(Xte_act_app[:].todense()).sum()

429780

In [99]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)