# Create features based on appid

The folllowing feature sets are created:
- Features_appid_totals.csv:
    - N_u_apps_installed: number of unique apps installed
    - N_u_apps_active: number of unique apps active
    - R_u_apps_active: number of relative apps active
- Features_appid_installed: bag of apps installed anytime during time period
- Features_appid_active: bag of apps active anytime during time period
- Features_appid_r_active: Bag of apps active relative to number of times installed
- Features_appid_installed_active: Bag of apps with 0.5 if app installed and 1 if active
- Features_appid_installed_rel: Relative number of events appid was installed
- Features_appid_active_rel: Relative number of events appid was active


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import pickle

In [2]:
dir_in = 'data_ori'
dir_out = 'data/'

## Load data

In [3]:
datadir = 'data_ori'
gatrain = pd.read_csv(os.path.join(dir_in,'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(dir_in,'gender_age_test.csv'),
                      index_col = 'device_id')
# Get rid of duplicate device ids in phone
events = pd.read_csv(os.path.join(dir_in,'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')
appevents = pd.read_csv(os.path.join(dir_in,'app_events.csv'), 
                        usecols=['event_id','app_id','is_active'],
                        dtype={'is_active':bool})
applabels = pd.read_csv(os.path.join(dir_in,'app_labels.csv'))

In [4]:
gatrain['trainrow'] = np.arange(gatrain.shape[0])
gatest['testrow'] = np.arange(gatest.shape[0])

## Get number of events

In [5]:
n_events = events.groupby('device_id').size()
print n_events.shape
n_events.head(5)

(60865,)


device_id
-9222956879900151005    65
-9222661944218806987     8
-9222399302879214035    10
-9221825537663503111    99
-9221767098072603291     8
dtype: int64

## Installed app_id dataframe

In [6]:
appencoder = LabelEncoder().fit(appevents.app_id)
appevents['app'] = appencoder.transform(appevents.app_id)
napps = len(appencoder.classes_)
deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .merge(pd.DataFrame(n_events, columns=['n_events']), how='left', left_index=True, right_index=True)
                       .reset_index())
print deviceapps.shape
deviceapps.head()

(2369025, 6)


Unnamed: 0,device_id,app,size,trainrow,testrow,n_events
0,-9222956879900151005,548,18,21594.0,,65
1,-9222956879900151005,1096,18,21594.0,,65
2,-9222956879900151005,1248,26,21594.0,,65
3,-9222956879900151005,1545,12,21594.0,,65
4,-9222956879900151005,1664,18,21594.0,,65


## Active app_id dataframe

In [7]:
deviceactiveapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['is_active'].agg(['sum'])
                       .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                       .merge(pd.DataFrame(n_events, columns=['n_events']), how='left', left_index=True, right_index=True)
                       .reset_index())
#deviceactiveapps['n_events'] = n_events
print deviceactiveapps.shape
deviceactiveapps.head()

(2369025, 6)


Unnamed: 0,device_id,app,sum,trainrow,testrow,n_events
0,-9222956879900151005,548,4.0,21594.0,,65
1,-9222956879900151005,1096,0.0,21594.0,,65
2,-9222956879900151005,1248,15.0,21594.0,,65
3,-9222956879900151005,1545,2.0,21594.0,,65
4,-9222956879900151005,1664,0.0,21594.0,,65


## Number of installed/active apps

In [8]:
unique_apps_installed = deviceapps.groupby('device_id').app.nunique()
print(unique_apps_installed.shape)
unique_apps_installed.head(5)

(60822,)


device_id
-9222956879900151005    72
-9222661944218806987    13
-9222399302879214035    43
-9221825537663503111    34
-9221767098072603291    30
Name: app, dtype: int64

In [9]:
unique_apps_active = deviceactiveapps[deviceactiveapps['sum']>0].groupby('device_id').app.nunique()
print(unique_apps_active.shape)
unique_apps_active.head(5)

(60669,)


device_id
-9222956879900151005    55
-9222661944218806987    13
-9222399302879214035     4
-9221825537663503111    26
-9221767098072603291    19
Name: app, dtype: int64

In [10]:
unique_apps_active_rel = unique_apps_active.div(unique_apps_installed)
print(unique_apps_active_rel.shape)
unique_apps_active_rel.head(5)

(60822,)


device_id
-9222956879900151005    0.763889
-9222661944218806987    1.000000
-9222399302879214035    0.093023
-9221825537663503111    0.764706
-9221767098072603291    0.633333
Name: app, dtype: float64

## Save features

### Number of installed and unique apps

In [11]:
feature_file = 'features_appid_totals.csv'

df = pd.concat([unique_apps_installed, unique_apps_active, unique_apps_active_rel], axis=1) 
df.columns = ['n_u_apps_installed','n_u_apps_active', 'r_u_apps_active']


In [12]:
print(df.shape)
df.head()

(60822, 3)


Unnamed: 0_level_0,n_u_apps_installed,n_u_apps_active,r_u_apps_active
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-9222956879900151005,72,55.0,0.763889
-9222661944218806987,13,13.0,1.0
-9222399302879214035,43,4.0,0.093023
-9221825537663503111,34,26.0,0.764706
-9221767098072603291,30,19.0,0.633333


In [13]:
df.to_csv(dir_out + feature_file)

### Installed any time

In [14]:
feature_file = 'features_appid_installed'

In [15]:
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                      shape=(gatrain.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                      shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [16]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_app, f, pickle.HIGHEST_PROTOCOL)

### Active any time

In [17]:
feature_file = 'features_appid_active'

In [18]:
d = deviceactiveapps.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                          shape=(gatrain.shape[0],napps))
d = deviceactiveapps.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                        shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [19]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### App active relative to installed

In [20]:
feature_file = 'features_appid_r_active'

In [21]:
deviceactiveapps['rel'] = deviceactiveapps['sum'].div(deviceapps['size'])
deviceactiveapps.head()

Unnamed: 0,device_id,app,sum,trainrow,testrow,n_events,rel
0,-9222956879900151005,548,4.0,21594.0,,65,0.222222
1,-9222956879900151005,1096,0.0,21594.0,,65,0.0
2,-9222956879900151005,1248,15.0,21594.0,,65,0.576923
3,-9222956879900151005,1545,2.0,21594.0,,65,0.166667
4,-9222956879900151005,1664,0.0,21594.0,,65,0.0


In [22]:
d = deviceactiveapps.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((d['rel'], (d.trainrow, d.app)), 
                          shape=(gatrain.shape[0],napps))
d = deviceactiveapps.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((d['rel'], (d.testrow, d.app)), 
                        shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [23]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### App installed and active combined
Installed value: 0.5
Active value: 1

In [24]:
feature_file = 'features_appid_installed_active'

In [25]:
d = deviceactiveapps.dropna(subset=['trainrow'])
Xtr_act_app = csr_matrix((0.5*(np.ones(d.shape[0]) + d['sum'].clip(upper=1)), (d.trainrow, d.app)), 
                          shape=(gatrain.shape[0],napps))

d = deviceactiveapps.dropna(subset=['testrow'])
Xte_act_app = csr_matrix((0.5*(np.ones(d.shape[0]) + d['sum'].clip(upper=1)), (d.testrow, d.app)), 
                        shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [26]:
Xte_act_app[18].todense()[:, 200:300]

matrix([[ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
          0. ]])

In [27]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### Relative number of events times app was installed

In [28]:
feature_file = 'features_appid_installed_rel'

In [29]:
d = deviceapps.dropna(subset=['trainrow'])
d['size'] = d['size'].div(d['n_events'])
Xtr_act_app = csr_matrix((d['size'], (d.trainrow, d.app)), 
                          shape=(gatrain.shape[0],napps))
d = deviceapps.dropna(subset=['testrow'])
d['size'] = d['size'].div(d['n_events'])
Xte_act_app = csr_matrix((d['size'], (d.testrow, d.app)), 
                        shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Apps data: train shape (74645, 19237), test shape (112071, 19237)


In [30]:
d.head(5)

Unnamed: 0,device_id,app,size,trainrow,testrow,n_events
72,-9222661944218806987,1867,0.375,,13612.0,8
73,-9222661944218806987,7519,1.0,,13612.0,8
74,-9222661944218806987,7843,0.125,,13612.0,8
75,-9222661944218806987,8704,0.5,,13612.0,8
76,-9222661944218806987,10000,0.125,,13612.0,8


In [31]:
Xte_act_app[0].todense()[:, 50:100]

matrix([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.85714286,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])

In [32]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)

### Relative number of events times app was active

In [33]:
feature_file = 'features_appid_active_rel'

In [34]:
d = deviceactiveapps.dropna(subset=['trainrow'])
d['sum'] = d['sum'].div(d['n_events'])
Xtr_act_app = csr_matrix((d['sum'], (d.trainrow, d.app)), 
                          shape=(gatrain.shape[0],napps))
d = deviceactiveapps.dropna(subset=['testrow'])
d['sum'] = d['sum'].div(d['n_events'])
Xte_act_app = csr_matrix((d['sum'], (d.testrow, d.app)), 
                        shape=(gatest.shape[0],napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_act_app.shape, Xte_act_app.shape))

Apps data: train shape (74645, 19237), test shape (112071, 19237)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [35]:
d.head()

Unnamed: 0,device_id,app,sum,trainrow,testrow,n_events,rel
72,-9222661944218806987,1867,0.375,,13612.0,8,1.0
73,-9222661944218806987,7519,0.875,,13612.0,8,0.875
74,-9222661944218806987,7843,0.125,,13612.0,8,1.0
75,-9222661944218806987,8704,0.375,,13612.0,8,0.75
76,-9222661944218806987,10000,0.125,,13612.0,8,1.0


In [36]:
Xte_act_app[100].todense()[:, 250:300]

matrix([[ 0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.1875,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
          0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ]])

In [37]:
# Save features
with open(os.path.join(dir_out, feature_file + '_train.pickle'), 'wb') as f:
    pickle.dump(Xtr_act_app, f, pickle.HIGHEST_PROTOCOL)
with open(os.path.join(dir_out, feature_file + '_test.pickle'), 'wb') as f:
    pickle.dump(Xte_act_app, f, pickle.HIGHEST_PROTOCOL)