In [45]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import pickle

In [46]:
# Directory the raw input CSV files are read from.
datadir = 'data_ori'
# Directory that receives every feature file written by this notebook.
dir_out = 'data'

In [47]:
# Resolve the three input files inside the raw-data directory.
train_csv = os.path.join(datadir, 'gender_age_train.csv')
test_csv = os.path.join(datadir, 'gender_age_test.csv')
phone_csv = os.path.join(datadir, 'phone_brand_device_model.csv')

# Train and test device tables, both indexed by device_id.
gatrain = pd.read_csv(train_csv, index_col='device_id')
gatest = pd.read_csv(test_csv, index_col='device_id')

# The phone table contains repeated device ids: keep the first row of each
# device, then index by device_id so it lines up with gatrain / gatest.
phone = (
    pd.read_csv(phone_csv)
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)


In [48]:
# Positional row number of every device. These columns are used further down
# as the row coordinates when the sparse feature matrices are assembled.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

### Create other features

In [49]:
# Vowel flags: 1 when the name contains at least one Latin vowel, 0 otherwise.
# Despite the "_chin" suffix, names without any Latin vowel (e.g. brands
# written purely in Chinese characters) get 0, as the sample output shows.
vowel_regex = '[aeiouAEIOU]'
for flag_col, text_col in (('brand_chin', 'phone_brand'),
                           ('model_chin', 'device_model')):
    phone[flag_col] = phone[text_col].str.contains(vowel_regex).astype(int)

# Length of the brand and of the (still un-prefixed) model name.
# NOTE(review): the displayed output reports 6 for a two-character Chinese
# brand, so these look like byte lengths of the encoded string -- confirm
# before running under a kernel with unicode strings.
for len_col, text_col in (('brand_len', 'phone_brand'),
                          ('model_len', 'device_model')):
    phone[len_col] = phone[text_col].str.len()

# Number of distinct device models per brand, broadcast back to every row.
models_by_brand = phone.groupby('phone_brand')['device_model']
phone['n_models'] = models_by_brand.transform(lambda models: len(models.unique()))

# From here on device_model holds brand + model concatenated.
# Gotcha: this overwrites the column in place, so re-running this cell
# prepends the brand a second time and changes model_chin / model_len.
phone['device_model'] = phone['phone_brand'].str.cat(phone['device_model'])

In [50]:
# Show ten random rows to eyeball the new columns (unseeded, so the rows
# differ between runs).
phone.sample(n=10)

Unnamed: 0_level_0,phone_brand,device_model,brand_chin,model_chin,brand_len,model_len,n_models
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-370477508657702127,小米,小米红米note,0,1,6,10,26
6096322417496446933,魅族,魅族魅蓝,0,0,6,6,16
-1436822817945074815,三星,三星Galaxy Note 3,0,1,6,13,163
1615991472913744774,小米,小米note顶配版,0,1,6,13,26
-6716828533125982881,小米,小米红米2,0,0,6,7,26
6119945468610441272,三星,三星Galaxy Note 4,0,1,6,13,163
802852493392259819,小米,小米红米Note3,0,1,6,11,26
-2750604217898316048,小米,小米红米note,0,1,6,10,26
-8241430867221535017,三星,三星Galaxy Grand 2,0,1,6,14,163
-2525148510460889665,小米,小米小米4C,0,0,6,8,26


### Brand features

In [51]:
# One-hot encode the phone brand: one row per device, one column per brand.
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])

# Index-aligned assignment: every device_id in train / test picks up the
# integer brand code of the matching row in `phone`.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# Pin the number of columns to the number of encoded brands. Without an
# explicit shape csr_matrix sizes itself from the largest index it sees, so
# train and test would silently get different widths whenever the
# highest-coded brand is missing from one of the two splits.
n_brands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.brand)),
                       shape=(gatrain.shape[0], n_brands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.brand)),
                       shape=(gatest.shape[0], n_brands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))



Brand features: train shape (74645, 131), test shape (112071, 131)


In [52]:
# Persist the sparse brand matrices, one pickle per split, under dir_out.
feature_file = 'features_brand_bag'
for split_suffix, split_matrix in (('_train.pickle', Xtr_brand),
                                   ('_test.pickle', Xte_brand)):
    out_path = os.path.join(dir_out, feature_file + split_suffix)
    with open(out_path, 'wb') as f:
        pickle.dump(split_matrix, f, pickle.HIGHEST_PROTOCOL)

### Model features

In [53]:
# One-hot encode the (brand, model) combination: one column per distinct pair.
# Note: phone.device_model was already overwritten with brand + model in the
# feature cell above, so this key is effectively brand + brand + model. It is
# still unique per (brand, model) pair and is kept as-is so the label order,
# and therefore the column order of the saved matrices, does not change.
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)

# Index-aligned assignment: every device_id in train / test picks up the
# integer model code of the matching row in `phone`.
gatrain['model'] = phone['model']
gatest['model'] = phone['model']

# Pin the number of columns to the number of encoded models. Without an
# explicit shape csr_matrix sizes itself from the largest index it sees, so
# train and test would silently get different widths whenever the
# highest-coded model is missing from one of the two splits.
n_models_encoded = len(modelencoder.classes_)
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.model)),
                       shape=(gatrain.shape[0], n_models_encoded))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.model)),
                       shape=(gatest.shape[0], n_models_encoded))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [54]:
# Persist the sparse brand+model matrices, one pickle per split, under dir_out.
feature_file = 'features_brand_model_bag'
for split_suffix, split_matrix in (('_train.pickle', Xtr_model),
                                   ('_test.pickle', Xte_model)):
    out_path = os.path.join(dir_out, feature_file + split_suffix)
    with open(out_path, 'wb') as f:
        pickle.dump(split_matrix, f, pickle.HIGHEST_PROTOCOL)

### Other features

In [55]:
# File name for the hand-crafted phone features. Unlike the earlier values of
# feature_file (base names that get a '_train.pickle' / '_test.pickle' suffix),
# this one is the complete file name including its extension.
feature_file = 'features_brand_model.csv'


In [56]:
# Hand-crafted columns of `phone` that get exported to the CSV feature file.
cols_to_save = [
    'brand_chin',
    'model_chin',
    'brand_len',
    'model_len',
    'n_models',
]

In [57]:
# Export the selected feature columns; the device_id index is written as the
# first CSV column.
csv_path = os.path.join(dir_out, feature_file)
phone.loc[:, cols_to_save].to_csv(csv_path)

In [58]:
# Final sanity check: overall size of the phone table plus ten random rows.
# print is written in call form, matching the other cells; it prints the same
# text on Python 2 and stays valid syntax on Python 3.
print(phone.shape)
phone.sample(10)

(186716, 9)


Unnamed: 0_level_0,phone_brand,device_model,brand_chin,model_chin,brand_len,model_len,n_models,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2476086426996130628,金立,金立ELIFE S5.5,0,1,6,10,67,120,1569
-3548132448947435317,vivo,vivoY11,1,0,4,3,80,13,246
871841151337988246,酷派,酷派大神F2,0,0,6,8,140,117,1539
7413417160031599109,三星,三星Galaxy Grand Neo Plus,0,1,6,21,163,15,328
-900599243167204267,OPPO,OPPOFind 5,1,1,4,6,65,7,114
387670369109834769,酷派,酷派8720,0,0,6,4,140,117,1508
777480779757651131,华为,华为荣耀畅玩4X,0,0,6,14,145,31,747
5978065598806915282,HTC,HTCSensation Z710E,0,1,3,15,66,2,54
-2525045180639317667,三星,三星Galaxy S5,0,1,6,9,163,15,375
-8984091664228344644,爱派尔,爱派尔iPh-800,0,1,9,7,2,84,1058
