# Create features based on brand and device model info

Three feature sets are created:
- 'features_brand_bag': sparse one-hot encoded brand name
- 'features_brand_model_bag': sparse one-hot encoded device model (concatenated with brand name)
- 'features_brand_model.csv': a couple of features derived from brand name and device model

In [4]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
import pickle

In [5]:
# Directory the input CSV files are read from.
datadir = 'data_ori'
# Directory the generated feature files are written to.
dir_out = 'data'

### Load data

In [6]:
# Train / test tables, both keyed by device_id.
gatrain = pd.read_csv(
    os.path.join(datadir, 'gender_age_train.csv'), index_col='device_id'
)
gatest = pd.read_csv(
    os.path.join(datadir, 'gender_age_test.csv'), index_col='device_id'
)

# Phone table: keep only the first row per device_id (duplicates are dropped),
# then key it by device_id so it aligns with gatrain / gatest.
phone = (
    pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
    .drop_duplicates('device_id', keep='first')
    .set_index('device_id')
)


In [7]:
# Positional row number of every device; used further down as the row
# coordinate when the sparse feature matrices are assembled.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

### Create other features
These features are strongly correlated with brand and device model and might provide helpful interactions with them.

In [8]:
# Derived per-device features. Statement order matters: everything that reads
# the raw 'device_model' text must run before the final line overwrites it.
brand_names = phone['phone_brand']
model_names = phone['device_model']
vowel_pattern = '[aeiouAEIOU]'

# NOTE(review): despite the '_chin' suffix, these flags are 1 when the text
# contains a Latin vowel and 0 otherwise (e.g. 0 for a purely Chinese name).
phone['brand_chin'] = brand_names.str.contains(vowel_pattern).astype(int)
phone['model_chin'] = model_names.str.contains(vowel_pattern).astype(int)

# NOTE(review): the sample output in this notebook shows 6 for two-character
# Chinese brands, which suggests byte lengths (Python 2 str) rather than
# character counts -- confirm before running under Python 3.
phone['brand_len'] = brand_names.str.len()
phone['model_len'] = model_names.str.len()

# Number of distinct device models observed for the device's brand.
phone['n_models'] = (
    phone.groupby('phone_brand')['device_model']
    .transform(lambda models: len(models.unique()))
)

# Prefix the model with its brand (no separator); this REPLACES the raw
# 'device_model' column, so re-running this cell prefixes the brand again.
phone['device_model'] = brand_names.str.cat(model_names)

In [9]:
# Spot-check ten random rows; no random_state is passed, so the rows shown
# differ from run to run.
phone.sample(n=10)

Unnamed: 0_level_0,phone_brand,device_model,brand_chin,model_chin,brand_len,model_len,n_models
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
123706494565281080,金立,金立ELIFE S5.5,0,1,6,10,67
-8745178761610459618,华为,华为G610T,0,0,6,5,145
7093706758594001619,小米,小米红米,0,0,6,6,26
2574225068233010304,酷派,酷派8720L,0,0,6,5,140
-2351899753430517708,金立,金立F103,0,0,6,4,67
-7867917699664686614,小米,小米MI 4,0,1,6,4,26
-6012878024505159932,魅族,魅族魅蓝2,0,0,6,7,16
-5723359257508971431,华为,华为Mate 8,0,1,6,6,145
6766525840603262578,小米,小米MI 4,0,1,6,4,26
9183749737193386211,魅族,魅族MX4,0,0,6,3,16


### Brand features

In [10]:
# Map every brand name to an integer label; the label doubles as the column
# index of the one-hot matrix.
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])

# Index-aligned assignment: each train/test device picks up the label stored
# under the same device_id in `phone`.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# Fix the column count to the number of encoded brands. Without an explicit
# shape csr_matrix infers the width from the largest label present, so train
# and test could end up with different numbers of columns.
n_brands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.brand)),
                       shape=(gatrain.shape[0], n_brands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.brand)),
                       shape=(gatest.shape[0], n_brands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))



Brand features: train shape (74645, 131), test shape (112071, 131)


In [11]:
feature_file = 'features_brand_bag'
# Save features: one pickle per split, named <feature_file><suffix> in dir_out.
for suffix, matrix in (('_train.pickle', Xtr_brand),
                       ('_test.pickle', Xte_brand)):
    with open(os.path.join(dir_out, feature_file + suffix), 'wb') as f:
        pickle.dump(matrix, f, pickle.HIGHEST_PROTOCOL)

### Model features

In [12]:
# Text key identifying a device model within its brand.
# NOTE(review): an earlier cell already overwrote 'device_model' with
# brand + model, so the brand ends up in this key twice. Left unchanged here
# so the label assignment stays the same.
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)

# Index-aligned assignment: each train/test device picks up the label stored
# under the same device_id in `phone`.
gatrain['model'] = phone['model']
gatest['model'] = phone['model']

# Fix the column count to the number of encoded models. Without an explicit
# shape csr_matrix infers the width from the largest label present, so train
# and test could end up with different numbers of columns.
n_models_encoded = len(modelencoder.classes_)
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.model)),
                       shape=(gatrain.shape[0], n_models_encoded))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.model)),
                       shape=(gatest.shape[0], n_models_encoded))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))

Model features: train shape (74645, 1667), test shape (112071, 1667)


In [13]:
# Save features: one pickle per split, named <feature_file><suffix> in dir_out.
feature_file = 'features_brand_model_bag'
for suffix, matrix in (('_train.pickle', Xtr_model),
                       ('_test.pickle', Xte_model)):
    with open(os.path.join(dir_out, feature_file + suffix), 'wb') as f:
        pickle.dump(matrix, f, pickle.HIGHEST_PROTOCOL)

### Store other features

In [14]:
# Name of the CSV that receives the derived (non one-hot) features.
feature_file = 'features_brand_model.csv'

# Columns of `phone` written to that CSV, in output order.
cols_to_save = [
    'brand_chin',
    'model_chin',
    'brand_len',
    'model_len',
    'n_models',
]

In [15]:
# Write only the selected feature columns; the device_id index is written
# along with them (to_csv keeps the index by default).
other_features = phone[cols_to_save]
other_features.to_csv(os.path.join(dir_out, feature_file))

In [16]:
# Final sanity check: overall table size, then a random peek at ten rows.
# print(...) with parentheses prints the same tuple under Python 2 and is
# valid Python 3, matching the print calls used in the other cells.
print(phone.shape)
phone.sample(10)

(186716, 9)


Unnamed: 0_level_0,phone_brand,device_model,brand_chin,model_chin,brand_len,model_len,n_models,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-9162689783589954482,小米,小米红米1S,0,0,6,8,26,51,859
6241952244791867823,三星,三星Galaxy Note 3,0,1,6,13,163,15,348
4541896421140852398,三星,三星Galaxy S3,0,1,6,9,163,15,368
2480272536253170047,三星,三星Galaxy Win,0,1,6,10,163,15,413
-6136453491684066396,华为,华为Mate 7,0,1,6,6,145,31,673
5378161127565812372,华为,华为荣耀4A,0,1,6,8,145,31,734
-8168114287797005561,三星,三星Galaxy S3,0,1,6,9,163,15,368
-7182550678499820874,酷派,酷派8720L,0,0,6,5,140,117,1509
8526518178083328519,魅族,魅族魅蓝NOTE,0,1,6,10,16,128,1659
-1202163322098889967,小米,小米MI 4,0,1,6,4,26,51,848
