In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
phone_brand_device_model = pd.read_csv('data/phone_brand_device_model.csv')
gender_age_train = pd.read_csv('data/gender_age_train.csv')
label_categories = pd.read_csv('data/label_categories.csv')
events = pd.read_csv('data/events.csv')
app_labels = pd.read_csv('data/app_labels.csv')
app_events = pd.read_csv('data/app_events.csv', dtype = {'event_id' : np.int8, 'app_id' : np.int64, 
                                                   'is_installed' : np.int8, 'is_active' : np.int8})
gender_age_test = pd.read_csv('data/gender_age_test.csv')

In [3]:
print('train_deviceid counts : {}' .format(len(gender_age_train.device_id.unique())))
print('test_deviceid  counts : {}' .format(len(gender_age_test.device_id.unique())))
print('events event_id counts : {}' .format(len(events.event_id.unique())))
print('events device_id counts : {}' .format(len(events.device_id.unique())))
print('app_events event_id counts : {}' .format(len(app_events.event_id.unique())))
print('app_events app_id counts : {}' .format(len(app_events.app_id.unique())))

train_deviceid counts : 74645
test_deviceid  counts : 112071
events event_id counts : 3252950
events device_id counts : 60865
app_events event_id counts : 256
app_events app_id counts : 19237


## 브랜드명 전환

In [4]:
english_phone_brands_mapping = {
    "三星": "samsung",
    "天语": "Ktouch",
    "海信": "hisense",
    "联想": "lenovo",
    "欧比": "obi",
    "爱派尔": "ipair",
    "努比亚": "nubia",
    "优米": "youmi",
    "朵唯": "dowe",
    "黑米": "heymi",
    "锤子": "hammer",
    "酷比魔方": "koobee",
    "美图": "meitu",
    "尼比鲁": "nibilu",
    "一加": "oneplus",
    "优购": "yougo",
    "诺基亚": "nokia",
    "糖葫芦": "candy",
    "中国移动": "ccmc",
    "语信": "yuxin",
    "基伍": "kiwu",
    "青橙": "greeno",
    "华硕": "asus",
    "夏新": "panosonic",
    "维图": "weitu",
    "艾优尼": "aiyouni",
    "摩托罗拉": "moto",
    "乡米": "xiangmi",
    "米奇": "micky",
    "大可乐": "bigcola",
    "沃普丰": "wpf",
    "神舟": "hasse",
    "摩乐": "mole",
    "飞秒": "fs",
    "米歌": "mige",
    "富可视": "fks",
    "德赛": "desci",
    "梦米": "mengmi",
    "乐视": "lshi",
    "小杨树": "smallt",
    "纽曼": "newman",
    "邦华": "banghua",
    "E派": "epai",
    "易派": "epai",
    "普耐尔": "pner",
    "欧新": "ouxin",
    "西米": "ximi",
    "海尔": "haier",
    "波导": "bodao",
    "糯米": "nuomi",
    "唯米": "weimi",
    "酷珀": "kupo",
    "谷歌": "google",
    "昂达": "ada",
    "聆韵": "lingyun",
    "小米": "Xiaomi",
    "华为": "Huawei",
    "魅族": "Meizu",
    "中兴": "ZTE",
    "酷派": "Coolpad",
    "金立": "Gionee",
    "SUGAR": "SUGAR",
    "OPPO": "OPPO",
    "vivo": "vivo",
    "HTC": "HTC",
    "LG": "LG",
    "ZUK": "ZUK",
    "TCL": "TCL",
    "LOGO": "LOGO",
    "SUGAR": "SUGAR",
    "Lovme": "Lovme",
    "PPTV": "PPTV",
    "ZOYE": "ZOYE",
    "MIL": "MIL",
    "索尼" : "Sony",
    "欧博信" : "Opssom",
    "奇酷" : "Qiku",
    "酷比" : "CUBE",
    "康佳" : "Konka",
    "亿通" : "Yitong",
    "金星数码" : "JXD",
    "至尊宝" : "Monkey King",
    "百立丰" : "Hundred Li Feng",
    "贝尔丰" : "Bifer",
    "百加" : "Bacardi",
    "诺亚信" : "Noain",
    "广信" : "Kingsun",
    "世纪天元" : "Ctyon",
    "青葱" : "Cong",
    "果米" : "Taobao",
    "斐讯" : "Phicomm",
    "长虹" : "Changhong",
    "欧奇" : "Oukimobile",
    "先锋" : "XFPLAY",
    "台电" : "Teclast",
    "大Q" : "Daq",
    "蓝魔" : "Ramos",
    "奥克斯" : "AUX"
}

phone_brand_device_model.phone_brand = phone_brand_device_model.phone_brand.map(pd.Series(english_phone_brands_mapping), na_action='ignore')

### time saperate year / month / day / hour

In [5]:
events['timestamp'].replace('-', '', regex = True, inplace = True)
events['timestamp'].replace(':', '', regex = True, inplace = True)
events['timestamp'].replace(' ', '', regex = True, inplace = True)

In [6]:
events['year'] = events['timestamp'].map(lambda x: x[0:4])
events['month'] = events['timestamp'].map(lambda x: x[4:6])
events['day'] = events['timestamp'].map(lambda x: x[6:8])
events['time'] = events['timestamp'].map(lambda x: x[8:10])

In [7]:
events.drop(['timestamp'], axis = 1, inplace = True)
events.drop(['year'], axis = 1, inplace = True)

### longitude / latitude mean값으로 변경
* dvice_id에 따른 지역 편차 줄임

In [8]:
event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

In [9]:
event_mean.head()

Unnamed: 0,device_id,longitude,latitude
0,-9222956879900151005,90.592,18.552
1,-9222661944218806987,0.0,0.0
2,-9222399302879214035,0.0,0.0
3,-9221825537663503111,112.300808,33.859091
4,-9221767098072603291,0.0,0.0


In [10]:
events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

In [11]:
events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

In [12]:
events_tude_mean.sort_values(by = 'device_id').head()

Unnamed: 0,event_id,device_id,month,day,time,longitude_y,latitude_y
1510824,661623,-9222956879900151005,5,7,11,90.592,18.552
1510850,2085017,-9222956879900151005,5,6,15,90.592,18.552
1510849,2085015,-9222956879900151005,5,6,15,90.592,18.552
1510848,2084996,-9222956879900151005,5,6,15,90.592,18.552
1510847,2068832,-9222956879900151005,5,7,12,90.592,18.552


### 명목형 feature  - label encoding (brand name)

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()

In [13]:
phone_brand_device_model.phone_brand.dropna().max()

'yuxin'

In [15]:
# 누락된 phone_brand data가 있었다!!!!!!

phone_brand_device_model.phone_brand.fillna(phone_brand_device_model.phone_brand.dropna().max(), inplace = True)


In [16]:
phone_brand_device_model['brand'] = le.fit_transform(phone_brand_device_model['phone_brand'])

In [17]:
phone_brand_device_model.device_model.max()

'黄金斗士青春版'

In [18]:
phone_brand_device_model.device_model.fillna(phone_brand_device_model.device_model.dropna().max(), inplace = True)

In [19]:
phone_brand_device_model['device'] = le.fit_transform(phone_brand_device_model['device_model'])

In [20]:
phone_brand_device_model.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device
0,-8890648629457979026,Xiaomi,红米,37,1517
1,1277779817574759137,Xiaomi,MI 2,37,749
2,5137427614288105724,samsung,Galaxy S4,85,560
3,3669464369358936369,SUGAR,时尚手机,31,1503
4,-5019277647504317457,samsung,Galaxy Note 2,85,536


In [21]:
phone_brand_label = phone_brand_device_model.copy()

In [22]:
phone_brand_label = phone_brand_label.drop('phone_brand', 1)
phone_brand_label = phone_brand_label.drop('device_model', 1)

In [23]:
phone_brand_label.head()

Unnamed: 0,device_id,brand,device
0,-8890648629457979026,37,1517
1,1277779817574759137,37,749
2,5137427614288105724,85,560
3,3669464369358936369,31,1503
4,-5019277647504317457,85,536


In [32]:
# merge1 : train + event

age_even = pd.merge(gender_age_train, events_tude_mean, how = 'left', on = 'device_id')

In [34]:
age_even.shape

(1266931, 10)

In [33]:
age_even.isnull().sum()

device_id          0
gender             0
age                0
group              0
event_id       51336
month          51336
day            51336
time           51336
longitude_y    51336
latitude_y     51336
dtype: int64

In [35]:
age_even.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y
0,-8076087639492063270,M,35,M32-38,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,
2,-8260683887967679142,M,35,M32-38,2479656.0,5.0,1.0,14.0,0.0,0.0
3,-4938849341048082022,M,30,M29-31,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,


In [None]:
# merge 2 : train + event + app_evnets

In [36]:
age_app = pd.merge(age_even, app_events, how = 'left', on = 'event_id')

In [37]:
age_app.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active
0,-8076087639492063270,M,35,M32-38,,,,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,,,,
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,
3,-4938849341048082022,M,30,M29-31,,,,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,,,,


In [None]:
# merge 2 : train + event + app_evnets + app_label

In [38]:
age_label = pd.merge(age_app, app_labels, how='left', on='app_id')

In [39]:
age_label.head()

Unnamed: 0,device_id,gender,age,group,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id
0,-8076087639492063270,M,35,M32-38,,,,,,,,,,
1,-2897161552818060146,M,35,M32-38,,,,,,,,,,
2,-8260683887967679142,M,35,M32-38,2479660.0,5.0,1.0,14.0,0.0,0.0,,,,
3,-4938849341048082022,M,30,M29-31,,,,,,,,,,
4,245133531816851882,M,30,M29-31,,,,,,,,,,
