In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
phone_brand_device_model = pd.read_csv('data/phone_brand_device_model.csv')
gender_age_train = pd.read_csv('data/gender_age_train.csv')
label_categories = pd.read_csv('data/label_categories.csv')
events = pd.read_csv('data/events.csv')
app_labels = pd.read_csv('data/app_labels.csv')
app_events = pd.read_csv('data/app_events.csv', dtype = {'event_id' : np.int8, 'app_id' : np.int64, 
                                                   'is_installed' : np.int8, 'is_active' : np.int8})
gender_age_test = pd.read_csv('data/gender_age_test.csv')

### TimeStamp Split

In [3]:
events['timestamp'].replace('-', '', regex = True, inplace = True)
events['timestamp'].replace(':', '', regex = True, inplace = True)
events['timestamp'].replace(' ', '', regex = True, inplace = True)

events['year'] = events['timestamp'].map(lambda x: x[0:4])
events['month'] = events['timestamp'].map(lambda x: x[4:6])
events['day'] = events['timestamp'].map(lambda x: x[6:8])
events['time'] = events['timestamp'].map(lambda x: x[8:10])

events.drop(['timestamp'], axis = 1, inplace = True)
events.drop(['year'], axis = 1, inplace = True)

event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

### 명목형 변수 치환

In [4]:
phone_brand_device_model.phone_brand.fillna(phone_brand_device_model.phone_brand.dropna().max(), inplace = True)
phone_brand_device_model.device_model.fillna(phone_brand_device_model.device_model.dropna().max(), inplace = True)

In [5]:
le = LabelEncoder()

phone_brand_device_model['brand'] = le.fit_transform(phone_brand_device_model['phone_brand'])
phone_brand_device_model['device'] = le.fit_transform(phone_brand_device_model['device_model'])



phone_brand_label = phone_brand_device_model.copy()

phone_brand_label = phone_brand_label.drop('phone_brand', 1)
phone_brand_label = phone_brand_label.drop('device_model', 1)

In [26]:
# M = 1, F = 0
gender_age_train['M_F'] = le.fit_transform(gender_age_train['gender'])

In [32]:
# group 치환
gender_age_train['group_e'] = le.fit_transform(gender_age_train['group'])

In [33]:
gender_age_train.head()

Unnamed: 0,device_id,gender,age,group,M_F,group_e
0,-8076087639492063270,M,35,M32-38,1,10
1,-2897161552818060146,M,35,M32-38,1,10
2,-8260683887967679142,M,35,M32-38,1,10
3,-4938849341048082022,M,30,M29-31,1,9
4,245133531816851882,M,30,M29-31,1,9


### longitude / latitude mean값으로 변경
* dvice_id에 따른 지역 편차 줄임

In [6]:
event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

### brand 별 연령대

In [35]:
age_brand = pd.merge(phone_brand_device_model, gender_age_train, how = 'left', on = 'device_id')

In [46]:
age_bran_time = pd.merge(age_brand, events, how = 'left', on = 'device_id') 

In [84]:
age_bran_time.head(20)

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,M_F,group_e,event_id,longitude,latitude,month,day,time
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,1.0,10.0,,,,,,
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,1.0,7.0,,,,,,
2,5137427614288105724,三星,Galaxy S4,15,560,,,,,,571915.0,0.0,0.0,5.0,7.0,8.0
3,5137427614288105724,三星,Galaxy S4,15,560,,,,,,1414484.0,0.0,0.0,5.0,7.0,22.0
4,5137427614288105724,三星,Galaxy S4,15,560,,,,,,3049420.0,0.0,0.0,5.0,7.0,8.0
5,3669464369358936369,SUGAR,时尚手机,9,1503,,,,,,,,,,,
6,-5019277647504317457,三星,Galaxy Note 2,15,536,F,32.0,F29-32,0.0,3.0,,,,,,
7,3238009352149731868,华为,Mate,31,774,F,30.0,F29-32,0.0,3.0,,,,,,
8,-3883532755183027260,小米,MI 2S,51,752,,,,,,,,,,,
9,-2972199645857147708,华为,G610S,31,432,,,,,,,,,,,


In [48]:
# 명목 변수 제거

age_bran_time_e = age_bran_time[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time']]

In [49]:
age_bran_time_e.head()

Unnamed: 0,device_id,brand,device,age,M_F,group_e,event_id,longitude,latitude,time
0,-8890648629457979026,51,1517,33.0,1.0,10.0,,,,
1,1277779817574759137,51,749,26.0,1.0,7.0,,,,
2,5137427614288105724,15,560,,,,571915.0,0.0,0.0,8.0
3,5137427614288105724,15,560,,,,1414484.0,0.0,0.0,22.0
4,5137427614288105724,15,560,,,,3049420.0,0.0,0.0,8.0


In [51]:
g_brand = pd.DataFrame()
g_model = pd.DataFrame()


In [43]:
# phone_brand 평균 연령대

g_brand['brand_age'] = age_brand.groupby('phone_brand')['age'].mean().astype(np.float32)
g_brand['brand_gender'] = age_brand.groupby('phone_brand')['M_F'].mean().astype(np.float32)
g_brand['brand_group'] = age_brand.groupby('phone_brand')['group_e'].mean().astype(np.float32)

g_model['model_age'] = age_brand.groupby('device_model')['age'].mean().astype(np.float32)
g_model['model_gender'] = age_brand.groupby('device_model')['M_F'].mean().astype(np.float32)
g_model['model_group'] = age_brand.groupby('device_model')['group_e'].mean().astype(np.float32)



In [68]:
# time 별 사용자 age 평균
g_time = pd.DataFrame()
g_time['time_age_median'] = age_bran_time_e.groupby('time')['age'].median().astype(np.float32)
g_time['time_gen'] = age_bran_time_e.groupby('time')['M_F'].mean().astype(np.float32)

In [74]:
g_time['time_group'] = age_bran_time_e.groupby('time')['group_e'].var()

In [122]:
# 시간대별 그룹 사용자 분포
g_tmp = pd.DataFrame()
g_tmp['group_size'] = age_bran_time_e.groupby(['time', 'group_e']).size()

In [116]:
g_time_sum = pd.DataFrame()
g_time_sum['time_sum'] = age_bran_time_e.groupby('time').size()

In [130]:
g_tmp = g_tmp.reset_index()
g_time_sum = g_time_sum.reset_index()

In [131]:
g_tmp_ = pd.merge(g_tmp, g_time_sum, on = 'time')

In [133]:
g_tmp_ = g_tmp_.drop('time_sum_x', 1)

In [135]:
g_tmp_['time_g_ratio'] = g_tmp_.group_size / g_tmp_.time_sum_y

In [146]:
g_tmp_.head(24)

Unnamed: 0,time,group_e,group_size,time_sum_y,time_g_ratio
0,0,0.0,1403,117864,0.011904
1,0,1.0,1485,117864,0.012599
2,0,2.0,1339,117864,0.011361
3,0,3.0,1893,117864,0.016061
4,0,4.0,2679,117864,0.02273
5,0,5.0,2216,117864,0.018801
6,0,6.0,3148,117864,0.026709
7,0,7.0,6546,117864,0.055539
8,0,8.0,3447,117864,0.029246
9,0,9.0,5249,117864,0.044534


### 임시로 뭉쳐 놓음

In [143]:
tmp = pd.merge(age_bran_time_e, g_tmp_, how = 'left', on = ['group_e', 'time'])

In [147]:
tmp = tmp[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time', 'time_g_ratio']]

In [148]:
tmp.head(20)

Unnamed: 0,device_id,brand,device,age,M_F,group_e,event_id,longitude,latitude,time,time_g_ratio
0,-8890648629457979026,51,1517,33.0,1.0,10.0,,,,,
1,1277779817574759137,51,749,26.0,1.0,7.0,,,,,
2,5137427614288105724,15,560,,,,571915.0,0.0,0.0,8.0,
3,5137427614288105724,15,560,,,,1414484.0,0.0,0.0,22.0,
4,5137427614288105724,15,560,,,,3049420.0,0.0,0.0,8.0,
5,3669464369358936369,9,1503,,,,,,,,
6,-5019277647504317457,15,536,32.0,0.0,3.0,,,,,
7,3238009352149731868,31,774,30.0,0.0,3.0,,,,,
8,-3883532755183027260,51,752,,,,,,,,
9,-2972199645857147708,31,432,,,,,,,,


In [151]:
age_bran_time_app = pd.merge(age_bran_time, app_events, how = 'left', on = 'event_id')

In [152]:
age_bran_time_app_label = pd.merge(age_bran_time_app, app_labels, how = 'left', on = 'app_id')

In [153]:
age_bran_time_app_label.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,M_F,group_e,event_id,longitude,latitude,month,day,time,app_id,is_installed,is_active,label_id
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,1.0,10.0,,,,,,,,,,
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,1.0,7.0,,,,,,,,,,
2,5137427614288105724,三星,Galaxy S4,15,560,,,,,,571915.0,0.0,0.0,5.0,7.0,8.0,,,,
3,5137427614288105724,三星,Galaxy S4,15,560,,,,,,1414480.0,0.0,0.0,5.0,7.0,22.0,,,,
4,5137427614288105724,三星,Galaxy S4,15,560,,,,,,3049420.0,0.0,0.0,5.0,7.0,8.0,,,,


In [181]:
g_app_label = pd.DataFrame()
g_app_label['size'] = age_bran_time_app_label.groupby(['brand', 'device', 'label_id'])['label_id'].size()
g_app_label['age_mean'] = age_bran_time_app_label.groupby(['brand', 'device', 'label_id'])['age'].mean(skipna=True)

UnsupportedFunctionCall: numpy operations are not valid with groupby. Use .groupby(...).mean() instead

In [180]:
g_app_label.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,size,age_mean
brand,device,label_id,Unnamed: 3_level_1,Unnamed: 4_level_1
2,288,16.0,3,
2,288,22.0,1,
2,288,31.0,2,
2,288,36.0,3,
2,288,43.0,3,
2,288,46.0,10,
2,288,47.0,10,
2,288,70.0,4,
2,288,88.0,2,
2,288,90.0,3,


In [19]:
g_user.reset_index()

Unnamed: 0,phone_brand,brand_age
0,E人E本,
1,E派,26.9
2,HTC,31.311331
3,LG,31.522522
4,LOGO,40.473682
5,Lovme,25.799999
6,MIL,15.0
7,OPPO,29.456223
8,PPTV,32.333332
9,SUGAR,28.5


In [22]:
age_brand = age_brand.join(g_user, on = 'phone_brand', rsuffix = '_')

In [24]:
age_brand = age_brand.drop('brand_age', 1)

In [25]:
age_brand.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,brand_age_
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,30.986849
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,30.986849
2,5137427614288105724,三星,Galaxy S4,15,560,,,,32.966949
3,3669464369358936369,SUGAR,时尚手机,9,1503,,,,28.5
4,-5019277647504317457,三星,Galaxy Note 2,15,536,F,32.0,F29-32,32.966949


### test set 구성

In [None]:
age_even = pd.merge(gender_age_train, events_tude_mean, how = 'left', on = 'device_id')
age_app = pd.merge(age_even, app_events, how = 'left', on = 'event_id')
age_label = pd.merge(age_app, app_labels, how='left', on='app_id')
age_model = pd.merge(age_label, phone_brand_label, how = 'left', on = 'device_id')

train_f = age_model.loc[:, ['device_id', 'label_id', 'brand', 'device']]

X = train_f

age_model['group_label'] = le.fit_transform(age_model['group'])
y = age_model.group_label

X.fillna(-1, inplace = True)
y.fillna(-1, inplace = True)

### test set 구성

In [None]:
test_e = pd.merge(gender_age_test, events_tude_mean, how = 'left', on = 'device_id')
test_app = pd.merge(test_e, app_events, how = 'left', on = 'event_id')
test_app_label = pd.merge(test_app, app_labels, how='left', on = 'app_id')
test_model = pd.merge(test_app_label, phone_brand_label, how = 'left', on = 'device_id')

test_model.fillna(-1, inplace = True)

## train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
forest = RandomForestClassifier(criterion = 'gini', n_estimators = 10, random_state = 1, n_jobs = -1)

In [None]:
forest.fit(X_train[['label_id', 'brand', 'device']], y_train)

In [None]:
forest.feature_importances_

In [None]:
result_rf = pd.DataFrame(y_prona_rf_test, columns = le.classes_)
device_id = test_model["device_id"].values
result_rf['device_id'] = device_id  
result_rf = result_rf.set_index('device_id')
result_rf = result_rf.drop_duplicates(['device_id'], keep='first')



# 예측

In [None]:
y_prona_rf_test = forest.predict_proba(test_model[['label_id', 'brand', 'device']])

In [None]:
from sklearn.metrics import accuracy_score