In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
phone_brand_device_model = pd.read_csv('data/phone_brand_device_model.csv')
gender_age_train = pd.read_csv('data/gender_age_train.csv')
label_categories = pd.read_csv('data/label_categories.csv')
events = pd.read_csv('data/events.csv')
app_labels = pd.read_csv('data/app_labels.csv')
app_events = pd.read_csv('data/app_events.csv', dtype = {'event_id' : np.int8, 'app_id' : np.int64, 
                                                   'is_installed' : np.int8, 'is_active' : np.int8})
gender_age_test = pd.read_csv('data/gender_age_test.csv')

### TimeStamp Split

In [3]:
events['timestamp'].replace('-', '', regex = True, inplace = True)
events['timestamp'].replace(':', '', regex = True, inplace = True)
events['timestamp'].replace(' ', '', regex = True, inplace = True)

events['year'] = events['timestamp'].map(lambda x: x[0:4])
events['month'] = events['timestamp'].map(lambda x: x[4:6])
events['day'] = events['timestamp'].map(lambda x: x[6:8])
events['time'] = events['timestamp'].map(lambda x: x[8:10])

events.drop(['timestamp'], axis = 1, inplace = True)
events.drop(['year'], axis = 1, inplace = True)



### 명목형 변수 치환

In [4]:
phone_brand_device_model.phone_brand.fillna(phone_brand_device_model.phone_brand.dropna().max(), inplace = True)
phone_brand_device_model.device_model.fillna(phone_brand_device_model.device_model.dropna().max(), inplace = True)

In [5]:
le_br = LabelEncoder()
le_mo = LabelEncoder()


phone_brand_device_model['brand'] = le_br.fit_transform(phone_brand_device_model['phone_brand'])
phone_brand_device_model['device'] = le_mo.fit_transform(phone_brand_device_model['device_model'])



phone_brand_label = phone_brand_device_model.copy()

phone_brand_label = phone_brand_label.drop('phone_brand', 1)
phone_brand_label = phone_brand_label.drop('device_model', 1)

In [6]:
#M = 1, F = 0
le_gen = LabelEncoder()
gender_age_train['M_F'] = le_gen.fit_transform(gender_age_train['gender'])

In [7]:
#group 치환
le_g = LabelEncoder()
gender_age_train['group_e'] = le_g.fit_transform(gender_age_train['group'])

In [8]:
# evnet_id 에서 minus 뗴어 내기

app_events = app_events[(app_events.event_id > 0)]


In [9]:
app_events.shape

(16098186, 4)

### Data set 기초자료 (train + test)

In [10]:
tmp = pd.merge(gender_age_train, phone_brand_label, how='left', on = 'device_id')
tmp_ = pd.merge(gender_age_test, phone_brand_label,  how='left', on = 'device_id')

In [11]:
con = pd.concat([tmp, tmp_])

In [12]:
tmp = pd.merge(events, con, how='left', on='device_id')

In [13]:

labels = pd.merge(app_labels, app_events, on='app_id')


In [14]:
labels = labels[['app_id', 'label_id', 'event_id']]

In [15]:
tmp = tmp[['device_id', 'age', 'M_F', 'group_e', 'brand', 'device', 'event_id', 'time']]

In [16]:
tmp = pd.merge(tmp, labels, how='left', on='event_id')

In [17]:
del labels

In [18]:
tmp = tmp[['device_id', 'age', 'M_F', 'group_e', 'brand', 'device', 'time', 'label_id']]

In [19]:
tmp = tmp.drop_duplicates(subset=['device_id', 'age', 'M_F', 'group_e', 'brand', 'device', 'time', 'label_id'], keep='first')

In [20]:
tmp.shape

(564683, 8)

In [21]:
tmp.head()

Unnamed: 0,device_id,age,M_F,group_e,brand,device,time,label_id
0,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,251.0
3,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,691.0
6,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,405.0
9,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,730.0
12,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,751.0


In [22]:
g_sam = pd.DataFrame()

g_sam['brand_m']= tmp.groupby(['brand'])['M_F'].mean()
g_sam['brand_g']= tmp.groupby(['brand'])['group_e'].mean()

g_sam.reset_index()

tmp['brand_m'] = tmp.brand.map(g_sam.brand_m)
tmp['brand_g'] = tmp.brand.map(g_sam.brand_g)

In [23]:
g_d = pd.DataFrame()

g_d['device_m'] = tmp.groupby(['device'])['M_F'].mean()
g_d['device_g']= tmp.groupby(['device'])['group_e'].mean()

g_d.reset_index()

tmp['device_m'] = tmp.device.map(g_d.device_m)
tmp['device_g'] = tmp.device.map(g_d.device_g)

In [24]:
g_t = pd.DataFrame()

g_t['time_m'] = tmp.groupby(['time'])['M_F'].mean()
g_t['time_g']= tmp.groupby(['time'])['group_e'].mean()

g_t.reset_index()

tmp['time_m'] = tmp.time.map(g_t.time_m)
tmp['time_g'] = tmp.time.map(g_t.time_g)

In [25]:
g_l = pd.DataFrame()

g_l['label_m'] = tmp.groupby(['label_id'])['M_F'].mean()
g_l['label_g']= tmp.groupby(['label_id'])['group_e'].mean()

g_l.reset_index()

tmp['label_m'] = tmp.label_id.map(g_l.label_m)
tmp['label_g'] = tmp.label_id.map(g_l.label_g)

In [26]:
tmp.head()

Unnamed: 0,device_id,age,M_F,group_e,brand,device,time,label_id,brand_m,brand_g,device_m,device_g,time_m,time_g,label_m,label_g
0,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,251.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444
3,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,691.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444
6,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,405.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444
9,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,730.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444
12,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,751.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444


In [24]:
tmp.shape

(564683, 16)

### test set 기초자료

In [7]:
t1 = pd.merge(gender_age_test, phone_brand_device_model, on = 'device_id')
t2 = pd.merge(t1, events, on = 'device_id')
t3 = pd.merge(t2, app_events, on = 'event_id')
t4 = pd.merge(t3, app_labels, on = 'app_id')


In [13]:
t4 = t4.loc[:, ['device_id', 'event_id', 'app_id', 'label_id', 'phone_brand', 'device_model', 'time']]

In [14]:
del t1, t2, t3

In [20]:
t4.head()

Unnamed: 0,device_id,event_id,app_id,label_id,brand,device,time
0,5317828258152702819,114,-7377004479023402858,549,13,1259,0
1,5317828258152702819,114,-7377004479023402858,721,13,1259,0
2,5317828258152702819,114,-7377004479023402858,704,13,1259,0
3,5317828258152702819,114,-7377004479023402858,302,13,1259,0
4,5317828258152702819,114,-7377004479023402858,303,13,1259,0


### brand 별 연령대 (미사용)

In [23]:
age_brand = pd.merge(phone_brand_device_model, gender_age_train, how = 'left', on = 'device_id')

In [24]:
age_bran_time = pd.merge(age_brand, events, how = 'left', on = 'device_id') 

In [25]:
# 명목 변수 제거

age_bran_time_e = age_bran_time[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time']]

In [26]:
age_bran_time_e.head()

Unnamed: 0,device_id,brand,device,age,M_F,group_e,event_id,longitude,latitude,time
0,-8890648629457979026,51,1517,33.0,1.0,10.0,,,,
1,1277779817574759137,51,749,26.0,1.0,7.0,,,,
2,5137427614288105724,15,560,,,,571915.0,0.0,0.0,8.0
3,5137427614288105724,15,560,,,,1414484.0,0.0,0.0,22.0
4,5137427614288105724,15,560,,,,3049420.0,0.0,0.0,8.0


## 변수 추가
* brand별 연령대
* model별 연령대
* time(hour)별 연령대
* label별 연령대

In [27]:
tmp_sam = tmp[['device_id', 'age', 'M_F', 'group_e', 'brand', 'device', 'time', 'label_id']]

In [28]:
g_brand = pd.DataFrame()
g_model = pd.DataFrame()

g_brand_tmp = pd.DataFrame()
g_model_tmp = pd.DataFrame()

# phone_brand 평균 연령대

g_brand['brand_group_count'] = tmp_sam.groupby(['brand','group_e'])['group_e'].count()
g_brand_tmp['brand_group_sum'] = tmp_sam.groupby('brand')['group_e'].sum()

g_brand = g_brand.reset_index()
g_brand_tmp = g_brand_tmp.reset_index()
g_brand_rate = pd.merge(g_brand, g_brand_tmp, how = 'left', on = 'brand')

g_model['model_group_count'] = tmp_sam.groupby(['device', 'group_e'])['group_e'].count()
g_model_tmp['model_group_sum'] = tmp_sam.groupby('device')['group_e'].sum()

g_model = g_model.reset_index()
g_model_tmp = g_model_tmp.reset_index()
g_model_rate= pd.merge(g_model, g_model_tmp, how = 'left', on = 'device')

g_brand_rate['brand_ratio'] = g_brand_rate.brand_group_count / g_brand_rate.brand_group_sum
g_model_rate['device_ratio'] = g_model_rate.model_group_count / g_model_rate.model_group_sum

In [29]:
g_brand_rate = g_brand_rate.drop(['brand_group_count', 'brand_group_sum'], 1)

g_brand_rate['brand_ratio'] = g_brand_rate.group_e + g_brand_rate.brand_ratio

g_brand_rate = g_brand_rate.set_index(['brand', 'group_e']).unstack('group_e')

tmp = tmp.join(g_brand_rate, how='left')

#tmp['g_brand_rate'] = tmp['brand', 'group_e'].map(g_brand_rate.brand_ratio)



In [37]:
g_model_rate.head()

Unnamed: 0,device,group_e,model_group_count,model_group_sum,device_ratio
0,0.0,3.0,22,66.0,0.333333
1,1.0,2.0,1,935.0,0.00107
2,1.0,3.0,5,935.0,0.005348
3,1.0,4.0,21,935.0,0.02246
4,1.0,6.0,1,935.0,0.00107


In [30]:
g_model_rate = g_model_rate.drop(['model_group_count', 'model_group_sum'], 1)

g_model_rate['device_ratio'] = g_model_rate.group_e + g_model_rate.device_ratio

g_model_rate = g_model_rate.set_index(['device', 'group_e']).unstack('group_e')

tmp = tmp.join(g_model_rate, how='left')



### time별 그룹 비율

In [33]:
g_time_sum = pd.DataFrame()
g_tmp = pd.DataFrame()

g_tmp['group_size'] = tmp_sam.groupby(['time', 'group_e'])['group_e'].size()
g_time_sum['time_sum'] = tmp_sam.groupby('time')['group_e'].count()

g_tmp = g_tmp.reset_index()

g_time_sum = g_time_sum.reset_index()

g_time_rate = pd.merge(g_tmp, g_time_sum, on = 'time')

# 시간대별 사용자그룹 분포
g_time_rate['time_ratio'] = g_time_rate.group_size / g_time_rate.time_sum

In [34]:
g_time_rate = g_time_rate.drop(['group_size', 'time_sum'], 1)

g_time_rate['time_ratio'] = g_time_rate.group_e + g_time_rate.time_ratio

g_time_rate = g_time_rate.set_index(['time', 'group_e']).unstack('group_e')

tmp = tmp.join(g_time_rate, how='left')



### label별 그룹 ratio

In [35]:
g_app_label = pd.DataFrame()
g_app_label_sum = pd.DataFrame()

g_app_label['count'] = tmp_sam.groupby(['label_id', 'group_e'])['group_e'].count()
g_app_label_sum['sum'] = tmp_sam.groupby('label_id')['group_e'].sum()

g_app_label = g_app_label.reset_index()
g_app_label_sum = g_app_label_sum.reset_index()

g_label_group = pd.merge(g_app_label, g_app_label_sum, how = 'left', on = 'label_id')

g_label_group['label_ratio'] = g_label_group['count'] / g_label_group['sum']

In [36]:
g_label_group = g_label_group.drop(['count', 'sum'], 1)

In [37]:
g_label_group['label_ratio'] = g_label_group.group_e + g_label_group.label_ratio

g_label_group = g_label_group.set_index(['label_id', 'group_e']).unstack('group_e')

tmp = tmp.join(g_label_group, how='left')



In [38]:
tmp.head()

Unnamed: 0,device_id,age,M_F,group_e,brand,device,time,label_id,brand_m,brand_g,device_m,device_g,time_m,time_g,label_m,label_g,"(brand_ratio, 0.0)","(brand_ratio, 1.0)","(brand_ratio, 2.0)","(brand_ratio, 3.0)","(brand_ratio, 4.0)","(brand_ratio, 5.0)","(brand_ratio, 6.0)","(brand_ratio, 7.0)","(brand_ratio, 8.0)","(brand_ratio, 9.0)","(brand_ratio, 10.0)","(brand_ratio, 11.0)","(device_ratio, 0.0)","(device_ratio, 1.0)","(device_ratio, 2.0)","(device_ratio, 3.0)","(device_ratio, 4.0)","(device_ratio, 5.0)","(device_ratio, 6.0)","(device_ratio, 7.0)","(device_ratio, 8.0)","(device_ratio, 9.0)","(device_ratio, 10.0)","(device_ratio, 11.0)","(time_ratio, 0.0)","(time_ratio, 1.0)","(time_ratio, 2.0)","(time_ratio, 3.0)","(time_ratio, 4.0)","(time_ratio, 5.0)","(time_ratio, 6.0)","(time_ratio, 7.0)","(time_ratio, 8.0)","(time_ratio, 9.0)","(time_ratio, 10.0)","(time_ratio, 11.0)","(label_ratio, 0.0)","(label_ratio, 2.0)","(label_ratio, 3.0)","(label_ratio, 5.0)","(label_ratio, 6.0)","(label_ratio, 7.0)","(label_ratio, 8.0)","(label_ratio, 9.0)","(label_ratio, 10.0)","(label_ratio, 11.0)"
0,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,251.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444,,,,,,,,,,,,,,,,3.333333,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,691.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444,,1.002399,2.074696,3.004159,4.005918,5.00096,6.002879,7.013436,8.010077,9.015035,10.028471,11.017754,,1.104167,,,4.114583,,,7.0625,,,,,,,,,,,,,,,,,,,,,,,,,,
6,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,405.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444,,,,,,,,,,,,,,,2.001965,,4.003929,,6.008841,7.006876,8.018664,9.000982,10.06778,11.003929,,,,,,,,,,,,,0.005814,2.008721,3.005814,5.005814,6.008721,7.017442,8.017442,9.017442,10.014535,11.02907
9,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,730.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.005814,2.008721,3.005814,5.005814,6.008721,7.017442,8.017442,9.017442,10.014535,11.02907
12,29182687948017175,46.0,1.0,11.0,51.0,1524.0,0,751.0,0.704919,7.088218,0.666092,7.270231,0.770939,7.373082,0.8,7.644444,,,2.007484,3.000312,4.004989,5.004365,6.000312,7.020892,8.009978,9.016526,10.020892,11.032429,,,,,4.25,,,,,,,,,,,,,,,,,,,,0.005814,2.008721,3.005814,5.005814,6.008721,7.017442,8.017442,9.017442,10.014535,11.02907


### MERGE 본체

In [45]:
total = age_bran_time_app_label[['device_id', 'phone_brand', 'device_model', 'brand', 'device', 'M_F','age', 'group_e', 'event_id',
                                'longitude', 'latitude', 'time', 'app_id', 'label_id']]

In [46]:
total_b = pd.merge(total, g_brand_rate, how = 'left', on = ['phone_brand', 'group_e'])

In [47]:
total_m = pd.merge(total_b, g_model_rate, how = 'left', on = ['device_model', 'group_e'])

In [48]:
total_all = pd.merge(total_m, g_label_group, how = 'left', on = ['label_id', 'group_e'])

In [49]:
tmp = pd.merge(age_bran_time_e, g_tmp_, how = 'left', on = ['group_e', 'time'])

#tmp = tmp[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time', 'time_g_ratio']]

In [156]:
total_all.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,M_F,age,group_e,event_id,longitude,latitude,time,app_id,label_id,brand_group_count,brand_group_sum,brand_ratio,model_group_count,model_group_sum,device_ratio,count,sum,label_ratio
0,-8890648629457979026,小米,红米,51,1517,1.0,33.0,10.0,,,,,,,2147.0,111483.0,0.019259,129.0,6300.0,0.020476,,,
1,1277779817574759137,小米,MI 2,51,749,1.0,26.0,7.0,,,,,,,2440.0,111483.0,0.021887,88.0,3476.0,0.025316,,,
2,5137427614288105724,三星,Galaxy S4,15,560,,,,571915.0,0.0,0.0,8.0,,,,,,,,,,,
3,5137427614288105724,三星,Galaxy S4,15,560,,,,1414480.0,0.0,0.0,22.0,,,,,,,,,,,
4,5137427614288105724,三星,Galaxy S4,15,560,,,,3049420.0,0.0,0.0,8.0,,,,,,,,,,,


In [53]:
total_e = total_all.loc[:,['group_e', 'device_id', 'app_id', 'label_id', 'brand', 'device', 'M_F', 'age',  'time',  'brand_ratio', 
                           'device_ratio', 'label_ratio']]

MemoryError: 

In [None]:
# train / test 분리

In [159]:
total_train = pd.merge(gender_age_train, total_e, on = 'device_id')
total_test = pd.merge(gender_age_test, total_e, on = 'device_id')

In [160]:
total_train.head()

Unnamed: 0,device_id,gender,age_x,group,M_F_x,group_e_x,group_e_y,app_id,label_id,brand,device,M_F_y,age_y,time,brand_ratio,device_ratio,label_ratio
0,-8076087639492063270,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,,0.019259,0.022727,
1,-2897161552818060146,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,,0.019259,0.022727,
2,-8260683887967679142,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,14.0,0.019259,0.022727,
3,-4938849341048082022,M,30,M29-31,1,9,9.0,,,51,1524,1.0,30.0,,0.015841,0.015513,
4,245133531816851882,M,30,M29-31,1,9,9.0,,,51,753,1.0,30.0,,0.015841,0.016329,


In [22]:
total_test.head(300)

NameError: name 'total_test' is not defined

In [21]:
total_test.isnull().sum()

NameError: name 'total_test' is not defined

### device_id 1개로 줄이기 (보류)

In [157]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
oh = OneHotEncoder(categorical_features = )

In [22]:
age_brand = age_brand.join(g_user, on = 'phone_brand', rsuffix = '_')

In [24]:
age_brand = age_brand.drop('brand_age', 1)

In [25]:
age_brand.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,brand_age_
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,30.986849
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,30.986849
2,5137427614288105724,三星,Galaxy S4,15,560,,,,32.966949
3,3669464369358936369,SUGAR,时尚手机,9,1503,,,,28.5
4,-5019277647504317457,三星,Galaxy Note 2,15,536,F,32.0,F29-32,32.966949


### test set 구성

In [None]:
age_even = pd.merge(gender_age_train, events_tude_mean, how = 'left', on = 'device_id')
age_app = pd.merge(age_even, app_events, how = 'left', on = 'event_id')
age_label = pd.merge(age_app, app_labels, how='left', on='app_id')
age_model = pd.merge(age_label, phone_brand_label, how = 'left', on = 'device_id')

train_f = age_model.loc[:, ['device_id', 'label_id', 'brand', 'device']] # time 붙일 수 있다.

X = train_f

age_model['group_label'] = le.fit_transform(age_model['group'])
y = age_model.group_label

X.fillna(-1, inplace = True)
y.fillna(-1, inplace = True)

### test set 구성

In [153]:
test_e = pd.merge(gender_age_test, events_tude_mean, how = 'left', on = 'device_id')
test_app = pd.merge(test_e, app_events, how = 'left', on = 'event_id')
test_app_label = pd.merge(test_app, app_labels, how='left', on = 'app_id')
test_model = pd.merge(test_app_label, phone_brand_label, how = 'left', on = 'device_id')

test_model.fillna(-1, inplace = True)

In [155]:
test_model.head(10)

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
2,1002079943728939269,1171252.0,5,2,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
3,1002079943728939269,1805074.0,5,1,16,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
4,1002079943728939269,2145937.0,5,5,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
5,1002079943728939269,2774265.0,5,7,9,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
6,1002079943728939269,3127685.0,5,6,22,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
7,-1547860181818787117,185516.0,5,3,20,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519
8,-1547860181818787117,202176.0,5,1,13,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519
9,-1547860181818787117,604418.0,5,1,13,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519


## train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
forest = RandomForestClassifier(criterion = 'gini', n_estimators = 10, random_state = 1, n_jobs = -1)

In [None]:
forest.fit(X_train[['label_id', 'brand', 'device']], y_train)

In [None]:
forest.feature_importances_

In [None]:
result_rf = pd.DataFrame(y_prona_rf_test, columns = le.classes_)
device_id = test_model["device_id"].values
result_rf['device_id'] = device_id  
result_rf = result_rf.set_index('device_id')
result_rf = result_rf.drop_duplicates(['device_id'], keep='first')



# 예측

In [None]:
y_prona_rf_test = forest.predict_proba(test_model[['label_id', 'brand', 'device']])

In [None]:
from sklearn.metrics import accuracy_score