In [1]:
#-*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
phone_brand_device_model = pd.read_csv('data/phone_brand_device_model.csv')
gender_age_train = pd.read_csv('data/gender_age_train.csv')
label_categories = pd.read_csv('data/label_categories.csv')
events = pd.read_csv('data/events.csv')
app_labels = pd.read_csv('data/app_labels.csv')
app_events = pd.read_csv('data/app_events.csv', dtype = {'event_id' : np.int8, 'app_id' : np.int64, 
                                                   'is_installed' : np.int8, 'is_active' : np.int8})
gender_age_test = pd.read_csv('data/gender_age_test.csv')

### TimeStamp Split

In [6]:
events['timestamp'].replace('-', '', regex = True, inplace = True)
events['timestamp'].replace(':', '', regex = True, inplace = True)
events['timestamp'].replace(' ', '', regex = True, inplace = True)

events['year'] = events['timestamp'].map(lambda x: x[0:4])
events['month'] = events['timestamp'].map(lambda x: x[4:6])
events['day'] = events['timestamp'].map(lambda x: x[6:8])
events['time'] = events['timestamp'].map(lambda x: x[8:10])

events.drop(['timestamp'], axis = 1, inplace = True)
events.drop(['year'], axis = 1, inplace = True)

event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

### 명목형 변수 치환

In [11]:
phone_brand_device_model.phone_brand.fillna(phone_brand_device_model.phone_brand.dropna().max(), inplace = True)
phone_brand_device_model.device_model.fillna(phone_brand_device_model.device_model.dropna().max(), inplace = True)

In [12]:
le = LabelEncoder()

phone_brand_device_model['brand'] = le.fit_transform(phone_brand_device_model['phone_brand'])
phone_brand_device_model['device'] = le.fit_transform(phone_brand_device_model['device_model'])



phone_brand_label = phone_brand_device_model.copy()

phone_brand_label = phone_brand_label.drop('phone_brand', 1)
phone_brand_label = phone_brand_label.drop('device_model', 1)

In [13]:
# M = 1, F = 0
gender_age_train['M_F'] = le.fit_transform(gender_age_train['gender'])

In [14]:
# group 치환
gender_age_train['group_e'] = le.fit_transform(gender_age_train['group'])

In [15]:
gender_age_train.head()

Unnamed: 0,device_id,gender,age,group,M_F,group_e
0,-8076087639492063270,M,35,M32-38,1,10
1,-2897161552818060146,M,35,M32-38,1,10
2,-8260683887967679142,M,35,M32-38,1,10
3,-4938849341048082022,M,30,M29-31,1,9
4,245133531816851882,M,30,M29-31,1,9


### test set 기초자료

In [17]:
t1 = pd.merge(gender_age_test, phone_brand_device_model, on = 'device_id')
t2 = pd.merge(t1, events, on = 'device_id')
t3 = pd.merge(t2, app_events, on = 'event_id')
t4 = pd.merge(t3, app_labels, on = 'app_id')


In [19]:

t4 = t4.loc[:, ['device_id', 'event_id', 'app_id', 'label_id', 'brand', 'device', 'time']]

In [20]:
t4.head()

Unnamed: 0,device_id,event_id,app_id,label_id,brand,device,time
0,5317828258152702819,114,-7377004479023402858,549,13,1259,0
1,5317828258152702819,114,-7377004479023402858,721,13,1259,0
2,5317828258152702819,114,-7377004479023402858,704,13,1259,0
3,5317828258152702819,114,-7377004479023402858,302,13,1259,0
4,5317828258152702819,114,-7377004479023402858,303,13,1259,0


### longitude / latitude mean값으로 변경
* dvice_id에 따른 지역 편차 줄임

In [9]:
event_mean = events.groupby('device_id', as_index = False).agg({'longitude' : 'mean', 
                                                   'latitude' : 'mean'})

events_tude_mean = pd.merge(events, event_mean, on = 'device_id')

events_tude_mean = events_tude_mean.drop('longitude_x', 1)
events_tude_mean = events_tude_mean.drop('latitude_x', 1)

### brand 별 연령대

In [10]:
age_brand = pd.merge(phone_brand_device_model, gender_age_train, how = 'left', on = 'device_id')

In [11]:
age_bran_time = pd.merge(age_brand, events, how = 'left', on = 'device_id') 

In [13]:
# 명목 변수 제거

age_bran_time_e = age_bran_time[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time']]

In [14]:
age_bran_time_e.head()

Unnamed: 0,device_id,brand,device,age,M_F,group_e,event_id,longitude,latitude,time
0,-8890648629457979026,51,1517,33.0,1.0,10.0,,,,
1,1277779817574759137,51,749,26.0,1.0,7.0,,,,
2,5137427614288105724,15,560,,,,571915.0,0.0,0.0,8.0
3,5137427614288105724,15,560,,,,1414484.0,0.0,0.0,22.0
4,5137427614288105724,15,560,,,,3049420.0,0.0,0.0,8.0


## 변수 추가
* brand별 연령대
* model별 연령대
* time(hour)별 연령대
* label별 연령대

In [117]:
g_brand = pd.DataFrame()
g_model = pd.DataFrame()

g_brand_tmp = pd.DataFrame()
g_model_tmp = pd.DataFrame()

# phone_brand 평균 연령대

#g_brand['brand_age'] = age_brand.groupby('phone_brand', 'group_e')['age'].mean().astype(np.float32)
#g_brand['brand_gender'] = age_brand.groupby('phone_brand', 'group_e')['M_F'].mean().astype(np.float32)
#g_brand['brand_group'] = age_brand.groupby('phone_brand')['group_e'].mean().astype(np.float32)
g_brand['brand_group_count'] = age_brand.groupby(['phone_brand','group_e'])['group_e'].count()
g_brand_tmp['brand_group_sum'] = age_brand.groupby('phone_brand')['group_e'].sum()

g_brand = g_brand.reset_index()
g_brand_tmp = g_brand_tmp.reset_index()
g_brand_rate = pd.merge(g_brand, g_brand_tmp, how = 'left', on = 'phone_brand')

#g_model['model_age'] = age_brand.groupby('device_model', 'group_e')['age'].mean().astype(np.float32)
#g_model['model_gender'] = age_brand.groupby('device_model', 'group_e')['M_F'].mean().astype(np.float32)
#g_model['model_group'] = age_brand.groupby('device_model')['group_e'].mean().astype(np.float32)
g_model['model_group_count'] = age_brand.groupby(['device_model', 'group_e'])['group_e'].count()
g_model_tmp['model_group_sum'] = age_brand.groupby('device_model')['group_e'].sum()

g_model = g_model.reset_index()
g_model_tmp = g_model_tmp.reset_index()
g_model_rate= pd.merge(g_model, g_model_tmp, how = 'left', on = 'device_model')

g_brand_rate['brand_ratio'] = g_brand_rate.brand_group_count / g_brand_rate.brand_group_sum
g_model_rate['device_ratio'] = g_model_rate.model_group_count / g_model_rate.model_group_sum

In [122]:
g_brand_rate.head()

Unnamed: 0,phone_brand,group_e,brand_group_count,brand_group_sum,brand_ratio
0,E派,0.0,2,57.0,0.035088
1,E派,2.0,2,57.0,0.035088
2,E派,6.0,1,57.0,0.017544
3,E派,8.0,1,57.0,0.017544
4,E派,9.0,1,57.0,0.017544


In [125]:
g_model_rate.head()

Unnamed: 0,device_model,group_e,model_group_count,model_group_sum,device_ratio
0,1100,0.0,1,36.0,0.027778
1,1100,3.0,4,36.0,0.111111
2,1100,7.0,2,36.0,0.055556
3,1100,10.0,1,36.0,0.027778
4,1105,0.0,1,387.0,0.002584


### time별 그룹 비율

In [36]:
g_time_sum = pd.DataFrame()
g_time_sum['time_sum'] = age_bran_time_e.groupby('time')['group_e'].count()

g_tmp = g_tmp.reset_index()

g_time_sum = g_time_sum.reset_index()

g_tmp_ = pd.merge(g_tmp, g_time_sum, on = 'time')

# 시간대별 사용자그룹 분포
g_tmp_['time_g_ratio'] = g_tmp_.group_size / g_tmp_.time_sum

In [126]:
g_tmp_.head()

Unnamed: 0,time,group_e,group_size,time_sum,time_g_ratio
0,0,0.0,1403,45058,0.031138
1,0,1.0,1485,45058,0.032958
2,0,2.0,1339,45058,0.029717
3,0,3.0,1893,45058,0.042013
4,0,4.0,2679,45058,0.059457


### label별 그룹 ratio

In [None]:
age_bran_time_app = pd.merge(age_bran_time, app_events, how = 'left', on = 'event_id')
age_bran_time_app_label = pd.merge(age_bran_time_app, app_labels, how = 'left', on = 'app_id')

In [134]:
g_app_label = pd.DataFrame()
g_app_label_sum = pd.DataFrame()

g_app_label['count'] = age_bran_time_app_label.groupby(['label_id', 'group_e'])['group_e'].count()
g_app_label_sum['sum'] = age_bran_time_app_label.groupby('label_id')['group_e'].sum()

g_app_label = g_app_label.reset_index()
g_app_label_sum = g_app_label_sum.reset_index()

g_label_group = pd.merge(g_app_label, g_app_label_sum, how = 'left', on = 'label_id')

g_label_group['label_ratio'] = g_label_group['count'] / g_label_group['sum']

In [139]:
g_label_group.head()

Unnamed: 0,label_id,group_e,count,sum,label_ratio
0,13.0,0.0,1,332.0,0.003012
1,13.0,2.0,4,332.0,0.012048
2,13.0,3.0,2,332.0,0.006024
3,13.0,5.0,1,332.0,0.003012
4,13.0,6.0,5,332.0,0.01506


In [142]:
age_bran_time_app_label.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,M_F,group_e,event_id,longitude,latitude,month,day,time,app_id,is_installed,is_active,label_id
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,1.0,10.0,,,,,,,,,,
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,1.0,7.0,,,,,,,,,,
2,5137427614288105724,三星,Galaxy S4,15,560,,,,,,571915.0,0.0,0.0,5.0,7.0,8.0,,,,
3,5137427614288105724,三星,Galaxy S4,15,560,,,,,,1414480.0,0.0,0.0,5.0,7.0,22.0,,,,
4,5137427614288105724,三星,Galaxy S4,15,560,,,,,,3049420.0,0.0,0.0,5.0,7.0,8.0,,,,


### MERGE 본체

In [143]:
total = age_bran_time_app_label[['device_id', 'phone_brand', 'device_model', 'brand', 'device', 'M_F','age', 'group_e', 'event_id',
                                'longitude', 'latitude', 'time', 'app_id', 'label_id']]

In [144]:
total_b = pd.merge(total, g_brand_rate, how = 'left', on = ['phone_brand', 'group_e'])

In [146]:
total_m = pd.merge(total_b, g_model_rate, how = 'left', on = ['device_model', 'group_e'])

In [147]:
total_all = pd.merge(total_m, g_label_group, how = 'left', on = ['label_id', 'group_e'])

In [49]:
tmp = pd.merge(age_bran_time_e, g_tmp_, how = 'left', on = ['group_e', 'time'])

#tmp = tmp[['device_id', 'brand', 'device', 'age', 'M_F', 'group_e', 'event_id', 'longitude', 'latitude', 'time', 'time_g_ratio']]

In [156]:
total_all.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,M_F,age,group_e,event_id,longitude,latitude,time,app_id,label_id,brand_group_count,brand_group_sum,brand_ratio,model_group_count,model_group_sum,device_ratio,count,sum,label_ratio
0,-8890648629457979026,小米,红米,51,1517,1.0,33.0,10.0,,,,,,,2147.0,111483.0,0.019259,129.0,6300.0,0.020476,,,
1,1277779817574759137,小米,MI 2,51,749,1.0,26.0,7.0,,,,,,,2440.0,111483.0,0.021887,88.0,3476.0,0.025316,,,
2,5137427614288105724,三星,Galaxy S4,15,560,,,,571915.0,0.0,0.0,8.0,,,,,,,,,,,
3,5137427614288105724,三星,Galaxy S4,15,560,,,,1414480.0,0.0,0.0,22.0,,,,,,,,,,,
4,5137427614288105724,三星,Galaxy S4,15,560,,,,3049420.0,0.0,0.0,8.0,,,,,,,,,,,


In [158]:
total_e = total_all.loc[:,['group_e', 'device_id', 'app_id', 'label_id', 'brand', 'device', 'M_F', 'age',  'time',  'brand_ratio', 
                           'device_ratio', 'label_ratio']]

In [None]:
# train / test 분리

In [159]:
total_train = pd.merge(gender_age_train, total_e, on = 'device_id')
total_test = pd.merge(gender_age_test, total_e, on = 'device_id')

In [160]:
total_train.head()

Unnamed: 0,device_id,gender,age_x,group,M_F_x,group_e_x,group_e_y,app_id,label_id,brand,device,M_F_y,age_y,time,brand_ratio,device_ratio,label_ratio
0,-8076087639492063270,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,,0.019259,0.022727,
1,-2897161552818060146,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,,0.019259,0.022727,
2,-8260683887967679142,M,35,M32-38,1,10,10.0,,,51,749,1.0,35.0,14.0,0.019259,0.022727,
3,-4938849341048082022,M,30,M29-31,1,9,9.0,,,51,1524,1.0,30.0,,0.015841,0.015513,
4,245133531816851882,M,30,M29-31,1,9,9.0,,,51,753,1.0,30.0,,0.015841,0.016329,


In [162]:
total_test.head(300)

Unnamed: 0,device_id,group_e,app_id,label_id,brand,device,M_F,age,time,brand_ratio,device_ratio,label_ratio
0,1002079943728939269,,,,51,1482,,,21.0,,,
1,1002079943728939269,,,,51,1482,,,22.0,,,
2,1002079943728939269,,,,51,1482,,,8.0,,,
3,1002079943728939269,,,,51,1482,,,16.0,,,
4,1002079943728939269,,,,51,1482,,,8.0,,,
5,1002079943728939269,,,,51,1482,,,9.0,,,
6,1002079943728939269,,,,51,1482,,,22.0,,,
7,-1547860181818787117,,,,51,1519,,,20.0,,,
8,-1547860181818787117,,,,51,1519,,,13.0,,,
9,-1547860181818787117,,,,51,1519,,,13.0,,,


### device_id 1개로 줄이기 (보류)

In [157]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
oh = OneHotEncoder(categorical_features = )

In [22]:
age_brand = age_brand.join(g_user, on = 'phone_brand', rsuffix = '_')

In [24]:
age_brand = age_brand.drop('brand_age', 1)

In [25]:
age_brand.head()

Unnamed: 0,device_id,phone_brand,device_model,brand,device,gender,age,group,brand_age_
0,-8890648629457979026,小米,红米,51,1517,M,33.0,M32-38,30.986849
1,1277779817574759137,小米,MI 2,51,749,M,26.0,M23-26,30.986849
2,5137427614288105724,三星,Galaxy S4,15,560,,,,32.966949
3,3669464369358936369,SUGAR,时尚手机,9,1503,,,,28.5
4,-5019277647504317457,三星,Galaxy Note 2,15,536,F,32.0,F29-32,32.966949


### test set 구성

In [None]:
age_even = pd.merge(gender_age_train, events_tude_mean, how = 'left', on = 'device_id')
age_app = pd.merge(age_even, app_events, how = 'left', on = 'event_id')
age_label = pd.merge(age_app, app_labels, how='left', on='app_id')
age_model = pd.merge(age_label, phone_brand_label, how = 'left', on = 'device_id')

train_f = age_model.loc[:, ['device_id', 'label_id', 'brand', 'device']] # time 붙일 수 있다.

X = train_f

age_model['group_label'] = le.fit_transform(age_model['group'])
y = age_model.group_label

X.fillna(-1, inplace = True)
y.fillna(-1, inplace = True)

### test set 구성

In [153]:
test_e = pd.merge(gender_age_test, events_tude_mean, how = 'left', on = 'device_id')
test_app = pd.merge(test_e, app_events, how = 'left', on = 'event_id')
test_app_label = pd.merge(test_app, app_labels, how='left', on = 'app_id')
test_model = pd.merge(test_app_label, phone_brand_label, how = 'left', on = 'device_id')

test_model.fillna(-1, inplace = True)

In [155]:
test_model.head(10)

Unnamed: 0,device_id,event_id,month,day,time,longitude_y,latitude_y,app_id,is_installed,is_active,label_id,brand,device
0,1002079943728939269,460577.0,5,3,21,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
1,1002079943728939269,755837.0,5,5,22,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
2,1002079943728939269,1171252.0,5,2,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
3,1002079943728939269,1805074.0,5,1,16,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
4,1002079943728939269,2145937.0,5,5,8,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
5,1002079943728939269,2774265.0,5,7,9,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
6,1002079943728939269,3127685.0,5,6,22,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1482
7,-1547860181818787117,185516.0,5,3,20,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519
8,-1547860181818787117,202176.0,5,1,13,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519
9,-1547860181818787117,604418.0,5,1,13,0.0,0.0,-1.0,-1.0,-1.0,-1.0,51,1519


## train_test_split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
forest = RandomForestClassifier(criterion = 'gini', n_estimators = 10, random_state = 1, n_jobs = -1)

In [None]:
forest.fit(X_train[['label_id', 'brand', 'device']], y_train)

In [None]:
forest.feature_importances_

In [None]:
result_rf = pd.DataFrame(y_prona_rf_test, columns = le.classes_)
device_id = test_model["device_id"].values
result_rf['device_id'] = device_id  
result_rf = result_rf.set_index('device_id')
result_rf = result_rf.drop_duplicates(['device_id'], keep='first')



# 예측

In [None]:
y_prona_rf_test = forest.predict_proba(test_model[['label_id', 'brand', 'device']])

In [None]:
from sklearn.metrics import accuracy_score