# TalkingData AdTracking 데이터의 탐색적 분석 (Exploratory data analysis)
### 0. 데이터 필드 정보
  - `ip`: 클릭한 폰의 IP 주소
  - `app`: 광고가 켜지는 앱의 ID
  - `device`: 모바일 폰 타입
  - `os`: 모바일 폰의 OS 버전
  - `channel`: 광고 채널(e.g. facebook, google ad 등등)
  - `click_time`: 광고를 클릭한 시간(UTC)
  - `attributed_time`: 만약 유저가 앱을 다운로드 하였으면, 다운로드한 시간
  - `is_attributed`: 앱의 실제 다운로드 여부를 나타냄(실제 추정 값)

### Background and objectives
  #### BUILD AN ALGORITHM THAT PREDICTS WHETHER A USER WILL DOWNLOAD AN APP AFTER CLICKING A MOBILE APP AD.
  - misleading click data and wasted money
  - Ad channels can drive up costs by simply clicking on the ad at a large scale.
  - 3 billion clicks/day, 90% are potentialy fraudulent.
    - previous: measure the journey of a user's click across their portfolio, flag IP addresses who produce lots of clicks
    - IP blacklists and device blacklist  

%%html
<!-- 에디터 폰트를 조정합니다. -->
<style type='text/css'>
.CodeMirror{
    font-size: 16px;
    font-family: Myriad Pro;
    font-weight: bold;
}
</style>

### Show data from the inputs

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir("./data"))
import gc

['train_sample.csv.zip', 'sample_submission.csv.zip', 'test.csv', 'mnt', 'train.csv.zip', 'train.csv', 'test.csv.zip', 'talkingdata-adtracking-jongmin.ipynb']


### path and data type definitions

In [3]:
path = './data/'
dtypes = {
    'ip'      : 'uint32',
    'app'     : 'uint16',
    'device'  : 'uint16',
    'os'      : 'uint16',
    'channel' : 'uint16',
    'is_attributed' : 'uint8',
    'click_id' : 'uint32'
    }

### import train data

In [4]:
train_cols = ['ip','app','device','os', 'channel', 'click_time', 'is_attributed']
test_cols = ['ip','app','device','os', 'channel', 'click_time', 'click_id']
train_data = pd.read_csv(path+'train.csv', dtype=dtypes, skiprows = range(1,131886954), usecols=train_cols)
test_data = pd.read_csv(path+"test.csv", dtype=dtypes, usecols=test_cols)
len_train = len(train_data)  #test data 와 train data를 섞어서 쓴다.
train_data = train_data.append(test_data)
print( len(train_data), len_train)
del test_data; 
gc.collect() #garbage collections
#skiprows: skip rows from the beginning of the file

71807406 53016937


25

In [5]:
train_data.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os
0,11,487,,2017-11-09 00:00:00,1,201143,0.0,13
1,2,469,,2017-11-09 00:00:00,1,34684,0.0,13
2,26,477,,2017-11-09 00:00:00,1,207368,0.0,19
3,18,121,,2017-11-09 00:00:00,1,110176,0.0,8
4,12,265,,2017-11-09 00:00:00,1,109644,0.0,19


#### hour, day, wday
- click time을 hour, day, wday(요일)로 나누었다.

In [6]:
train_data['hour'] = pd.to_datetime(train_data.click_time).dt.hour.astype('uint8')
train_data['day'] = pd.to_datetime(train_data.click_time).dt.day.astype('uint8')
train_data['wday'] = pd.to_datetime(train_data.click_time).dt.dayofweek.astype('uint8')

#### 파생 변수 만들기
- ip-day-hour combination
- ip-app combination
- ip-app-os combination

In [7]:
gp = train_data[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'qty'})
train_data = train_data.merge(gp, on=['ip','day','hour'], how='left')

In [8]:
gp.head()
del gp; gc.collect()

173

In [9]:
train_data.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os,hour,day,wday,qty
0,11,487,,2017-11-09 00:00:00,1,201143,0.0,13,0,9,3,70
1,2,469,,2017-11-09 00:00:00,1,34684,0.0,13,0,9,3,54
2,26,477,,2017-11-09 00:00:00,1,207368,0.0,19,0,9,3,101
3,18,121,,2017-11-09 00:00:00,1,110176,0.0,8,0,9,3,146
4,12,265,,2017-11-09 00:00:00,1,109644,0.0,19,0,9,3,393


In [10]:
#grouping by ip-app combination
gp = train_data[['ip','app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_count'})
train_data = train_data.merge(gp, on=['ip','app'], how='left')
del gp; gc.collect()

53

In [11]:
# print('group by ip-app-os combination....')
gp = train_data[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os_count'})
train_data = train_data.merge(gp, on=['ip','app', 'os'], how='left')
del gp; gc.collect()

117

In [12]:
# 변수 타입 정리하기
train_data['qty'] = train_data['qty'].astype('uint16')
train_data['ip_app_count'] = train_data['ip_app_count'].astype('uint16')
train_data['ip_app_os_count'] = train_data['ip_app_os_count'].astype('uint16')

In [13]:
# label encoding
from sklearn.preprocessing import LabelEncoder
train_data[['app','device','os', 'channel', 'hour', 'day', 'wday']].apply(LabelEncoder().fit_transform)
# print ('final part of preparation....')
test_data = train_data[len_train:]
train_data = train_data[:len_train]
y_train = train_data['is_attributed'].values
train_data.drop(['click_id', 'click_time','ip','is_attributed'],1,inplace=True)

Neural Network
- Get max from each columns
- Model Configuration
    - Embedding layers: Turns positive integers (indexes) into dense vectors of fixed size. eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
    - SpatialDropout1D
    - 2 x Droupout
    - Dense
    Model

In [15]:
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate
from keras.layers import BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [16]:
import keras.backend.tensorflow_backend as K

In [17]:
max_app = np.max([train_data['app'].max(), test_data['app'].max()])+1
max_ch = np.max([train_data['channel'].max(), test_data['channel'].max()])+1
max_dev = np.max([train_data['device'].max(), test_data['device'].max()])+1
max_os = np.max([train_data['os'].max(), test_data['os'].max()])+1
max_h = np.max([train_data['hour'].max(), test_data['hour'].max()])+1
max_d = np.max([train_data['day'].max(), test_data['day'].max()])+1
max_wd = np.max([train_data['wday'].max(), test_data['wday'].max()])+1
max_qty = np.max([train_data['qty'].max(), test_data['qty'].max()])+1
max_c1 = np.max([train_data['ip_app_count'].max(), test_data['ip_app_count'].max()])+1
max_c2 = np.max([train_data['ip_app_os_count'].max(), test_data['ip_app_os_count'].max()])+1

In [18]:
print( max_app, max_ch, max_dev, max_os, max_h, max_d, max_wd, max_qty, max_c1, max_c2)

769 501 4228 957 17 11 5 43959 65409 16654


In [19]:
def get_keras_data(dataset):
    X = {
        'app': np.array(dataset.app),
        'ch': np.array(dataset.channel),
        'dev': np.array(dataset.device),
        'os': np.array(dataset.os),
        'h': np.array(dataset.hour),
        'd': np.array(dataset.day),
        'wd': np.array(dataset.wday),
        'qty': np.array(dataset.qty),
        'c1': np.array(dataset.ip_app_count),
        'c2': np.array(dataset.ip_app_os_count)
    }
    return X

In [20]:
train_df = get_keras_data(train_data)

with K.tf.device('/device:GPU:2'):
    emb_n = 50
    dense_n = 1000
    in_app = Input(shape=[1], name = 'app')
    emb_app = Embedding(max_app, emb_n)(in_app)
    in_ch = Input(shape=[1], name = 'ch')
    emb_ch = Embedding(max_ch, emb_n)(in_ch)
    in_dev = Input(shape=[1], name = 'dev')
    emb_dev = Embedding(max_dev, emb_n)(in_dev)
    in_os = Input(shape=[1], name = 'os')
    emb_os = Embedding(max_os, emb_n)(in_os)
    in_h = Input(shape=[1], name = 'h')
    emb_h = Embedding(max_h, emb_n)(in_h) 
    in_d = Input(shape=[1], name = 'd')
    emb_d = Embedding(max_d, emb_n)(in_d) 
    in_wd = Input(shape=[1], name = 'wd')
    emb_wd = Embedding(max_wd, emb_n)(in_wd) 
    in_qty = Input(shape=[1], name = 'qty')
    emb_qty = Embedding(max_qty, emb_n)(in_qty) 
    in_c1 = Input(shape=[1], name = 'c1')
    emb_c1 = Embedding(max_c1, emb_n)(in_c1) 
    in_c2 = Input(shape=[1], name = 'c2')
    emb_c2 = Embedding(max_c2, emb_n)(in_c2) 
    fe = concatenate([(emb_app), (emb_ch), (emb_dev), (emb_os), (emb_h), 
                     (emb_d), (emb_wd), (emb_qty), (emb_c1), (emb_c2)])
    s_dout = SpatialDropout1D(0.2)(fe) # drops entire 1D feature maps instead of individual elements
    x = Flatten()(s_dout) 
    x = Dropout(0.2)(Dense(dense_n,activation='relu')(x))
    x = Dropout(0.2)(Dense(dense_n,activation='relu')(x))
    outp = Dense(1,activation='sigmoid')(x)
    model = Model(inputs=[in_app,in_ch,in_dev,in_os,in_h,in_d,in_wd,in_qty,in_c1,in_c2], outputs=outp)

    batch_size = 20000
    epochs = 2
    exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
    steps = int(len(train_df) / batch_size) * epochs
    lr_init, lr_fin = 0.001, 0.0001
    lr_decay = exp_decay(lr_init, lr_fin, steps)
    optimizer_adam = Adam(lr=0.001, decay=lr_decay)
    model.compile(loss='binary_crossentropy',optimizer=optimizer_adam,metrics=[''])
    model.summary()
    model.fit(train_df, y_train, batch_size=batch_size, epochs=2, shuffle=True, verbose=2)
    del train_df, y_train; gc.collect()
    model.save_weights('dl_support.h5')

    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')
    test_df.drop(['click_id', 'click_time','ip','is_attributed'],1,inplace=True)
    test_df = get_keras_data(test_df)

    print("predicting....")
    sub['is_attributed'] = model.predict(test_df, batch_size=batch_size, verbose=2)
    del test_df; gc.collect()
    print("writing....")
    sub.to_csv('dl_support.csv',index=False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
app (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
ch (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
dev (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
h (InputLa

ImportError: `save_weights` requires h5py.