In [1]:
# import required libraries
import numpy as np 
import pandas as pd 

from sklearn import metrics
from sklearn import preprocessing

from keras.layers import Input, Embedding, Dense, Dropout, concatenate, Flatten, LSTM
from keras.models import Model


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
ip = pd.read_csv("ip.csv")
os = pd.read_csv("os.csv")
dev = pd.read_csv("device.csv")
channel = pd.read_csv("channel.csv")
app = pd.read_csv("app.csv")

In [3]:
le_ip = preprocessing.LabelEncoder()
le_os = preprocessing.LabelEncoder()
le_dev = preprocessing.LabelEncoder()
le_ch = preprocessing.LabelEncoder()
le_app = preprocessing.LabelEncoder()

In [4]:
max_ip  = np.max(le_ip.fit_transform(ip.ip))+1
max_dev = np.max(le_dev.fit_transform(dev.device))+1
max_os  = np.max(le_os.fit_transform(os.os))+1
max_ch  = np.max(le_ch.fit_transform(channel.channel))+1
max_app = np.max(le_app.fit_transform(app.app))+1

In [5]:
# preparing data for deep learning
dtypes = {
        'ip'             : 'uint32',
        'app'            : 'uint16',
        'device'         : 'uint16',
        'os'             : 'uint16',
        'channel'        : 'uint16',
        'is_attributed'  : 'uint8',
        }
train_df = pd.read_csv("train.csv", usecols=['ip','app','device','os', 'channel', 'is_attributed'],dtype=dtypes)
X = {
        'ip': np.array(le_ip.transform(train_df.ip)),
        'os': np.array(le_os.transform(train_df.os)),
        'dev': np.array(le_dev.transform(train_df.device)),
        'ch': np.array(le_ch.transform(train_df.channel)),
        'app': np.array(le_app.transform(train_df.app))
    }
y_train = train_df.is_attributed

In [10]:
emb_n = 10
dense_n = 25

in_ip = Input(shape=[1], name = 'ip')
emb_ip = Embedding(max_ip, emb_n)(in_ip)
in_os = Input(shape=[1], name = 'os')
emb_os = Embedding(max_os, emb_n)(in_os)
in_dev = Input(shape=[1], name = 'dev')
emb_dev = Embedding(max_dev, emb_n)(in_dev)
in_ch = Input(shape=[1], name = 'ch')
emb_ch = Embedding(max_ch, emb_n)(in_ch)
in_app = Input(shape=[1], name = 'app')
emb_app = Embedding(max_app, emb_n)(in_app)
              
x = concatenate([(emb_ip), emb_os, (emb_dev), (emb_ch), (emb_app)])

x = LSTM(units = 25, return_sequences = True, input_shape = (5, 10))(x)
x = Dropout(0.2)(x)
x = LSTM(units = 25, return_sequences = False, input_shape = (5, 10))(x)
x = Dropout(0.2)(x)

x = Dense(dense_n,activation='relu')(x)
x = Dropout(0.2)(x)

outp = Dense(1,activation='sigmoid')(x)

model = Model(inputs=[in_ip,in_app,in_ch,in_dev,in_os], outputs=outp)

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ip (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
dev (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
ch (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
app (Input

In [21]:
#training
batch_size = 2048
class_weight = {0: 1.,
                1: 2.}
validation_split=0.1
epochs=1
model.fit(X, y_train, batch_size=batch_size,validation_split=validation_split, epochs=epochs,shuffle=True)

Train on 166413501 samples, validate on 18490389 samples
Epoch 1/1


<keras.callbacks.History at 0x7ff81afa12b0>

In [22]:
# evaluating
dtypes = {
        'ip'             : 'uint32',
        'app'            : 'uint16',
        'device'         : 'uint16',
        'os'             : 'uint16',
        'channel'        : 'uint16',
        'is_attributed'  : 'uint8',
        }
test_df = pd.read_csv("train.csv",nrows=1000000, usecols=['ip','app','device','os', 'channel', 'is_attributed'],dtype=dtypes)
X_test = {
        'ip': np.array(le_ip.transform(test_df.ip)),
        'os': np.array(le_os.transform(test_df.os)),
        'dev': np.array(le_dev.transform(test_df.device)),
        'ch': np.array(le_ch.transform(test_df.channel)),
        'app': np.array(le_app.transform(test_df.app))
    }
y_test = test_df.is_attributed

y_pred = model.predict(X_test)
# accuracy
cm = metrics.confusion_matrix(y_test, y_pred > 0.5)
print(cm)
# AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test.values+1, y_pred, pos_label=2)
metrics.auc(fpr, tpr)

[[998244     63]
 [  1044    649]]


0.9644379890263491

In [23]:
# Submission
submission_df = pd.read_csv("test.csv",usecols=['ip','app','device','os', 'channel', 'click_id'],dtype=dtypes)
X_submission = {
    'ip': np.array(le_ip.transform(submission_df.ip)),
    'os': np.array(le_os.transform(submission_df.os)),
    'dev': np.array(le_dev.transform(submission_df.device)),
    'ch': np.array(le_ch.transform(submission_df.channel)),
    'app': np.array(le_app.transform(submission_df.app))
}
submission = pd.DataFrame()
submission['click_id'] = submission_df.click_id
submission['is_attributed'] = model.predict(X_submission)

In [None]:
submission.head()

In [24]:
submission.to_csv('submission_12.csv',index=False,float_format='%.5f')