@Copyright IQIYI 2021
http://challenge.ai.iqiyi.com/

In [2]:
import json
import math
import os

import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Directory that holds train_data.txt / test_data.txt and receives the saved
# model and the submission file. Set the IQIYI_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because the cells
# below build file paths with plain string concatenation (data_dir + name).
data_dir = os.path.join(
    os.environ.get("IQIYI_DATA_DIR", "D:/Cooperation-qin/CCF/用户留存预测挑战赛/爱奇艺官方/"),
    "",
)

In [4]:
class DataGenerator:
    """Endless mini-batch generator over a DataFrame of user records.

    Every row must expose the attributes ``launch_seq``, ``playtime_seq``,
    ``duration_prefer``, ``interact_prefer`` and ``label``. Eleven further
    scalar features are read positionally from ``itertuples()`` positions
    7..17 (position 0 is the index), so the column order of ``df`` matters.
    """

    def __init__(self, df, batch_size):
        """Remember the frame to iterate over and the batch size.

        Args:
            df: DataFrame with the columns described on the class. Its index
                labels are ignored, so slices with any index are fine.
            batch_size: Maximum number of rows per yielded batch.
        """
        self.data = df
        self.num = df.shape[0]
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches produced by one pass over the data."""
        return math.ceil(self.num / self.batch_size)

    def __iter__(self):
        """Yield ``((seq_batch, feature_batch), labels)`` tuples forever.

        One pass over the frame produces exactly ``len(self)`` batches; the
        last batch of a pass may be smaller than ``batch_size``. After a pass
        the generator starts over from the first row.

        ``seq_batch`` stacks ``[launch_seq, playtime_seq]`` per row and is
        transposed so the two sequences become the last axis;
        ``feature_batch`` is the concatenation of ``duration_prefer``,
        ``interact_prefer`` and the eleven positional scalar columns.

        An empty frame yields nothing (the generator finishes immediately).
        """
        if self.num == 0:
            # Without this guard the endless loop below would spin forever
            # without ever yielding a batch.
            return
        while True:
            input_1, input_2, output = [], [], []
            # Count rows by position, not by index label: a slice such as
            # data.loc[30001:] keeps its original labels, so comparing the
            # label with self.num - 1 would flush at the wrong row and drop
            # the real tail of the data.
            for position, row in enumerate(self.data.itertuples(), start=1):
                seq = [row.launch_seq, row.playtime_seq]
                fea = row.duration_prefer + row.interact_prefer + list(row[7:18])
                input_1.append(np.array(seq))
                input_2.append(np.array(fea))
                output.append(row.label)
                if len(input_1) == self.batch_size or position == self.num:
                    # (batch, 2, seq_len) -> (batch, seq_len, 2)
                    input_1 = np.array(input_1).transpose([0, 2, 1])
                    input_2 = np.array(input_2)
                    output = np.array(output)
                    yield (input_1, input_2), output
                    input_1, input_2, output = [], [], []

In [5]:
def build_model(seq_feature_num, seq_len, feature_num):
    """Build the two-branch regression model.

    A GRU encodes the sequence input, a three-layer ELU MLP encodes the flat
    feature vector, and the two 64-unit encodings are concatenated and fed to
    a single-unit output layer.

    Args:
        seq_feature_num: Number of values per time step of the sequence input.
        seq_len: Number of time steps of the sequence input.
        feature_num: Length of the flat feature vector.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[sequence, features]`` and
        producing one value per example.
    """
    # Sequence branch: (seq_len, seq_feature_num) -> 64
    input_1 = tf.keras.Input(shape=(seq_len, seq_feature_num))
    output_1 = tf.keras.layers.GRU(64)(input_1)

    # Flat-feature branch: feature_num -> 256 -> 128 -> 64
    input_2 = tf.keras.Input(shape=(feature_num, ))
    layer = tf.keras.layers.Dense(256, activation="elu")(input_2)
    layer = tf.keras.layers.Dense(128, activation="elu")(layer)
    output_2 = tf.keras.layers.Dense(64, activation="elu")(layer)

    # Merge with a Keras layer rather than a raw tf.concat op so the whole
    # graph consists of Keras layers (cleaner summary/serialization, and it
    # does not depend on TensorFlow auto-wrapping ops on symbolic tensors).
    output = tf.keras.layers.Concatenate(axis=-1)([output_1, output_2])
    # NOTE(review): the ReLU head restricts predictions to >= 0, yet the
    # recorded prediction output contains a slightly negative value -- confirm
    # the saved outputs were produced with this exact head.
    output = tf.keras.layers.Dense(1, activation="relu")(output)

    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)

    return model

# train

In [6]:
# Load the training set; the four list-valued columns are stored as JSON text
# and are decoded into Python lists here.
data = pd.read_csv(data_dir + "train_data.txt", sep="\t")
for json_col in ("launch_seq", "playtime_seq", "duration_prefer", "interact_prefer"):
    data[json_col] = data[json_col].map(json.loads)

In [7]:
# shuffle data
# A fixed random_state keeps the row order -- and therefore the positional
# train/dev split made later in this notebook -- identical across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data

Unnamed: 0,user_id,end_date,label,launch_seq,playtime_seq,duration_prefer,father_id_score,cast_id_score,tag_score,device_type,device_ram,device_rom,sex,age,education,occupation_status,territory_score,interact_prefer
0,10519766,191,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,-2.041925,0.343265,1.515061,-0.955892,-0.319111,2.055850,0.746096,1.817832,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,10085535,193,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,-0.746126,-0.174459,1.046141,0.828011,-0.544818,-1.340308,-1.015551,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,10493740,128,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, ...",0.000000,-1.826377,-1.025027,-2.041925,-1.541593,0.038601,-0.955892,0.828011,0.755516,0.746096,0.652215,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,10384142,182,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0.0478, 0, 0, 0.048, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",-1.230807,-0.236381,0.560509,0.194954,0.178322,-0.180185,-0.955892,-1.466234,-0.544818,-1.340308,1.716007,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,10571282,143,0,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,0.155786,-0.160641,1.046141,-0.319111,-0.544818,-1.340308,-1.143000,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599996,10410125,189,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0557, 0, 0, 0, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",-1.018896,-0.984872,-0.673679,0.194954,0.204214,-0.098779,-0.955892,0.828011,-0.544818,0.746096,-0.202917,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599997,10422569,137,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,-2.041925,0.342785,0.037481,1.046141,-0.319111,2.055850,0.746096,0.925243,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599998,10219961,110,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.75, 0.0, 0.25...",-1.510313,0.000000,0.322952,0.194954,0.000000,-0.804012,-0.955892,-0.319111,-0.544818,0.746096,-2.287750,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
599999,10320490,125,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.000000,0.000000,0.000000,0.194954,-0.828598,-0.802015,-0.955892,-1.466234,-0.544818,-1.340308,-1.200786,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [9]:
# Input dimensions; they must agree with the batches DataGenerator yields.
SEQ_FEATURE_NUM = 2  # launch_seq and playtime_seq
SEQ_LEN = 32         # time steps per sequence
FEATURE_NUM = 38     # duration_prefer + interact_prefer + 11 scalar columns
model = build_model(
    seq_feature_num=SEQ_FEATURE_NUM,
    seq_len=SEQ_LEN,
    feature_num=FEATURE_NUM,
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 38)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 256)          9984        input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 32, 2)]      0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          32896       dense[0][0]                      
______________________________________________________________________________________________

In [10]:
# Smoke-test DataGenerator: pull exactly one pass of batches from a 20-row
# sample and print their shapes and labels.
generator_test = DataGenerator(data[:20], batch_size=8)
batch_stream = iter(generator_test)
for i in range(len(generator_test)):
    (input_1, input_2), output = next(batch_stream)
    print(i, input_1.shape, input_2.shape)
    print(i, output.shape, output)

0 (8, 32, 2) (8, 38)
0 (8,) [0 0 0 0 0 2 0 1]
1 (8, 32, 2) (8, 38)
1 (8,) [0 1 0 2 3 0 3 1]
2 (4, 32, 2) (4, 38)
2 (4,) [0 3 0 0]


In [11]:
# Hold out the first 30001 shuffled rows for validation; everything after
# them is used for training.
dev = DataGenerator(data.iloc[:30001], 64)
train = DataGenerator(data.iloc[30001:], 128)

In [12]:
# Optimise mean absolute error with Adam; mean squared error is tracked as an
# additional metric.
model.compile(
    loss="mae",
    metrics=["mse"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [13]:
# Stop training once val_loss has not improved for 3 epochs and roll the model
# back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True, patience=3, monitor="val_loss"
)

In [14]:
# Both generators are endless, so the number of batches that make up one
# epoch / one validation run has to be passed explicitly.
model.fit(
    iter(train),
    epochs=20,
    steps_per_epoch=len(train),
    validation_data=iter(dev),
    validation_steps=len(dev),
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<tensorflow.python.keras.callbacks.History at 0x27d009b3a90>

In [13]:
# Persist the trained model next to the data files.
model.save(filepath=data_dir + "best_weights.h5")

# predict

In [14]:
# Load the test set and decode its JSON-encoded list columns, exactly as was
# done for the training data.
data = pd.read_csv(data_dir + "test_data.txt", sep="\t")
for json_col in ("launch_seq", "playtime_seq", "duration_prefer", "interact_prefer"):
    data[json_col] = data[json_col].map(json.loads)

test = DataGenerator(data, 64)
# can also load model from saved weights
# model = build_model(seq_feature_num=2, seq_len=32, feature_num=38)
# model.load_weights(data_dir + "best_weights.h5")
# The generator is endless, so predict needs the batch count for one pass.
prediction = model.predict(iter(test), steps=len(test))
prediction

2021-11-17 09:42:17.016232: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_gru_196469_specialized_for_model_gru_StatefulPartitionedCall_at___inference_distributed_function_196715' and '__inference_standard_gru_196469' both implement 'gru_22c19379-1a8e-4104-82b2-4c6257735474' but their signatures do not match.


array([[ 5.1163882e-04],
       [ 7.7824295e-04],
       [ 6.9034106e-01],
       ...,
       [ 5.7920069e-04],
       [ 5.5324608e-01],
       [-1.1333227e-03]], dtype=float32)

In [15]:
# Keep only the submission columns: the user id plus the flattened
# (n, 1) -> (n,) model output.
data = data[["user_id"]].assign(prediction=np.ravel(prediction))
# can clip outputs to [0, 7] or use other tricks
data

Unnamed: 0,user_id,prediction
0,10007813,0.000512
1,10052988,0.000778
2,10279068,0.690341
3,10546696,0.192047
4,10406659,0.000398
...,...,...
14996,10355586,0.000019
14997,10589773,6.347603
14998,10181954,0.000579
14999,10544736,0.553246


In [16]:
# Write the submission file: no header row, no index column, floats formatted
# with two decimals.
data.to_csv(
    data_dir + "baseline_submission.csv",
    float_format="%.2f",
    header=False,
    index=False,
)