In [None]:
# Unpack the dataset archive into the working directory (-o: overwrite without prompting, -q: quiet).
!unzip -oq /home/aistudio/data/data308355/data.zip

In [1]:
import pandas as pd
import numpy as np
from itertools import groupby

%pylab inline
import seaborn as sns

PATH = './data/'

In [2]:
def _read_table(file_name):
    """Read one CSV file located under PATH into a DataFrame."""
    return pd.read_csv(PATH + file_name)


# Per-user tables.
user_interaction = _read_table('user_interaction_data.csv')
user_portrait = _read_table('user_portrait_data.csv')
user_playback = _read_table('user_playback_data.csv')

# Launch log and video metadata.
app_launch = _read_table('app_launch_logs.csv')
video_related = _read_table('video_related_data.csv')

In [3]:
# Preview the first two rows of the user portrait table.
user_portrait.head(2)

In [4]:
# Summarise user_portrait: overall shape, then dtype and number of distinct values per column.
print(user_portrait.shape)
portrait_dtypes = user_portrait.dtypes
for name in user_portrait.columns:
    n_distinct = user_portrait[name].nunique()
    print(f'{name} \t {portrait_dtypes[name]} {n_distinct}')

In [5]:
# Count rows per user_id; a count above 1 means the user appears in more than one row.
user_portrait['user_id'].value_counts()

In [6]:
# Show every row stored for user_id 10268855.
user_portrait[user_portrait['user_id'] == 10268855]

In [7]:
# Remove rows that are exact duplicates across all columns (the first occurrence is kept).
user_portrait = user_portrait.drop_duplicates()

In [8]:
# Frequency of each device_type value.
user_portrait['device_type'].value_counts()

In [9]:
def _first_field(raw):
    """Return the text before the first ';' in ``str(raw)``."""
    return str(raw).split(';')[0]


# Keep only the first ';'-separated entry of each device spec column.
# The result is a string column (missing values become the text 'nan').
for spec_col in ('device_ram', 'device_rom'):
    user_portrait[spec_col] = user_portrait[spec_col].apply(_first_field)

In [10]:
# Distribution plot of device_ram.
# NOTE(review): the previous cell leaves this column as strings, so this presumably relies on
# seaborn coercing the values to float -- verify. sns.distplot is deprecated since seaborn 0.11.
sns.distplot(user_portrait['device_ram'])

In [11]:
# Distribution plot of device_rom (stored as strings after the ';' split above).
sns.distplot(user_portrait['device_rom'])

In [12]:
# Frequency of each value in the sex column.
user_portrait['sex'].value_counts()

In [13]:
# Distribution plot of the age column.
sns.distplot(user_portrait['age'])

In [14]:
# Distribution plot of the education column.
sns.distplot(user_portrait['education'])

In [15]:
# Distribution plot of the occupation_status column.
sns.distplot(user_portrait['occupation_status'])

In [16]:
# Distribution plot of the territory_code column.
sns.distplot(user_portrait['territory_code'])

In [17]:
# Preview the first two rows of the app launch log.
app_launch.head(2)

In [18]:
# Summarise app_launch: overall shape, then dtype and number of distinct values per column.
print(app_launch.shape)
launch_dtypes = app_launch.dtypes
for name in app_launch.columns:
    n_distinct = app_launch[name].nunique()
    print(f'{name} \t {launch_dtypes[name]} {n_distinct}')

In [19]:
# Frequency of each launch_type value.
app_launch['launch_type'].value_counts()

In [20]:
# Mean launch_type per user.
app_launch.groupby('user_id')['launch_type'].mean()

In [21]:
# Order the launch log by user, then by date within each user.
app_launch = app_launch.sort_values(by=['user_id', 'date'])

In [22]:
# Preview the first rows of the sorted launch log.
app_launch.head()

In [23]:
# Preview the first two rows of the video metadata table.
video_related.head(2)

In [24]:
# Distribution plot of video duration.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11.
sns.distplot(video_related['duration'])

In [25]:
# Preview the first rows of the playback table.
user_playback.head()

In [26]:
# Preview the first two rows of the interaction table.
user_interaction.head(2)

In [27]:
def count_launch_by_day(day1, day2, launch_df=None):
    """Print the share of users seen on ``day1`` who were also seen on ``day2``.

    Args:
        day1: Collection of ``date`` values defining the reference group of users.
        day2: Collection of ``date`` values in which those users are looked up again.
        launch_df: DataFrame with ``date`` and ``user_id`` columns. When omitted,
            the notebook-level ``app_launch`` frame is used.

    Prints ``len(u1 & u2) / len(u1)``. If no user launched on ``day1`` the ratio is
    undefined and ``nan`` is printed (this case used to raise ZeroDivisionError).
    """
    if launch_df is None:
        launch_df = app_launch

    def _users_on(days):
        # Distinct user ids that have at least one launch on any of `days`.
        on_days = launch_df['date'].isin(days)
        return set(launch_df.loc[on_days, 'user_id'].unique())

    u1 = _users_on(day1)
    u2 = _users_on(day2)

    if not u1:
        print(float('nan'))
        return

    print(len(u1 & u2) / len(u1))

# Share of users who launched on date 131 that also launched on date 132.
count_launch_by_day([131], [132])

In [28]:
# Earliest and latest date present in the launch log.
app_launch['date'].min(), app_launch['date'].max()

In [29]:
# Show every launch record of user_id 10052988.
app_launch[app_launch['user_id'] == 10052988]

In [30]:
# Read the test-a file through the shared PATH constant instead of repeating './data/'.
test_a = pd.read_csv(PATH + 'test-a.csv')
test_a

In [31]:
# del user_interaction, user_portrait, user_playback, app_launch, video_related

!mkdir wsdm_model_data
!python3 baseline_feature_engineering.py

In [32]:
# Unpack the dataset archive again (same command as the first cell; -o overwrites, -q is quiet).
!unzip -oq /home/aistudio/data/data308355/data.zip

In [33]:
import pandas as pd
import numpy as np
import json
import math

data_dir = "./wsdm_model_data/"

In [34]:
# train data
data = pd.read_csv(data_dir + "train_data.txt", sep="\t")
data["launch_seq"] = data.launch_seq.apply(lambda x: json.loads(x))
data["playtime_seq"] = data.playtime_seq.apply(lambda x: json.loads(x))
data["duration_prefer"] = data.duration_prefer.apply(lambda x: json.loads(x))
data["interact_prefer"] = data.interact_prefer.apply(lambda x: json.loads(x))
# shuffle data
data = data.sample(frac=1).reset_index(drop=True)

In [35]:
# Histogram of the training labels.
data['label'].plot.hist()

In [36]:
def set_weight(row):
    """Map a row's ``label`` to a sample weight.

    Labels 2 and 3 -> 60, label 1 -> 30, label 4 -> 20, labels 5 and 6 -> 10,
    every other value -> 5.
    """
    label = row['label']
    # (labels, weight) tiers, checked in the same order as the original comparisons.
    weight_tiers = (
        ((2, 3), 60),
        ((1,), 30),
        ((4,), 20),
        ((5, 6), 10),
    )
    for labels, weight in weight_tiers:
        if any(label == candidate for candidate in labels):
            return weight
    return 5
    
# Attach a per-row weight computed from the label by set_weight (row-wise apply).
# NOTE(review): this column is dropped again when sample_data is built and is not
# passed to the loss anywhere in this notebook.
data['sample_weight'] = data.apply(set_weight,axis=1)

In [37]:
# Rebalance the label distribution:
#   * rows whose label is neither 0 nor 2 are kept unchanged,
#   * label 2 is oversampled (with replacement) to 120000 rows,
#   * label 0 is downsampled (without replacement) to at most 150000 rows.
label_is_zero = data['label'] == 0
label_is_two = data['label'] == 2

kept_rows = data[~label_is_zero & ~label_is_two]
twos_oversampled = data[label_is_two].sample(n=120000, random_state=48, replace=True)

zero_rows = data[label_is_zero]
# Sampling without replacement raises when n exceeds the number of rows, so cap n.
zeros_downsampled = zero_rows.sample(n=min(150000, len(zero_rows)), random_state=48, replace=False)

sample_data = pd.concat([kept_rows, twos_oversampled, zeros_downsampled], axis=0)
sample_data = sample_data.drop(columns=['sample_weight'])
# Seeded shuffle: the train/validation split is taken by position from this frame,
# so an unseeded shuffle would change the split on every run.
sample_data = sample_data.sample(frac=1, random_state=48).reset_index(drop=True)
sample_data['label'].plot.hist()

In [38]:
# List the columns of the rebalanced training frame.
sample_data.columns

In [39]:
import paddle
from paddle.io import DataLoader, Dataset

class CoggleDataset(Dataset):
    """Dataset over a feature DataFrame, yielding float32 tensors per row.

    Each item is the tuple
    ``(launch_seq, playtime_seq, duration_prefer, interact_prefer, feat, label)``.
    ``feat`` is built from every column that is not listed in ``NON_FEATURE_COLS``.
    """

    # Columns excluded from the flat feature vector (ids, target, sequence columns).
    NON_FEATURE_COLS = ('user_id', 'end_date', 'label', 'launch_seq', 'playtime_seq',
                        'duration_prefer', 'interact_prefer')

    def __init__(self, df):
        """Store ``df`` and select its flat feature columns.

        Args:
            df: DataFrame containing the columns in ``NON_FEATURE_COLS`` that are
                read in ``__getitem__`` plus the numeric feature columns.
        """
        super(CoggleDataset, self).__init__()
        self.df = df
        # Sorted instead of raw set order: set iteration order is not guaranteed to be
        # the same between frames or between runs, and the model consumes features
        # by position, so train / validation / test must agree on the column order.
        self.feat_col = sorted(set(self.df.columns) - set(self.NON_FEATURE_COLS))
        self.df_feat = self.df[self.feat_col]

    def __getitem__(self, index):
        """Return the tensors for the row at integer position ``index``."""
        launch_seq = self.df['launch_seq'].iloc[index]
        playtime_seq = self.df['playtime_seq'].iloc[index]
        duration_prefer = self.df['duration_prefer'].iloc[index]
        interact_prefer = self.df['interact_prefer'].iloc[index]

        feat = self.df_feat.iloc[index].values.astype(np.float32)

        launch_seq = paddle.to_tensor(launch_seq).astype(paddle.float32)
        playtime_seq = paddle.to_tensor(playtime_seq).astype(paddle.float32)
        duration_prefer = paddle.to_tensor(duration_prefer).astype(paddle.float32)
        interact_prefer = paddle.to_tensor(interact_prefer).astype(paddle.float32)
        feat = paddle.to_tensor(feat).astype(paddle.float32)

        # Force shape [1] so a batch of labels is (batch, 1), matching a model output of
        # that shape. Without it the scalar's tensor shape depends on the Paddle version,
        # and a (batch,) label would broadcast against a (batch, 1) prediction in the loss.
        label = paddle.to_tensor(self.df['label'].iloc[index]).astype(paddle.float32).reshape([1])
        return launch_seq, playtime_seq, duration_prefer, interact_prefer, feat, label

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [40]:
# Hold out the last 600 rows of the shuffled frame for validation; train on the rest.
# NOTE(review): label-2 rows were oversampled with replacement before this split, so
# copies of the same row can end up on both sides.
VAL_ROWS = 600
train_dataset = CoggleDataset(sample_data.iloc[:-VAL_ROWS])
val_dataset = CoggleDataset(sample_data.iloc[-VAL_ROWS:])

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_options = dict(batch_size=64, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [41]:
# Load the test split and decode its JSON-encoded columns.
test_data = pd.read_csv(data_dir + "test_data.txt", sep="\t")
for json_col in ("launch_seq", "playtime_seq", "duration_prefer", "interact_prefer"):
    test_data[json_col] = test_data[json_col].apply(lambda raw: json.loads(raw))
# Placeholder target: CoggleDataset reads a 'label' column for every row.
test_data['label'] = 0

# Unshuffled loader so predictions stay aligned with the rows of test_data.
test_dataset = CoggleDataset(test_data)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64, num_workers=4)

In [42]:
# Row and column count of the test frame.
test_data.shape

In [43]:
import paddle
class CoggleModel(paddle.nn.Layer):
    """Two GRU encoders plus a two-layer linear head producing one value per sample.

    ``launch_seq`` and ``playtime_seq`` are each encoded by their own GRU
    (input size 1, hidden size 32). The two encodings are concatenated with
    ``duration_prefer``, ``interact_prefer`` and ``feat`` and passed through
    ``fc1`` (102 -> 64) and ``fc2`` (64 -> 1).
    """

    def __init__(self):
        super(CoggleModel, self).__init__()

        self.launch_seq_gru = paddle.nn.GRU(1, 32)
        self.playtime_seq_gru = paddle.nn.GRU(1, 32)
        # 102 = 32 + 32 from the two GRU encodings plus the combined width of
        # duration_prefer, interact_prefer and feat (presumably 38 -- TODO confirm).
        self.fc1 = paddle.nn.Linear(102, 64)
        self.fc2 = paddle.nn.Linear(64, 1)

    def forward(self, launch_seq, playtime_seq, duration_prefer, interact_prefer, feat):
        """Compute the prediction for a batch.

        Args:
            launch_seq, playtime_seq: Tensors reshaped here to (batch, 32, 1),
                i.e. 32 time steps with one value each.
            duration_prefer, interact_prefer, feat: 2-D tensors concatenated
                along axis 1 with the GRU encodings.

        Returns:
            Tensor of shape (batch, 1).
        """
        launch_seq = launch_seq.reshape((-1, 32, 1))
        playtime_seq = playtime_seq.reshape((-1, 32, 1))

        # GRU returns (outputs, final_states); outputs is (batch, time, hidden).
        launch_out, _ = self.launch_seq_gru(launch_seq)
        playtime_out, _ = self.playtime_seq_gru(playtime_seq)

        # Read out the full hidden vector at the last time step. The previous
        # `[:, :, 0]` sliced the hidden axis instead (one hidden unit across all time
        # steps); it only had the right width because sequence length and hidden
        # size are both 32.
        launch_seq_feat = launch_out[:, -1, :]
        playtime_seq_feat = playtime_out[:, -1, :]

        all_feat = paddle.concat([launch_seq_feat, playtime_seq_feat, duration_prefer, interact_prefer, feat], 1)
        # NOTE(review): fc1 and fc2 are applied back-to-back with no activation in
        # between, so together they form a single affine map.
        all_feat_fc1 = self.fc1(all_feat)
        all_feat_fc2 = self.fc2(all_feat_fc1)

        return all_feat_fc2

model = CoggleModel()

In [44]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# Adam over all model parameters with a fixed learning rate of 0.001.
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.001)
# Mean-squared-error loss: the label is fitted as a regression target.
criterion = paddle.nn.MSELoss()

def train(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Called as ``model(launch_seq, playtime_seq, duration_prefer,
            interact_prefer, feat)``; switched to training mode here.
        train_loader: Iterable yielding 6-tuples whose last element is the label batch.
        optimizer: Object exposing ``step()`` and ``clear_grad()``.
        criterion: Loss callable invoked as ``criterion(pred, label)``.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()
    train_loss = []
    for launch_seq, playtime_seq, duration_prefer, interact_prefer, feat, label in tqdm(train_loader):
        pred = model(launch_seq, playtime_seq, duration_prefer, interact_prefer, feat)

        # Align the label with the prediction's shape. If pred is (batch, 1) and the
        # label arrives as (batch,), an element-wise loss would broadcast to
        # (batch, batch) and average over mismatched pairs without raising.
        label = label.reshape(pred.shape)

        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        train_loss.append(loss.item())
    return np.mean(train_loss)


def validate(model, val_loader, optimizer, criterion):
    """Compute the mean loss over ``val_loader`` without updating the model.

    Args:
        model: Called as ``model(launch_seq, playtime_seq, duration_prefer,
            interact_prefer, feat)``; switched to evaluation mode here.
        val_loader: Iterable yielding 6-tuples whose last element is the label batch.
        optimizer: Unused; kept only so existing calls keep working.
        criterion: Loss callable invoked as ``criterion(pred, label)``.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()
    val_loss = []
    # Evaluation only: no backward pass and no optimizer step. The earlier version
    # called loss.backward() / optimizer.step() here, which trained the model on
    # the validation batches and made the reported loss meaningless as a hold-out score.
    with paddle.no_grad():
        for launch_seq, playtime_seq, duration_prefer, interact_prefer, feat, label in tqdm(val_loader):
            pred = model(launch_seq, playtime_seq, duration_prefer, interact_prefer, feat)

            # Align shapes so a (batch,) label cannot broadcast against a (batch, 1) prediction.
            label = label.reshape(pred.shape)

            loss = criterion(pred, label)
            val_loss.append(loss.item())

    return np.mean(val_loss)

def predict(model, test_loader):
    """Run the model over ``test_loader`` and collect the raw outputs.

    Args:
        model: Called as ``model(launch_seq, playtime_seq, duration_prefer,
            interact_prefer, feat)``; switched to evaluation mode here.
        test_loader: Iterable yielding 6-tuples; the last element (label) is ignored.

    Returns:
        List with one NumPy array of predictions per batch, in loader order.
    """
    model.eval()
    test_pred = []
    # Inference needs no gradients; disabling tracking avoids building the autograd graph.
    with paddle.no_grad():
        for launch_seq, playtime_seq, duration_prefer, interact_prefer, feat, label in tqdm(test_loader):
            pred = model(launch_seq, playtime_seq, duration_prefer, interact_prefer, feat)
            test_pred.append(pred.numpy())

    return test_pred

In [45]:
# Train for a fixed number of epochs, printing the epoch index together with the
# mean training loss and mean validation loss after each one.
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = validate(model, val_loader, optimizer, criterion)

    print(epoch, train_loss, val_loss)

In [46]:
# Run inference over the test loader, then stack the per-batch arrays row-wise into one array.
test_pred = predict(model, test_loader)
test_pred = np.vstack(test_pred)

In [47]:
# Store the first (only) output column as the prediction for each test row.
test_data["prediction"] = test_pred[:, 0]
# Keep just the submission columns. `.copy()` makes the result an independent frame,
# so modifying it later neither warns (SettingWithCopyWarning) nor depends on the
# original test_data.
test_data = test_data[["user_id", "prediction"]].copy()

In [48]:
# Largest raw model output.
test_pred.max()

In [49]:
# Post-process the raw outputs into integer values in [0, 7]:
#   1. anything below 0.8 becomes 0,
#   2. anything above 7 is capped at 7,
#   3. the result is rounded to the nearest integer.
# Built with mask/clip and written back through assign() rather than the chained
# `test_data['prediction'][mask] = value` form: that form assigns through a temporary
# Series, which triggers SettingWithCopyWarning and does nothing under pandas copy-on-write.
prediction = test_data['prediction']
prediction = prediction.mask(prediction < 0.8, 0).clip(upper=7)
test_data = test_data.assign(prediction=np.round(prediction))
# can clip outputs to [0, 7] or use other tricks
test_data.to_csv("./baseline_submission.csv", index=False, header=False, float_format="%.2f")

In [50]:
# Histogram of the training labels (full frame, before resampling), for comparison
# with the prediction histogram in the next cell.
data['label'].plot.hist()

In [51]:
# Histogram of the post-processed test predictions.
test_data['prediction'].plot.hist()