In [1]:
!nvidia-smi

Fri Nov 18 00:54:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    32W /  70W |      0MiB / 15360MiB |     11%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
import warnings
import pickle

from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf 
import tensorboard as tb 
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

#### Read Dataset

In [9]:
# %%bigquery raw_data
# select * from `maximal-furnace-783.growth_temp.raw_data_hd_v6`
# order by userId, post_index

Query complete after 0.00s: 100%|██████████| 6/6 [00:00<00:00, 3268.29query/s]                        
Downloading: 100%|██████████| 5814207/5814207 [00:04<00:00, 1220143.55rows/s]


In [13]:
# raw_data.head()

Unnamed: 0,postCreatedOn,userId,postId,language,tagId,type,tag_genre,posted_after_days,posts_till_date,post_index
0,2020-01-27,98625,669430483,Telugu,S682642,1,Health and Fitness,1527,1,1
1,2021-06-29,98625,9574195053,Telugu,S110967,1,Status and Stories,2046,3,2
2,2021-06-29,98625,9182575053,Telugu,S128522,1,Status and Stories,2046,3,3
3,2021-06-30,98625,9136701153,Telugu,S110967,0,Status and Stories,2047,5,4
4,2021-06-30,98625,5538487053,Telugu,S110967,0,Status and Stories,2047,5,5


In [3]:
raw_data = pd.read_csv("./src/creator_eng_pred_raw_data_v3.csv")

In [4]:
# raw_data["tag_genre"].fillna("others", inplace=True)

In [5]:
raw_data.shape

(671400, 16)

In [6]:
raw_data.isnull().sum()

postCreatedOn        0
userId               0
postId               0
language             0
tagId                0
type                 0
tag_genre            0
posted_after_days    0
posts_till_date      0
post_index           0
total_views          0
vplay                0
svp                  0
engs                 0
day_of_month         0
day_of_week          0
dtype: int64

In [7]:
# target_cols = ['total_views', 'vplay', 'svp', 'engs']
drop_cols = ['postCreatedOn', 'total_views', 'vplay', 'svp']

In [8]:
# targets = raw_data[target_cols]

In [9]:
raw_data_1 = raw_data.drop(drop_cols, axis=1)

In [10]:
raw_data_1.iloc[10000:, :]

Unnamed: 0,userId,postId,language,tagId,type,tag_genre,posted_after_days,posts_till_date,post_index,engs,day_of_month,day_of_week
10000,4706761,9553113584,Malayalam,S9867526,0,Wishes,2039,3699,401,3,3,2
10001,4706761,3416113584,Malayalam,S9867526,0,Wishes,2039,3699,402,3,3,2
10002,4706761,9933865584,Malayalam,S112918,0,Wishes,2039,3699,403,3,3,2
10003,4706761,1573865584,Malayalam,S112918,0,Wishes,2039,3699,404,3,3,2
10004,4706761,5604865584,Malayalam,S112918,0,Wishes,2039,3699,405,0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
671395,2753816537,9805806294,Hindi,S14689935,0,Sports,503,5231,596,0,29,0
671396,2753816537,7595806294,Hindi,S14689935,0,Sports,503,5231,597,0,29,0
671397,2753816537,5236806294,Hindi,S14689935,0,Sports,503,5231,598,8,29,0
671398,2753816537,5710906294,Hindi,S14689935,1,Sports,503,5231,599,34,29,0


In [11]:
g = raw_data_1.groupby("userId")
size = g.size()

In [12]:
size

userId
420176        600
1128090       600
1736412       600
1988862       600
2095296       600
             ... 
2732187135    600
2735691253    600
2740719276    600
2746716196    600
2753816537    600
Length: 1119, dtype: int64

#### EDA

#### Data Preparation

In [13]:
raw_data_1['userId'] = raw_data_1['userId'].astype(str)
raw_data_1['postId'] = raw_data_1['postId'].astype(str)
raw_data_1['type'] = raw_data_1['type'].astype(str)


In [14]:
raw_data_1['day_of_month'] = raw_data_1['day_of_month'].astype(str)
raw_data_1['day_of_week'] = raw_data_1['day_of_week'].astype(str)
raw_data_1['type'] = raw_data_1['type'].astype(str)

In [15]:
raw_data_1["index_from_start"] = list(raw_data_1.index)

In [16]:
#Hyperparameters
#batch size=64
#number heads=4, hidden sizes=160, lr=0.001, gr_clip=0.1

max_prediction_length = 2
max_encoder_length = 50
training_cutoff = raw_data_1["index_from_start"].max() - max_prediction_length

training = TimeSeriesDataSet(
    raw_data_1[lambda x: x.index_from_start <= training_cutoff],
    time_idx="index_from_start",
    target="engs",
    group_ids=["userId"],
    min_encoder_length=max_encoder_length // 2, 
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=[],
    time_varying_known_categoricals=["day_of_month", "day_of_week", "language", "tagId", "tag_genre"],
    # time_varying_unknown_categoricals=["type"],
    time_varying_known_reals=["posted_after_days", "posts_till_date", "post_index"],
    time_varying_unknown_reals=['engs'],
    target_normalizer=GroupNormalizer(
        groups=["userId"], transformation="softplus"
    ),  # we normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True
)


validation = TimeSeriesDataSet.from_dataset(training, raw_data_1, predict=True, stop_randomization=True)

# create dataloaders for  our model
batch_size = 64 
# if you have a strong GPU, feel free to increase the number of workers  
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)



In [118]:
raw_data_1[raw_data_1["postId"]=='5710906294']

Unnamed: 0,userId,postId,language,tagId,type,tag_genre,posted_after_days,posts_till_date,post_index,engs,day_of_month,day_of_week,index_from_start
671398,2753816537,5710906294,Hindi,S14689935,1,Sports,503,5231,599,34,29,0,671398


In [17]:
raw_data_1.userId.value_counts()

420176        600
1069809909    600
1100834508    600
1083463883    600
1083369579    600
             ... 
317036504     600
318441793     600
318793979     600
319349667     600
2753816537    600
Name: userId, Length: 1119, dtype: int64

In [18]:
training.index

Unnamed: 0,time_first,time_last,time_diff_to_next,index_start,time,count,group_id,index_end,sequence_length
0,442200,442799,1,0,442200,600,0,51,52
1,442200,442799,1,1,442201,600,0,52,52
2,442200,442799,1,2,442202,600,0,53,52
3,442200,442799,1,3,442203,600,0,54,52
4,442200,442799,1,4,442204,600,0,55,52
...,...,...,...,...,...,...,...,...,...
700487,441600,442199,1,670798,441600,600,1118,670844,47
700488,441600,442199,1,670798,441600,600,1118,670845,48
700489,441600,442199,1,670798,441600,600,1118,670846,49
700490,441600,442199,1,670798,441600,600,1118,670847,50


In [19]:
validation.index

Unnamed: 0,time_first,time_last,time_diff_to_next,index_start,time,count,group_id,index_end,sequence_length
548,442200,442799,1,548,442748,600,0,599,52
1148,131400,131999,1,1148,131948,600,1,1199,52
1748,132000,132599,1,1748,132548,600,2,1799,52
2348,442800,443399,1,2348,443348,600,3,2399,52
2948,132600,133199,1,2948,133148,600,4,2999,52
...,...,...,...,...,...,...,...,...,...
668948,439800,440399,1,668948,440348,600,1114,668999,52
669548,440400,440999,1,669548,440948,600,1115,669599,52
670148,130800,131399,1,670148,131348,600,1116,670199,52
670748,441000,441599,1,670748,441548,600,1117,670799,52


#### Baseline model

In [19]:
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
baseline_predictions = Baseline().predict(val_dataloader)
(actuals - baseline_predictions).abs().mean().item()

  f"Attribute {k!r} is an instance of `nn.Module` and is already saved during checkpointing."
  f"Attribute {k!r} is an instance of `nn.Module` and is already saved during checkpointing."


263.15863037109375

#### build TFT model

In [34]:
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=5, verbose=True, mode="min")
lr_logger = LearningRateMonitor()  
logger = TensorBoardLogger("lightning_logs")  

trainer = pl.Trainer(
    max_epochs=10,
    accelerator='gpu', 
    devices=1,
    enable_model_summary=True,
    gradient_clip_val=0.1,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
    # default_root_dir="lightning_logs/lightning_logs/version_4/checkpoints/"
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # there are 7 quantiles by default: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
    loss=QuantileLoss(),
    log_interval=10, 
    reduce_on_plateau_patience=4
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [31]:
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

Number of parameters in network: 289.3k


In [35]:
#### load saved model from checkpoint
best_model_path='lightning_logs/lightning_logs/version_4/checkpoints/epoch=1-step=21016.ckpt'
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
# trainer = pl.Trainer(
#     max_epochs=11,
#     accelerator='gpu', 
#     devices=1,
#     enable_model_summary=True,
#     gradient_clip_val=0.1,
#     callbacks=[lr_logger, early_stop_callback],
#     logger=logger,
#     resume_from_checkpoint=best_model_path
# )

In [32]:
# best_tft.hparams

In [None]:
# fit network
trainer.fit(
    best_tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
    # ckpt_path=best_model_path,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 265 K 
3  | prescalers                         | ModuleDict                      | 128   
4  | static_variable_selection          | VariableSelectionNetwork        | 1.7 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 4.3 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 3.6 K 
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.1 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.1 K 
9  | static_context_initial_cell_lstm 

Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 67.323


Validation: 0it [00:00, ?it/s]

In [30]:
pl.__version__

'1.8.1'

In [37]:
# Start tensorboard
%load_ext tensorboard
%tensorboard --logdir lightning_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
