In [1]:
# Dcard intern hw
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# processor setting
print(torch.__version__)
print(torch.version.cuda)
if torch.cuda.is_available():
    cuda0 = torch.device(0)
    print(torch.cuda.get_device_name(cuda0))

1.12.1+cu113
11.3
NVIDIA GeForce RTX 2070


In [3]:
# load data
rawDataset = pd.read_csv("intern_homework_train_dataset.csv").reset_index(drop=True)
rawDataset.to_html("data.html")
print(rawDataset.shape)

(50000, 18)


In [None]:
# Data visualization
fig = plt.figure(figsize=(15, 15))
# forum_id and like_count_24h
ax1 = fig.add_subplot(3, 1, 1)
ax1.scatter(rawDataset["forum_id"], rawDataset["like_count_24h"])
ax1.set_xlabel("forum_id")
ax1.set_ylabel("like_count_24h")
# author_id and like_count_24h
ax1 = fig.add_subplot(3, 1, 2)
ax1.scatter(rawDataset["author_id"], rawDataset["like_count_24h"])
ax1.set_xlabel("author_id")
ax1.set_ylabel("like_count_24h")
# forum_stats and like_count_24h
ax1 = fig.add_subplot(3, 1, 3)
ax1.scatter(rawDataset["forum_stats"], rawDataset["like_count_24h"])
ax1.set_xlabel("forum_stats")
ax1.set_ylabel("like_count_24h")


In [4]:
# Word embedding
from transformers import BertTokenizer, BertModel
import gc

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').cuda(device=cuda0)

title_list = list(rawDataset["title"])
embeddings_list = []
batch_size = 32

for i in range(0, len(title_list), batch_size):
    titles = title_list[i:i+batch_size]

    # tokenize titles
    inputs = tokenizer(titles, padding=True, truncation=True, max_length=50, return_tensors='pt').to(cuda0)

    # get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # get embeddings from last hidden layer
    last_hidden_states = outputs.last_hidden_state
    embeddings = torch.mean(last_hidden_states, dim=1)  # average pooling

    embeddings_list.append(embeddings)

    del inputs, outputs, last_hidden_states, embeddings
    torch.cuda.empty_cache()

    print(f"Processed {i+batch_size}/{len(title_list)} titles.")

title_embeddings = torch.cat(embeddings_list, dim=0)

# check shape
print(title_embeddings.shape)

gc.collect()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processed 32/50000 titles.
Processed 64/50000 titles.
Processed 96/50000 titles.
Processed 128/50000 titles.
Processed 160/50000 titles.
Processed 192/50000 titles.
Processed 224/50000 titles.
Processed 256/50000 titles.
Processed 288/50000 titles.
Processed 320/50000 titles.
Processed 352/50000 titles.
Processed 384/50000 titles.
Processed 416/50000 titles.
Processed 448/50000 titles.
Processed 480/50000 titles.
Processed 512/50000 titles.
Processed 544/50000 titles.
Processed 576/50000 titles.
Processed 608/50000 titles.
Processed 640/50000 titles.
Processed 672/50000 titles.
Processed 704/50000 titles.
Processed 736/50000 titles.
Processed 768/50000 titles.
Processed 800/50000 titles.
Processed 832/50000 titles.
Processed 864/50000 titles.
Processed 896/50000 titles.
Processed 928/50000 titles.
Processed 960/50000 titles.
Processed 992/50000 titles.
Processed 1024/50000 titles.
Processed 1056/50000 titles.
Processed 1088/50000 titles.
Processed 1120/50000 titles.
Processed 1152/5000

4

In [32]:
# data splitting
y_train = torch.tensor(np.array(rawDataset.iloc[:45000, -1]), device=cuda0, dtype=torch.float32).reshape(45000, 1)
y_valid = torch.tensor(np.array(rawDataset.iloc[45000:, -1]), device=cuda0, dtype=torch.float32).reshape(5000, 1)
print("y_train:", y_train.shape, y_train.type())
print("y_valid:", y_valid.shape, y_valid.type())

otherFeature = torch.tensor(np.array(rawDataset.drop(["title", "created_at", "forum_id", "author_id", "forum_stats", "like_count_24h"], axis=1)), device=cuda0, dtype=torch.float32)
dataset = torch.cat((title_embeddings, otherFeature), dim=1).to(device=cuda0)
print(dataset.size())

x_train = dataset[:45000, :].to(cuda0)
x_valid = dataset[45000:, :].to(cuda0)
print("x_train:", x_train.shape, x_train.type())
print("x_valid:", x_valid.shape, x_valid.type())


y_train: torch.Size([45000, 1]) torch.cuda.FloatTensor
y_valid: torch.Size([5000, 1]) torch.cuda.FloatTensor
torch.Size([50000, 780])
x_train: torch.Size([45000, 780]) torch.cuda.FloatTensor
x_valid: torch.Size([5000, 780]) torch.cuda.FloatTensor


In [56]:
# Neural network define
class network(nn.Module):
    def __init__(self):
        super(network, self).__init__()
        self.inputLayer = nn.Linear(780, 512).to(device=cuda0) # 768 word embedding + 12 feature(like 1~6hr + comment 1~6hr)
        self.hidden1 = nn.Linear(512, 256).to(device=cuda0)
        self.hidden2 = nn.Linear(256, 64).to(device=cuda0)
        self.hidden3 = nn.Linear(64, 16).to(device=cuda0)
        self.outputLayer = nn.Linear(16, 1).to(device=cuda0)
        self.relu = nn.ReLU().to(device=cuda0)
        self.dropout = nn.Dropout(p=0.5)
        
    def forward(self, x):
        x = self.inputLayer(x)
        x = self.relu(x)
        x = self.hidden1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.hidden2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.hidden3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.outputLayer(x)
        return x

In [57]:
# hyperparameters
from torchmetrics import MeanAbsolutePercentageError
learning_rate = 1e-7
epochs = 300
batch_size = 32

model = network().to(cuda0)
mseLoss = nn.MSELoss().to(cuda0)
mape = MeanAbsolutePercentageError().to(cuda0)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
val_dataset = torch.utils.data.TensorDataset(x_valid, y_valid)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [58]:
for epoch in range(epochs):
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = mseLoss(y_pred, y_batch)
        train_acc = mape(y_pred, y_batch)
        loss.backward()
        optimizer.step()
    
    if (epoch+1) % 5 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}, train_acc: {:2.2f}%'.format(epoch+1, epochs, loss.item(), train_acc*100))
    
    del x_batch, y_batch, y_pred, loss
    torch.cuda.empty_cache()

Epoch [5/300], Loss: 514.5679, train_acc: 101.24%
Epoch [10/300], Loss: 8806.4648, train_acc: 100.77%
Epoch [15/300], Loss: 873.0985, train_acc: 101.38%
Epoch [20/300], Loss: 20404.8457, train_acc: 101.35%
Epoch [25/300], Loss: 144.2855, train_acc: 101.62%
Epoch [30/300], Loss: 10039.4648, train_acc: 101.03%
Epoch [35/300], Loss: 572.8271, train_acc: 100.91%
Epoch [40/300], Loss: 5157.2432, train_acc: 100.85%
Epoch [45/300], Loss: 746.0383, train_acc: 100.72%
Epoch [50/300], Loss: 226.5113, train_acc: 101.16%
Epoch [55/300], Loss: 159201.2031, train_acc: 100.82%
Epoch [60/300], Loss: 20922.6250, train_acc: 100.82%
Epoch [65/300], Loss: 257.4751, train_acc: 100.91%
Epoch [70/300], Loss: 129.9521, train_acc: 101.11%
Epoch [75/300], Loss: 5820.4858, train_acc: 100.78%
Epoch [80/300], Loss: 1979.8844, train_acc: 100.58%
Epoch [85/300], Loss: 1119.9956, train_acc: 100.51%
Epoch [90/300], Loss: 1500.3088, train_acc: 100.67%
Epoch [95/300], Loss: 414.4248, train_acc: 100.53%
Epoch [100/300], 

In [59]:
MAPE = MeanAbsolutePercentageError().to(cuda0)
y_pred = model(x_valid)
MAPE(y_pred, y_valid)

tensor(0.9392, device='cuda:0', grad_fn=<SqueezeBackward0>)