In [17]:
import glob
from collections import Counter, OrderedDict
from typing import Optional

import nltk
import numpy as np
import pandas as pd
import torch
from bidict import bidict
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [2]:
# Use the first CUDA GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
class FastTextSiamese(nn.Module):
    """Siamese bag-of-embeddings classifier for a pair of token-id batches.

    Both inputs are passed through the same mean-pooling ``EmbeddingBag`` and
    the same hidden layer. The element-wise absolute difference of the two
    hidden vectors is then projected to two output logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Dimensionality of each token embedding.
        hidden_size: Width of the shared hidden layer.
        padding_idx: Optional token id that must not contribute to the mean
            (typically the ``<pad>`` id used to right-pad short sentences).
            ``None`` (the default) averages every id, padding included, which
            is the behaviour this class had before the parameter existed.
    """

    def __init__(
            self,
            vocab_size: int,
            emb_size: int = 100,
            hidden_size: int = 10,
            padding_idx: Optional[int] = None
    ) -> None:
        super().__init__()
        # With a 2-D input and no offsets, every row is pooled as one bag.
        self.embeddings = nn.EmbeddingBag(
            vocab_size, emb_size, mode='mean', padding_idx=padding_idx
        )
        self.hidden = nn.Sequential(nn.Linear(emb_size, hidden_size), nn.ReLU())
        self.out = nn.Linear(hidden_size, 2)

    def forward_once(self, text: torch.Tensor) -> torch.Tensor:
        """Encode one batch of token ids into hidden vectors (one per row)."""
        embeddings = self.embeddings(text)
        return self.hidden(embeddings)

    def forward(self, text1: torch.Tensor, text2: torch.Tensor) -> torch.Tensor:
        """Return two logits per pair, computed from ``|h(text1) - h(text2)|``."""
        out1 = self.forward_once(text1)
        out2 = self.forward_once(text2)
        dis = torch.abs(out1 - out2)
        return self.out(dis)

In [4]:
def read_parquet_folder(
        split: str,
        base_dir: str = "D:/Code/Pycharm/kaggle-code-v2/kaggle-contests/quora-question-pairs/fasttext_data",
) -> pd.DataFrame:
    """Load every ``*.parquet`` file under ``<base_dir>/<split>`` into one frame.

    Args:
        split: Name of the sub-folder to read (e.g. ``'train_df'``).
        base_dir: Directory that contains the split folders. Defaults to the
            location this notebook was originally written against.

    Returns:
        The concatenation of all files, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the folder holds no ``.parquet`` file.
    """
    # glob returns entries in arbitrary order; sorting keeps the row order of
    # the concatenated frame stable between runs and machines.
    files = sorted(
        f for f in glob.glob(f"{base_dir}/{split}/*") if str(f).endswith('.parquet')
    )
    if not files:
        # pd.concat([]) would otherwise fail with "No objects to concatenate".
        raise FileNotFoundError(f"No .parquet files found in {base_dir}/{split}")
    return pd.concat([pd.read_parquet(f) for f in files], axis=0, ignore_index=True)

# Fixed seed: the vocabulary is built from train_df, so an unseeded split would
# change both the partition and the token ids on every kernel restart.
SPLIT_SEED = 42
train_df, valid_df = train_test_split(
    read_parquet_folder('train_df'),
    test_size=0.2,
    shuffle=True,
    random_state=SPLIT_SEED,
)
pred_df = read_parquet_folder('pred_df')

In [5]:
# Sentences are truncated / padded to this many tokens by sentence_to_ids.
sentence_word_limit = 30
# Tokens seen fewer times than this in the training questions get no id.
word_min_count = 3

# Count token frequencies over both question columns of the training split,
# visiting question1 before question2 within each row.
word_counter = Counter()
for question1, question2 in zip(train_df['question1'], train_df['question2']):
    for question in (question1, question2):
        word_counter.update(nltk.word_tokenize(str(question)))

# id -> token for sufficiently frequent tokens; `.inv` gives token -> id.
word_id_mapping = bidict(
    enumerate(
        [token for token, count in word_counter.items() if count >= word_min_count]
    )
)
# The two special tokens take the next two free ids, in this order.
for special_token in ('<pad>', '<unk>'):
    word_id_mapping.inv[special_token] = len(word_id_mapping)

def sentence_to_ids(text):
    """Tokenize ``text`` and return exactly ``sentence_word_limit`` token ids.

    Tokens beyond the limit are dropped, tokens missing from the vocabulary map
    to the ``<unk>`` id, and short sentences are right-padded with ``<pad>``.
    """
    token_to_id = word_id_mapping.inv
    unk_id = token_to_id['<unk>']
    pad_id = token_to_id['<pad>']

    tokens = nltk.word_tokenize(str(text))[:sentence_word_limit]
    ids = [token_to_id.get(token, unk_id) for token in tokens]
    ids.extend([pad_id] * (sentence_word_limit - len(ids)))
    return ids

In [6]:
class QuoraDataset(Dataset):
    """Map-style dataset yielding ``(question1_ids, question2_ids, label)``.

    Each element is a ``LongTensor`` already placed on the global ``device``.
    """

    def __init__(self, df):
        # Convert the frame to a list of row dicts once, up front.
        self.df_dict = df.to_dict(orient='records')

    def __len__(self):
        return len(self.df_dict)

    def __getitem__(self, idx):
        record = self.df_dict[idx]
        ids1 = sentence_to_ids(record['question1'])
        ids2 = sentence_to_ids(record['question2'])
        # Rows without an 'is_duplicate' entry fall back to label 0.
        label = record.get('is_duplicate', 0)
        # The label is wrapped in a list, so it becomes a tensor of shape (1,).
        return tuple(
            torch.LongTensor(values).to(device)
            for values in (ids1, ids2, [label])
        )


batch_size = 64
# Reshuffle the training rows at the start of every epoch so the model does not
# see the same batches in the same order each time. The seeded generator keeps
# the sequence of permutations repeatable across runs.
train_dl = DataLoader(
    QuoraDataset(train_df),
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
# Validation and prediction keep row order so outputs line up with the frames.
valid_dl = DataLoader(QuoraDataset(valid_df), batch_size=batch_size, shuffle=False)
pred_dl = DataLoader(QuoraDataset(pred_df), batch_size=batch_size, shuffle=False)

In [7]:
class Summer:
    """Running ratio accumulator: ``sum(num) / sum(weight)``.

    ``add`` adds ``num`` and ``weight`` to two separate totals; ``num`` is NOT
    multiplied by ``weight``. With the default weight of 1.0 the formatted
    value is the plain mean of the added numbers.
    """

    def __init__(self):
        self.sum_num = 0.0
        self.sum_weight = 0.0

    @staticmethod
    def convert(n):
        """Coerce ``n`` to ``float``, using the first element of a container.

        Strings and bytes are parsed as a whole number rather than being
        treated as a container of characters. Objects that advertise
        ``__iter__`` but refuse to iterate (0-d arrays and tensors) are
        converted directly. An empty container raises ``StopIteration``.
        """
        from collections.abc import Iterable
        if isinstance(n, Iterable) and not isinstance(n, (str, bytes)):
            try:
                iterator = iter(n)
            except TypeError:
                # 0-d numpy arrays / torch tensors: scalar in disguise.
                return float(n)
            n = next(iterator)
        return float(n)

    def add(self, num, weight=1.0):
        """Add ``num`` to the numerator total and ``weight`` to the denominator total."""
        self.sum_num += self.convert(num)
        self.sum_weight += self.convert(weight)

    def __str__(self):
        # Nothing accumulated yet (e.g. the placeholder returned for an unknown
        # key by MultiSummer.get): report "nan" instead of dividing by zero.
        if self.sum_weight == 0:
            return "nan"
        return "{:.4f}".format(self.sum_num / self.sum_weight)


class MultiSummer:
    """Keyed collection of ``Summer`` accumulators, kept in first-seen order."""

    def __init__(self):
        self.summers = OrderedDict()

    def put(self, key, num, weight=1.0):
        """Add ``num`` / ``weight`` to the accumulator for ``key``, creating it on first use."""
        try:
            target = self.summers[key]
        except KeyError:
            target = Summer()
            self.summers[key] = target
        target.add(num, weight)

    def get(self, key):
        """Return the accumulator for ``key``.

        An unknown key yields a fresh ``Summer`` that is not stored.
        """
        if key in self.summers:
            return self.summers[key]
        return Summer()


In [8]:
from torchsummaryX import summary

# Total number of ids: the frequent tokens plus the <pad> and <unk> entries.
vocab_size = len(word_id_mapping)
print('vocab_size', vocab_size)

# All-zero batch of token ids with the (batch_size, sentence_word_limit) shape;
# it is fed as both inputs purely to trace layer shapes and parameter counts.
sample = torch.zeros(batch_size, sentence_word_limit, dtype=torch.long)
summary(
    FastTextSiamese(vocab_size=vocab_size),
    x=sample,
    text2=sample,
)

vocab_size 46220
                   Kernel Shape Output Shape  Params Mult-Adds
Layer                                                         
0_embeddings       [100, 46220]    [64, 100]  4.622M    4.622M
1_hidden.Linear_0     [100, 10]     [64, 10]   1.01k      1.0k
2_hidden.ReLU_1               -     [64, 10]       -         -
3_embeddings       [100, 46220]    [64, 100]       -    4.622M
4_hidden.Linear_0     [100, 10]     [64, 10]       -      1.0k
5_hidden.ReLU_1               -     [64, 10]       -         -
6_out                   [10, 2]      [64, 2]    22.0      20.0
------------------------------------------------------------------
                         Totals
Total params          4.623032M
Trainable params      4.623032M
Non-trainable params        0.0
Mult-Adds              9.24602M


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embeddings,"[100, 46220]","[64, 100]",4622000.0,4622000.0
1_hidden.Linear_0,"[100, 10]","[64, 10]",1010.0,1000.0
2_hidden.ReLU_1,-,"[64, 10]",,
3_embeddings,"[100, 46220]","[64, 100]",,4622000.0
4_hidden.Linear_0,"[100, 10]","[64, 10]",,1000.0
5_hidden.ReLU_1,-,"[64, 10]",,
6_out,"[10, 2]","[64, 2]",22.0,20.0


In [20]:
from sklearn.metrics import precision_score, recall_score

net = FastTextSiamese(vocab_size=vocab_size).to(device)
optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, net.parameters()))
loss_func = nn.CrossEntropyLoss()

tot_epoch = 10


def run(data_loader, is_train=True, epoch=0):
    """Run one pass over ``data_loader`` and show running metrics on a progress bar.

    Args:
        data_loader: Iterable of ``(text1, text2, labels)`` batches.
        is_train: When True the weights are updated after every batch; when
            False the model is put in eval mode and gradients are disabled.
        epoch: Zero-based epoch index, used only for the progress-bar caption.

    The displayed loss, precision and recall are unweighted means of the
    per-batch values seen so far, not scores computed over the whole dataset.
    """
    net.train(is_train)
    pbar = tqdm(iterable=data_loader, colour='#1d3557' if is_train else '#457b9d')
    summer = MultiSummer()
    # Skip autograd bookkeeping on evaluation passes: nothing calls backward().
    with torch.set_grad_enabled(is_train):
        for t1, t2, labels in pbar:
            # Flatten (batch, 1) -> (batch,). A bare squeeze() would also drop
            # the batch axis when the last batch holds a single sample, which
            # CrossEntropyLoss then rejects.
            labels = labels.reshape(-1)
            y_hat = net(t1, t2)
            loss = loss_func(y_hat, labels)
            # metrics
            local_real = labels.detach().cpu().numpy()
            local_pred = y_hat.detach().argmax(dim=1).cpu().numpy()
            summer.put('loss', loss.item())
            summer.put('precision', precision_score(local_real, local_pred, zero_division=0))
            summer.put('recall', recall_score(local_real, local_pred, zero_division=0))
            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            pbar.set_description(
                f'Epoch {epoch + 1}/{tot_epoch} - '
                f'loss: {summer.get("loss")} '
                f'precision: {summer.get("precision")} '
                f'recall: {summer.get("recall")} '
            )


for epoch in range(tot_epoch):
    run(train_dl, is_train=True, epoch=epoch)
    run(valid_dl, is_train=False, epoch=epoch)

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]

  0%|          | 0/5054 [00:00<?, ?it/s]

  0%|          | 0/1264 [00:00<?, ?it/s]