In [53]:
import torch
import spacy
import pandas as pd
import string
import emoji
import re
import fasttext
from gensim.models import KeyedVectors
from gensim.utils import tokenize
import gensim
from tqdm import tqdm
import numpy as np

# Data preprocessing

In [35]:
tweets = pd.read_csv('../../data/tweets_train.tsv', sep='\t', converters={'target': str, 'id_str': str})

In [None]:
vector_model = KeyedVectors.load_word2vec_format('../../data/language_models/wiki.multi.pl.vec')

In [44]:
def clean_text(text):
  regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
  url_free = re.compile(regex).sub('', text)
  tokens = tokenize(url_free, lowercase=True)
  return ' '.join(list(tokens))

def filter_text(model, text):
  return len([word for word in tokenize(text, lowercase=True) if word in model]) > 0

In [46]:
tweets['clean_text'] = tweets.apply(lambda row: clean_text(row["full_text"]), axis=1)
tweets_found = tweets[tweets.apply(lambda row: filter_text(vector_model, row['clean_text']), axis=1)]

In [58]:
def prepare(model, tweets, targets):
  sequences = []
  targets_final = []
  for tweet, target in tqdm(zip(tweets, targets)):
    words = []
    for word in tokenize(tweet, lowercase=True):
      if word in model:
        vec = model.get_vector(word)
        norm = np.linalg.norm(vec)
        words.append(vec / norm)
    if len(words) > 0:
      sequences.append(torch.tensor(words))
      targets_final.append(target)

  max_len = max(sequences, key=lambda x: x.shape[0]).shape[0]
  padded_sequences = []
  for seq in sequences:
    seq_len = seq.shape[0]
    padding = torch.zeros(max_len - seq_len, 300)
    padded_sequences.append(torch.cat((padding, seq), dim=0))

  pad_text = torch.stack(padded_sequences)
  labels_tensor = torch.tensor(targets_final)

  return pad_text, labels_tensor
  


In [60]:
%load_ext tensorboard

%tensorboard --logdir lightning_logs/

# Data module

In [75]:
import torch
import numpy as np
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from typing import Optional, Tuple


class TweetsDataModule(pl.LightningDataModule):

    def __init__(self, tweets: pd.DataFrame, test_size: float=0.2, batch_size: int=128):
        super().__init__()
        self.tweets = tweets
        self.test_size = test_size
        self.batch_size = batch_size
        self.le = LabelEncoder()

    def setup(self, stage: Optional[str] = None):
        self.le.fit(tweets["target"])
        targets = tweets["target"].apply(lambda x: self.le.transform([x])[0])
        X, y = prepare(vector_model, tweets['clean_text'], targets)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        
    
    def train_dataloader(self):
        return DataLoader(TensorDataset(self.X_train, self.y_train), batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(TensorDataset(self.X_test, self.y_test), batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(TensorDataset(self.X_test, self.y_test), batch_size=self.batch_size)

    def predict_dataloader(self):
        return DataLoader(TensorDataset(self.X_test, self.y_test), batch_size=self.batch_size)

    def get_test_sets(self):
        return self.X_test, self.y_test
    
    def get_label_encoder(self):
        return self.le

# LSTM model

In [89]:
from sklearn.metrics import f1_score, accuracy_score
import torch.nn as nn
from sklearn.metrics import classification_report

class LSTM(pl.LightningModule):
  
  def __init__(self, input_dim, hidden_dim, output_dim, bidirectional=False, learning_rate=1e-4):
      super().__init__()
      self.input_dim = input_dim  # this is the number of features
      self.hidden_dim = hidden_dim
      self.num_layers = 1
      self.lstm = torch.nn.LSTM(input_dim,
                            hidden_dim,
                            num_layers=self.num_layers,
                            batch_first=True,
                            bidirectional=bidirectional)
      self.classifier = nn.Sequential(
        nn.Linear(in_features=2*hidden_dim if bidirectional else hidden_dim, out_features=output_dim),
        nn.Softmax(dim=1)
      )
      self.learning_rate = learning_rate
      self.loss = nn.CrossEntropyLoss()

  def forward(
      self,
      x: torch.Tensor
  ) -> torch.Tensor:
      out, (hn, _) = self.lstm(x)
      return self.classifier(out[:, -1, :])

      # return out
  
  def training_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten()
    y_hat = self(x)
    loss = self.loss(y_hat, y)
    self.log('train_loss', loss, on_epoch=True, on_step=False)
    return loss

  def validation_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten()
    y_hat = self(x)
    loss = self.loss(y_hat, y)
    y_hat = torch.argmax(self(x), 1)
    f1 = f1_score(y.cpu(), y_hat.cpu(), average='micro')
    acc = accuracy_score(y.cpu(), y_hat.cpu())
    self.log("val_loss", loss, prog_bar=True)
    self.log("val_f1_micro", f1, prog_bar=True)
    self.log("val_acc", acc, prog_bar=True)
    return loss

  def test_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten()
    y_hat = self(x)
    y_hat = torch.argmax(self(x), 1)

    report = classification_report(y, y_hat, output_dict=True)
    self.log_dict(report)
    return report

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

  def predict_step(
    self,
    batch,
    batch_idx: int,
    dataloader_idx: Optional[int] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
    x, y = batch
    z = self(x)
    return z, y

# Training

In [92]:
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import loggers as pl_loggers


def train_model(dataloader, name, epochs=40, lr=1e-4, bidirectional=False):
    AVAIL_GPUS = min(1, torch.cuda.device_count())
    model = LSTM(
      input_dim=300,
      hidden_dim=384,
      output_dim=4,
      learning_rate=lr,
      bidirectional=bidirectional
    )
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath="../../data/lstm/",
        filename=name,
        save_top_k=1,
        mode="min",
    )
    tb_logger = pl_loggers.TensorBoardLogger("lightning_logs/", name=name, log_graph=True)
    trainer = pl.Trainer(
        max_epochs=epochs,
        gpus=AVAIL_GPUS,
        callbacks=[checkpoint_callback],
        logger=tb_logger,
    )
    trainer.fit(model, dataloader)

    return trainer

In [93]:
data = TweetsDataModule(tweets=tweets, batch_size=32)
trainer = train_model(data, 'lstm_384', epochs=50, bidirectional=True, lr=0.001)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
2499it [00:01, 1417.87it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(

  | Name       | Type             | Params
------------------------------------------------
0 | lstm       | LSTM             | 2.1 M 
1 | classifier | Sequential       | 3.1 K 
2 | loss       | CrossEntropyLoss | 0     
------------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.442     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                              

  rank_zero_warn(


Epoch 49: 100%|██████████| 79/79 [00:01<00:00, 57.45it/s, loss=0.929, v_num=3, val_loss=1.270, val_f1_micro=0.464, val_acc=0.464]
