## Char BiLSTM for MIT Movies
I had planned to turn this repository into an installable package (with a `setup.py` and everything), but deadlines and responsibilities at my current workplace have left me no time to do that. Instead, I have described the structure of the project in the `README.md` file.

In [1]:
import os
import sys
import json
import numpy as np
from tqdm import tqdm
sys.path.append("..")

from torch import nn
from torch.optim import Adam
from src.namedentityrecognizer.trainers import CharBilstmTrainer
from src.namedentityrecognizer.models.char_lstm import CharBilstm
from src.namedentityrecognizer.utils.processors import NerPreProcessor
from src.namedentityrecognizer.data.build_dataset import CharCorpus, BuildData

In [2]:
# Locate the project directory dynamically so the notebook's paths work for every
# user without hard-coding a machine-specific absolute path.
#
# `_dh` is the directory history that IPython injects into the interactive
# namespace. It does not exist when this cell is run as a plain script, so fall
# back to the current working directory in that case.
PROJECT_DIR_NAME = "NamedEntityRecognizer"
candidate_dirs = globals().get("_dh") or [os.getcwd()]

# Keep the first candidate that has the project folder as one of its path
# components. Entries are coerced to `str` so the check also works if an entry
# is a `pathlib.Path` rather than a string.
absolute_path = next(
    (
        str(candidate)
        for candidate in candidate_dirs
        if PROJECT_DIR_NAME in str(candidate).split(os.sep)
    ),
    None,
)

if absolute_path is None:
    # Fail right here with an actionable message instead of a NameError (or a
    # stale value left over from a previous run) further down the notebook.
    raise RuntimeError(
        f"Could not find a directory containing '{PROJECT_DIR_NAME}' among "
        f"{[str(candidate) for candidate in candidate_dirs]}; "
        "start the notebook from inside the project folder."
    )
print(absolute_path)

/home/karaz/Desktop/NamedEntityRecognizer


In [3]:
# Build the corpus from the tab-formatted MIT Movies files and report how many
# sentences ended up in each split.
# NOTE(review): the files in this folder are written by the BuildData cell
# further down; on a fresh checkout that cell presumably has to run before
# this one - confirm.
corpus_dir = os.path.join(absolute_path, "data/modified/mitmovies_tab_format")
dataset = CharCorpus(input_folder=corpus_dir, min_word_freq=3, batch_size=64)

print(f"Train set: {len(dataset.train_dataset)} sentences")
print(f"Test set: {len(dataset.test_dataset)} sentences")

Train set: 9775 sentences
Test set: 2443 sentences


In [4]:
# Produce the tab-separated train/test files that the corpus loader consumes
# from the raw MIT Movies .bio files. Per the examples below, the tag-first
# columns of the raw files become word-first columns:
#   O	good             ->  good     O
#   B-GENRE	romantic ->  romantic B-GENRE
#   I-GENRE	comedies ->  comedies I-GENRE
# NOTE(review): this cell is live and rewrites both files on every run; an
# earlier note here said "Uncomment if needed", so it may have been meant to be
# optional - confirm.
conversions = (
    ("data/raw/mitmovies/engtrain.bio", "data/modified/mitmovies_tab_format/train.txt"),
    ("data/raw/mitmovies/engtest.bio", "data/modified/mitmovies_tab_format/test.txt"),
)
for raw_relpath, converted_relpath in conversions:
    BuildData.create_finaldata(
        os.path.join(absolute_path, raw_relpath),
        os.path.join(absolute_path, converted_relpath),
        splits="\t",
    )

In [5]:
# Assemble the tagger: sizes that depend on the corpus come from the dataset's
# vocabularies, everything else is a fixed hyper-parameter for this run.
word_pad = dataset.word_pad_idx
char_pad = dataset.char_pad_idx

model_kwargs = dict(
    # Vocabulary-dependent dimensions.
    input_dim=len(dataset.word_field.vocab),
    char_input_dim=len(dataset.char_field.vocab),
    output_dim=len(dataset.tag_field.vocab),
    # Word- and character-level embedding sizes.
    embedding_dim=300,
    char_emb_dim=25,
    # Character CNN.
    char_cnn_filter_num=5,
    char_cnn_kernel_size=3,
    # Recurrent encoder.
    hidden_dim=64,
    lstm_layers=2,
    # Dropout rates, one per stage.
    emb_dropout=0.5,
    cnn_dropout=0.25,
    lstm_dropout=0.1,
    fc_dropout=0.25,
    # Padding indices.
    word_pad_idx=word_pad,
    char_pad_idx=char_pad,
)
bilstm = CharBilstm(**model_kwargs)

# No pretrained vectors are supplied here.
# NOTE(review): `freeze=True` presumably only matters when `pretrained` is
# given - confirm in `init_embeddings`.
bilstm.init_embeddings(
    char_pad_idx=char_pad,
    word_pad_idx=word_pad,
    pretrained=None,
    freeze=True,
)

# Summarise the model: trainable parameter count, then the layer layout.
print(f"The model has {bilstm.count_parameters():,} trainable parameters.")
print(bilstm)

The model has 1,028,749 trainable parameters.
CharBilstm(
  (embedding): Embedding(2244, 300, padding_idx=1)
  (emb_dropout): Dropout(p=0.5, inplace=False)
  (char_emb): Embedding(39, 25, padding_idx=1)
  (char_cnn): Conv1d(25, 125, kernel_size=(3,), stride=(1,), groups=25)
  (cnn_dropout): Dropout(p=0.25, inplace=False)
  (lstm): LSTM(425, 64, num_layers=2, dropout=0.1, bidirectional=True)
  (fc_dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)


In [6]:
# Hook the model and corpus up to the trainer and run training.
# The optimizer and the loss are handed over as classes, not instances.
num_epochs = 5

ner = CharBilstmTrainer(
    model=bilstm,
    data=dataset,
    optimizer_cls=Adam,
    loss_fn_cls=nn.CrossEntropyLoss,
    log_file="char_bilstm_vanilla",
)
ner.train(num_epochs)

Epoch: 01 | Epoch Time: 0m 23s
	Trn Loss: 1.444 | Trn Acc: 66.13%
	Val Loss: 0.790 | Val Acc: 80.55% | Val Precision: 58.77% | Val Recall: 59.20% | Val F1 Macro: 54.81% | Val F1 Micro: 80.60%
Epoch: 02 | Epoch Time: 0m 23s
	Trn Loss: 0.513 | Trn Acc: 87.15%
	Val Loss: 0.489 | Val Acc: 87.67% | Val Precision: 73.82% | Val Recall: 76.50% | Val F1 Macro: 72.22% | Val F1 Micro: 87.06%
Epoch: 03 | Epoch Time: 0m 23s
	Trn Loss: 0.362 | Trn Acc: 90.74%
	Val Loss: 0.427 | Val Acc: 89.18% | Val Precision: 76.67% | Val Recall: 78.20% | Val F1 Macro: 74.04% | Val F1 Micro: 88.27%
Epoch: 04 | Epoch Time: 0m 23s
	Trn Loss: 0.301 | Trn Acc: 92.31%
	Val Loss: 0.392 | Val Acc: 89.93% | Val Precision: 78.82% | Val Recall: 79.45% | Val F1 Macro: 76.17% | Val F1 Micro: 88.96%
Epoch: 05 | Epoch Time: 0m 23s
	Trn Loss: 0.265 | Trn Acc: 93.18%
	Val Loss: 0.387 | Val Acc: 90.22% | Val Precision: 78.56% | Val Recall: 79.71% | Val F1 Macro: 75.83% | Val F1 Micro: 89.11%


In [8]:
# Tag a free-form query with the trained model. The call is kept as the cell's
# last expression so the returned tuple is displayed below the printed table.
sample_query = "4 star movies that Nicholas Cage is playing"
ner.infer(sample_query)

word    	unk     	pred tag
4       	        	O      
star    	        	O      
movies  	        	O      
that    	        	O      
Nicholas	        	B-ACTOR
Cage    	        	I-ACTOR
is      	        	O      
playing 	        	O      


(['4', 'star', 'movies', 'that', 'Nicholas', 'Cage', 'is', 'playing'],
 ['O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR', 'O', 'O'],
 [])