# Train Models
This notebook contains the code needed to train the models and store them
on disk. It has to be clean: do not define functions here; define them in
an external `utils.py` and import them. It also has to be reproducible: if
you run it twice, the same output has to be displayed and stored on disk.

## Import Utils and libraries

In [None]:
# Mount Google Drive inside the Colab VM so the project folder becomes reachable.
# force_remount=True re-mounts even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
# List the working directory to confirm that the `drive` mount point is visible
%ls

# Optional (disabled): presumably releases the Colab runtime when finished — confirm before enabling
#from google.colab import runtime
#runtime.unassign()

Mounted at /content/drive/
[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
# Make our custom module directory importable
import sys

_PROJECT_ROOT = '/content/drive/MyDrive/JaimeSanchez_AlejandroVara_BrandonAlfaro_DavidIniguez/'

# Guard against duplicates so that re-running this cell does not keep growing sys.path
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

In [None]:
#Libraries to access the data and save the models
import os
import pickle


In [None]:
!pip install scikit-learn
!pip install cython
!pip install nltk
!pip install datasets
import nltk
nltk.download('stopwords')

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Project root on Google Drive; every other path in the notebook is derived from it
home_dir = '/content/drive/MyDrive/JaimeSanchez_AlejandroVara_BrandonAlfaro_DavidIniguez/'

# Export the same location as the HOME environment variable.
# NOTE: this overrides HOME for the whole kernel process.
os.environ['HOME'] = home_dir

In [None]:
# Bring every library, function and class defined in utils/utils.py into this
# notebook's namespace by executing the file in place.
# NOTE(review): exec runs whatever the file contains — only point it at a trusted file.
utils_path = os.path.join(home_dir, 'utils/utils.py')

# Read through a context manager so the file handle is closed deterministically,
# and compile with the real filename so tracebacks point at the lines of utils.py.
with open(utils_path) as utils_file:
    exec(compile(utils_file.read(), utils_path, 'exec'))

In [None]:
# Support libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Skseq modules
from skseq.sequences.id_feature import IDFeatures
from skseq.sequences.extended_feature import ExtendedFeatures
from skseq.sequences.structured_perceptron import StructuredPerceptron

In [None]:
# Project sub-folders, both resolved relative to home_dir:
# `data` holds the input CSV files, `fitted_models` the artefacts written by this notebook
data_dir, fitted_models_dir = (
    os.path.join(home_dir, subfolder) for subfolder in ('data', 'fitted_models')
)

### Get Training set

In [None]:
# Load the NER training data from the project's data folder into a DataFrame
train = pd.read_csv(os.path.join(data_dir, 'train_data_ner.csv'))

In [None]:
# read_csv parses the literal word 'None' as a missing value, so write the string
# back wherever `words` is NaN.
# NOTE(review): pandas' default NA parsing also turns tokens such as 'NA' or 'null'
# into NaN — confirm that none of those occur in the corpus.
train.loc[train['words'].isna(), 'words'] = 'None'

# Build the per-sentence word lists and tag lists (helper defined outside this notebook)
X_train, y_train = prepare_data(train)

In [None]:
# Sanity check: show the words and the gold tags of the fourth sentence
print(f'Words of fourth sentence:  {X_train[3]}')
print(f'Tags for the words of fourth sentence: {y_train[3]}')

Words of fourth sentence:  ['U.N.', 'relief', 'coordinator', 'Jan', 'Egeland', 'said', 'Sunday', ',', 'U.S.', ',', 'Indonesian', 'and', 'Australian', 'military', 'helicopters', 'are', 'ferrying', 'out', 'food', 'and', 'supplies', 'to', 'remote', 'areas', 'of', 'western', 'Aceh', 'province', 'that', 'ground', 'crews', 'can', 'not', 'reach', '.']
Tags for the words of fourth sentence: ['B-geo', 'O', 'O', 'B-per', 'I-per', 'O', 'B-tim', 'O', 'B-geo', 'O', 'B-gpe', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


### Create Vocabulary

In [None]:
# Build the word and tag vocabularies from the training sentences;
# the third value returned by create_vocabulary is not needed here and is discarded
word_dict, tag_dict, _ = create_vocabulary(X_train, y_train)

In [None]:
# Summarise the word vocabulary instead of dumping it: printing the whole mapping
# floods the notebook output, so only its size and a small sample are shown.
# assumes word_dict behaves like a dict (supports len() and .items()) — TODO confirm
print(f'Word Vocabulary : {len(word_dict)} entries, e.g. {list(word_dict.items())[:10]}')

# The tag vocabulary is small enough to be shown in full
print(f'Tag Vocabulary: {tag_dict}')

Tag Vocabulary: {'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-tim': 3, 'B-org': 4, 'I-geo': 5, 'B-per': 6, 'I-per': 7, 'I-org': 8, 'B-art': 9, 'I-art': 10, 'I-tim': 11, 'I-gpe': 12, 'B-nat': 13, 'I-nat': 14, 'B-eve': 15, 'I-eve': 16}


### Create Sequence List

In [None]:
# Location of the cached (pickled) training sequence list
train_seq_path = os.path.join(fitted_models_dir, 'train_seq.pkl')

if os.path.exists(train_seq_path):
    # Reuse the cached sequence list rather than rebuilding it.
    # NOTE: the cache is never invalidated — delete the file if the training data
    # or the vocabularies change. pickle.load can execute code embedded in the
    # file, so only load files written by this notebook.
    print("Sequence list already exists. Loading the saved sequence list.")
    with open(train_seq_path, 'rb') as f:
        train_seq = pickle.load(f)
else:
    train_seq = create_sequence_list(X_train, y_train, word_dict, tag_dict)

    # The target folder may not exist yet on a fresh project copy; create it
    # so that the write below cannot fail with FileNotFoundError
    os.makedirs(fitted_models_dir, exist_ok=True)

    # Save the sequence list
    with open(train_seq_path, 'wb') as f:
        pickle.dump(train_seq, f)

    print("Sequence list saved to:", train_seq_path)

Model already exists. Loading the saved model.


In [None]:
# Fourth training sentence in its encoded form (pairs of word id / tag id) ...
print(f"Sequence list of fourth sentence: {train_seq[3]}\n")
# ... and the same sequence decoded back to word/tag strings via the sequence list
print(f"Sequence to words and tags: {train_seq[3].to_words(sequence_list = train_seq)}")

Sequence list of fourth sentence: 57/1 58/0 59/0 60/6 61/7 62/0 63/3 31/0 64/1 31/0 65/2 13/0 66/2 42/0 67/0 36/0 68/0 69/0 70/0 13/0 71/0 7/0 72/0 73/0 1/0 74/0 75/1 76/0 19/0 77/0 78/0 79/0 80/0 81/0 21/0 

Sequence to words and tags: U.N./B-geo relief/O coordinator/O Jan/B-per Egeland/I-per said/O Sunday/B-tim ,/O U.S./B-geo ,/O Indonesian/B-gpe and/O Australian/B-gpe military/O helicopters/O are/O ferrying/O out/O food/O and/O supplies/O to/O remote/O areas/O of/O western/O Aceh/B-geo province/O that/O ground/O crews/O can/O not/O reach/O ./O 


## Train Models

### Structured Perceptron with Default Features

In [None]:
# Feature mapper with the default (ID) feature set over the training sequences
feature_mapper = IDFeatures(train_seq)
# Build the features for the dataset held by the mapper
feature_mapper.build_features()

In [None]:
# Inspect the initial, transition, final and emission features of the fourth sentence
show_feats(feature_mapper, train_seq[3])

Initial features
[70] init_tag:B-geo


Transition features
[11] prev_tag:B-geo::O
[3] prev_tag:O::O
[75] prev_tag:O::B-per
[77] prev_tag:B-per::I-per
[79] prev_tag:I-per::O
[32] prev_tag:O::B-tim
[34] prev_tag:B-tim::O
[9] prev_tag:O::B-geo
[11] prev_tag:B-geo::O
[21] prev_tag:O::B-gpe
[23] prev_tag:B-gpe::O
[21] prev_tag:O::B-gpe
[23] prev_tag:B-gpe::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[9] prev_tag:O::B-geo
[11] prev_tag:B-geo::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O
[3] prev_tag:O::O


Final features
[28] final_prev_tag:O


Emission features
[71] id:U.N.::B-geo
[72] id:relief::O
[73] id:coordinator::O
[74] id:Jan::B-per
[76] id:Egeland::I-per
[78] id:said::O
[80] id:Sunday::B-tim
[40] id:,::O
[81] id:U.S.::B-geo
[40] id:,::O
[82] id:Ind

In [None]:
# Structured perceptron over the default feature set (fitted or loaded in the next cell)
sp = StructuredPerceptron(word_dict, tag_dict, feature_mapper)
# Number of training epochs passed to sp.fit in the next cell
num_epochs = 15

In [None]:
%%time
import time

model_file_path = os.path.join(fitted_models_dir, 'sp')

if os.path.exists(model_file_path):
    print("Model already exists. Loading the saved model.")
    with open(model_file_path, 'rb') as f:
        sp = pickle.load(f)
else:
    # Fit the model
    sp.fit(feature_mapper.dataset, num_epochs)

    # Save the model
    sp.save_model(model_file_path)

    print("Model saved to:", model_file_path)

Epoch: 0 Accuracy: 0.893815
Epoch: 1 Accuracy: 0.931674
Epoch: 2 Accuracy: 0.940913
Epoch: 3 Accuracy: 0.946175
Epoch: 4 Accuracy: 0.950018
Epoch: 5 Accuracy: 0.952577
Epoch: 6 Accuracy: 0.954425
Epoch: 7 Accuracy: 0.956033
Epoch: 8 Accuracy: 0.957185
Epoch: 9 Accuracy: 0.958481
Epoch: 10 Accuracy: 0.959217
Epoch: 11 Accuracy: 0.960524
Epoch: 12 Accuracy: 0.961121
Epoch: 13 Accuracy: 0.961207
Epoch: 14 Accuracy: 0.961983
Model saved to: /content/drive/MyDrive/Segunda Entrega NLP /fitted_models/sp
CPU times: user 1h 50min 30s, sys: 3min 58s, total: 1h 54min 29s
Wall time: 1h 53min 27s


#### Profiling

In [None]:
import io
import pstats
import cProfile

# Profile one training epoch on a separate model instance so the fitted `sp` is untouched
sp_test = StructuredPerceptron(word_dict, tag_dict, feature_mapper)
# Dedicated name: assigning to `num_epochs` here would overwrite the training setting,
# so re-running the training cell afterwards would silently train for a single epoch.
profile_epochs = 1

pr = cProfile.Profile()
pr.enable()
try:
    sp_test.fit(feature_mapper.dataset, profile_epochs)
finally:
    # Always stop the profiler, even if fitting raises
    pr.disable()

# Render the statistics, ordered by cumulative time, into a text buffer and print it.
# print() is used so the cell has no trailing expression whose value would be echoed.
s = io.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

Epoch: 0 Accuracy: 0.893815
         630456675 function calls in 718.185 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000  718.185  359.093 /usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:3512(run_code)
        2    0.000    0.000  718.185  359.093 {built-in method builtins.exec}
        1    0.000    0.000  718.185  718.185 <ipython-input-106-2df70c8b34e6>:1(<cell line: 10>)
        1    0.001    0.001  718.185  718.185 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:25(fit)
        1    0.296    0.296  718.183  718.183 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:58(fit_epoch)
    38366    5.889    0.000  717.888    0.019 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:95(perceptron_update)
    38366    0.574    0.000  710.895    0.019 /content/drive/MyDriv

### Structured Perceptron with Extended Features

In [None]:
# Feature mapper with the extended feature set over the same training sequences
feature_mapper_ext = ExtendedFeatures(train_seq)
# Build the features for the dataset held by the mapper
feature_mapper_ext.build_features()

In [None]:
# Inspect the extended features of the fourth sentence; the emission section now also
# lists word-shape features (e.g. capitalized, lower, alphanum) next to the word id
show_feats(feature_mapper_ext, train_seq[3])

Initial features
[91] init_tag:B-geo


Transition features
[18] prev_tag:B-geo::O
[8] prev_tag:O::O
[99] prev_tag:O::B-per
[104] prev_tag:B-per::I-per
[106] prev_tag:I-per::O
[46] prev_tag:O::B-tim
[48] prev_tag:B-tim::O
[16] prev_tag:O::B-geo
[18] prev_tag:B-geo::O
[31] prev_tag:O::B-gpe
[33] prev_tag:B-gpe::O
[31] prev_tag:O::B-gpe
[33] prev_tag:B-gpe::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[16] prev_tag:O::B-geo
[18] prev_tag:B-geo::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O
[8] prev_tag:O::O


Final features
[39] final_prev_tag:O


Emission features
[92, 14, 93] id:U.N.::B-geo
[92, 14, 93] capitalized::B-geo
[92, 14, 93] all_capitalized::B-geo
[94, 5, 3] id:relief::O
[94, 5, 3] lower::O
[94, 5, 3] alphanum::O
[95, 5, 3] id:coordinator::O
[9

In [None]:
# Structured perceptron over the extended feature set (fitted or loaded in the next cell)
sp_ext = StructuredPerceptron(word_dict, tag_dict, feature_mapper_ext)
# Number of training epochs passed to sp_ext.fit in the next cell
num_epochs = 15

In [None]:
%%time
model_file_path = os.path.join(fitted_models_dir, 'sp_ext')

if os.path.exists(model_file_path):
    print("Model already exists. Loading the saved model.")
    with open(model_file_path, 'rb') as f:
        sp_ext = pickle.load(f)
else:
    # Fit the model
    sp_ext.fit(feature_mapper_ext.dataset, num_epochs)

    # Save the model
    sp_ext.save_model(model_file_path)

    print("Model saved to:", model_file_path)

Epoch: 0 Accuracy: 0.930567
Epoch: 1 Accuracy: 0.944572
Epoch: 2 Accuracy: 0.948740
Epoch: 3 Accuracy: 0.951372
Epoch: 4 Accuracy: 0.953064
Epoch: 5 Accuracy: 0.954413
Epoch: 6 Accuracy: 0.955548
Epoch: 7 Accuracy: 0.956655
Epoch: 8 Accuracy: 0.957372
Epoch: 9 Accuracy: 0.958522
Epoch: 10 Accuracy: 0.958742
Epoch: 11 Accuracy: 0.959553
Epoch: 12 Accuracy: 0.960095
Epoch: 13 Accuracy: 0.960533
Epoch: 14 Accuracy: 0.961149
Model saved to: /content/drive/MyDrive/Segunda Entrega NLP /fitted_models/sp_ext
CPU times: user 1h 54min 23s, sys: 4min 7s, total: 1h 58min 31s
Wall time: 1h 57min 28s


#### Profiling

In [None]:
# Profile one training epoch of the extended-feature model on a separate instance.
# Relies on cProfile, io and pstats imported in the profiling cell of the default-feature model.
sp_test_ext = StructuredPerceptron(word_dict, tag_dict, feature_mapper_ext)
# Dedicated name: assigning to `num_epochs` here would overwrite the training setting,
# so re-running a training cell afterwards would silently train for a single epoch.
profile_epochs = 1

pr = cProfile.Profile()
pr.enable()
try:
    sp_test_ext.fit(feature_mapper_ext.dataset, profile_epochs)
finally:
    # Always stop the profiler, even if fitting raises
    pr.disable()

# Render the statistics, ordered by cumulative time, into a text buffer and print it.
# print() is used so the cell has no trailing expression whose value would be echoed.
s = io.StringIO()
sortby = 'cumulative'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())

Epoch: 0 Accuracy: 0.930567
         630187159 function calls in 695.701 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000  695.700  347.850 /usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:3512(run_code)
        2    0.000    0.000  695.700  347.850 {built-in method builtins.exec}
        1    0.000    0.000  695.700  695.700 <ipython-input-113-8beb0ab62768>:1(<cell line: 6>)
        1    0.001    0.001  695.700  695.700 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:25(fit)
        1    0.288    0.288  695.699  695.699 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:58(fit_epoch)
    38366    3.939    0.000  695.411    0.018 /content/drive/MyDrive/Segunda Entrega NLP /skseq/sequences/structured_perceptron.py:95(perceptron_update)
    38366    0.539    0.000  690.701    0.018 /content/drive/MyDrive

### Transformers

In [None]:
# Hold out 10% of the sentences for validation.
# random_state fixes the shuffle, so the split (and everything trained on it) is the
# same on every run, as the reproducibility requirement of this notebook demands.
# NOTE: this rebinds X_train to the training portion while y_train keeps the full
# label list, so re-running this cell on its own fails with a length mismatch —
# re-run the data-preparation cell first.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Pre-process the word sequences of both splits (helper defined outside this notebook)
pre_train = preprocess_tuples(X_train)
pre_val = preprocess_tuples(X_val)

In [None]:
# Distinct tag values of the training data (np.unique returns them sorted)
tags = np.unique(train['tags'].values)
# presumably builds the index -> tag and tag -> index lookups — confirm in create_dict
index2tag, tag2index = create_dict (tags)

In [None]:
# Label names, in the key order of tag2index
label_names = [*tag2index.keys()]
print(label_names)

['O', 'B-art', 'I-art', 'B-eve', 'I-eve', 'B-geo', 'I-geo', 'B-gpe', 'I-gpe', 'B-nat', 'I-nat', 'B-org', 'I-org', 'B-per', 'I-per', 'B-tim', 'I-tim']


In [None]:
# Convert the tag sequences of both splits with the same tag2index mapping
Y_train_labels, Y_val_labels = (
    create_labels(split_tags, tag2index) for split_tags in (Y_train, Y_val)
)

In [None]:
from transformers import DistilBertForTokenClassification, DistilBertTokenizerFast
from datasets import Dataset

# Seed torch before the model is built: the classification head is newly initialised
# (see the checkpoint warning in the output) and the training loader shuffles, so
# without a fixed seed every run starts from different weights and batch orders.
torch.manual_seed(42)

# Pre-trained cased DistilBERT with a token-classification head sized to our label set
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(label_names))
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Tokenize both splits and align the labels with the tokens (helper defined outside this notebook)
train_dataset, train_labels = tokenize_and_align_labels(pre_train, Y_train_labels, tokenizer)
val_dataset, val_labels = tokenize_and_align_labels(pre_val, Y_val_labels, tokenizer)

# Training hyper-parameters
num_train_epochs = 30
batch_size = 200
learning_rate = 2e-5
weight_decay = 0.0

# Only the training batches are shuffled; validation keeps a fixed order so the
# validation loss is computed over the same batches on every epoch and every run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# NOTE(review): data_collator is not used anywhere in this cell (both loaders use
# collate_fn) — confirm whether it can be dropped.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Set up the device once and reuse it for the model and the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Define Cross-Entropy Loss
criterion = nn.CrossEntropyLoss()

# Number of epochs with no improvement after which training will be stopped
patience = 2

model = fit_model(model, train_loader, val_loader, num_train_epochs, optimizer, criterion, patience, device=device)

# Store the fine-tuned model next to the other fitted models
# (same directory as before, without the redundant './' path component)
filetransformers = os.path.join(fitted_models_dir, 'ner_dl_distilbert')
model.save_pretrained(filetransformers)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/30, Training Loss: 0.48094756375847525
Epoch 1/30, Validation Loss: 0.16564376056194305
Epoch 2/30, Training Loss: 0.16047781912577636
Epoch 2/30, Validation Loss: 0.13377927504479886
Epoch 3/30, Training Loss: 0.13045718960162533
Epoch 3/30, Validation Loss: 0.12181219607591628
Epoch 4/30, Training Loss: 0.11275484150200221
Epoch 4/30, Validation Loss: 0.11726282201707364
Epoch 5/30, Training Loss: 0.09858102713189373
Epoch 5/30, Validation Loss: 0.11893790178000926
Epoch 6/30, Training Loss: 0.08611774028984108
Epoch 6/30, Validation Loss: 0.11978691816329956
Early stopping triggered after epoch 6
