# Named Entity Recognition(NER) on Twitter 

In these notebooks, I will use five ways to solve a custom Named Entity Recognition (NER) problem on Twitter data. NER is a task that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.

In this dataset, every token is labelled with one of 21 different tags.

tags = ['O', 'B-musicartist', 'I-musicartist', 'B-product', 'I-product', 'B-company', 'B-person', 'B-other', 'I-other', 'B-facility',
    'I-facility', 'B-sportsteam', 'B-geo-loc', 'I-geo-loc', 'I-company', 'I-person', 'B-movie', 'I-movie', 'B-tvshow', 'I-tvshow',
    'I-sportsteam'],

where 'B-' and 'I-' prefixes stand for the beginning and inside of the entity, 'O' stands for out of tag or no tag.

### Models

In the following three notebooks, we will use five ways to examine the dataset:

- Naive Bayes multinomial model
- Conditional Random Fields (CRFs)
- Custom SpaCy
- BERT in Spark NLP
- <mark>Simple Transformer</mark> 

In this notebook we will discuss Simple Transformer, using embeddings from 'bert-base-cased' and 'roberta-large'.

### Named Entity Recognition - Simple Transformer

mount drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

install simpletransformers

In [None]:
#!pip install simpletransformers

install NVIDIA apex

In [None]:
# %%writefile setup.sh

# git clone https://github.com/NVIDIA/apex
# cd apex
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [None]:
# !sh setup.sh

#### Preprocess data

In [None]:
def read_data(file_path):
    """Read a token-per-line NER file into parallel token and tag lists.

    Every non-blank line must contain a token and its tag separated by
    whitespace. One or more blank lines end the current tweet. Tokens that
    start with "@" are replaced by "<USR>", and tokens that start with
    "http://" or "https://" are replaced by "<URL>".

    Args:
        file_path: Path of the UTF-8 encoded data file.

    Returns:
        A ``(tokens, tags)`` pair. Both are lists with one inner list per
        tweet; ``tokens[i][j]`` is tagged with ``tags[i][j]``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.
    """
    tokens = []
    tags = []

    tweet_tokens = []
    tweet_tags = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_path, encoding='utf-8') as data_file:
        for line in data_file:
            line = line.strip()
            if not line:
                # Blank line: close the current tweet, ignoring repeated blanks.
                if tweet_tokens:
                    tokens.append(tweet_tokens)
                    tags.append(tweet_tags)
                tweet_tokens = []
                tweet_tags = []
                continue

            token, tag = line.split()
            # Replace all users with <USR> token
            if token.startswith("@"):
                token = "<USR>"
            # Replace all urls with <URL> token
            elif token.startswith(("http://", "https://")):
                token = "<URL>"

            tweet_tokens.append(token)
            tweet_tags.append(tag)

    # A file that does not end with a blank line still has a pending tweet.
    if tweet_tokens:
        tokens.append(tweet_tokens)
        tags.append(tweet_tags)

    return tokens, tags

In [None]:
# Parse both data splits with read_data; each call yields a list of per-tweet
# token lists and a parallel list of per-tweet tag lists.
# NOTE(review): the paths are relative, so this presumably expects the working
# directory to be the one Google Drive is mounted under — confirm.
train_tokens, train_tags = read_data('drive/My Drive/NER/data/train.txt')
test_tokens, test_tags = read_data('drive/My Drive/NER/data/test.txt')

In [None]:
import pandas as pd

In [None]:
# One row per token. explode() repeats the original row label for every token
# of a tweet, so that label is stored as sentence_id before the index is
# renumbered.
df_train_tokens = (
    pd.DataFrame({'words': train_tokens})
    .explode('words')
    .assign(sentence_id=lambda frame: frame.index)
    .reset_index(drop=True)
)

# One row per tag, renumbered the same way so rows line up with the tokens.
df_train_tags = pd.DataFrame({'tags': train_tags}).explode('tags')
df_train_tags = df_train_tags.reset_index(drop=True)

In [None]:
# df_train is another name for the token frame, not a copy, so adding the
# column below also adds it to df_train_tokens.
df_train = df_train_tokens
# Column assignment aligns on the index; both frames were renumbered with
# reset_index(drop=True), so tag i lands on token i.
df_train["labels"] = df_train_tags["tags"]

In [None]:
# Arrange the columns as sentence_id, words, labels.
# NOTE(review): presumably the layout NERModel expects for its input frame — confirm.
columns_titles = ['sentence_id', 'words', 'labels']
df_train = df_train.reindex(columns_titles, axis='columns')


In [None]:
# Number of rows (one per token) in the training frame.
len(df_train.index)

In [None]:
import numpy as np

# Distinct tags found in the training frame, converted to a plain Python list.
labels = df_train["labels"].unique().tolist()

In [None]:
# One row per token. explode() repeats the original row label for every token
# of a tweet, so that label is stored as sentence_id before the index is
# renumbered.
df_test_tokens = (
    pd.DataFrame({'words': test_tokens})
    .explode('words')
    .assign(sentence_id=lambda frame: frame.index)
    .reset_index(drop=True)
)

# One row per tag, renumbered the same way so rows line up with the tokens.
df_test_tags = pd.DataFrame({'tags': test_tags}).explode('tags')
df_test_tags = df_test_tags.reset_index(drop=True)

In [None]:
# df_test is another name for the token frame, not a copy, so adding the
# column below also adds it to df_test_tokens.
df_test = df_test_tokens
# Column assignment aligns on the index; both frames were renumbered with
# reset_index(drop=True), so tag i lands on token i.
df_test["labels"] = df_test_tags["tags"]

In [None]:
# Arrange the columns as sentence_id, words, labels, matching the training frame.
columns_titles = ['sentence_id', 'words', 'labels']
df_test = df_test.reindex(columns_titles, axis='columns')

In [None]:
# Number of rows (one per token) in the test frame.
len(df_test.index)

#### Model Training 1:
- 'bert-base-cased' embedding + simpletransformers

In [None]:
import logging
import time

from simpletransformers.ner import NERModel

# Wall-clock timer: the printed figure covers logging setup, model
# construction and training.
start = time.time()

# INFO for everything, but only warnings and above from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# 'bert' model type with the 'bert-base-cased' weights, using the tag list
# collected from the training frame.
model1 = NERModel(
    'bert',
    'bert-base-cased',
    labels=labels,
    args={
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "drive/My Drive/MODEL1",
        'overwrite_output_dir': True,
        "save_model_every_epoch": False,
        'reprocess_input_data': True,
        "train_batch_size": 8,
        'num_train_epochs': 5,
        "max_seq_length": 256,
        "gradient_accumulation_steps": 1,
    },
    use_cuda=True,
)
model1.train_model(df_train)

# Elapsed seconds since `start`.
print(time.time() - start)

#### Model Training 2:
- 'roberta-large' embedding + simpletransformers

In [None]:
import logging
import time

from simpletransformers.ner import NERModel

# Wall-clock timer: the printed figure covers logging setup, model
# construction and training.
start = time.time()

# INFO for everything, but only warnings and above from the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# 'roberta' model type with the 'roberta-large' weights, using the tag list
# collected from the training frame.
model2 = NERModel(
    'roberta',
    'roberta-large',
    labels=labels,
    args={
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "drive/My Drive/NER/MODEL2",
        'overwrite_output_dir': True,
        "save_model_every_epoch": False,
        'reprocess_input_data': True,
        "train_batch_size": 8,
        'num_train_epochs': 5,
        "max_seq_length": 256,
        "gradient_accumulation_steps": 1,
    },
    use_cuda=True,
)
model2.train_model(df_train)

# Elapsed seconds since `start`.
print(time.time() - start)

#### Evaluation

In [None]:
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Evaluate the model1
result1_train, model_outputs1_train, predictions1_train = model1.eval_model(df_train)
result1_test, model_outputs1_test, predictions1_test = model1.eval_model(df_test)

In [None]:
# Join model1's predictions into a single array per split so they can be
# compared with the flat label columns.
# NOTE(review): assumes eval_model returns one list of tags per sentence, in
# the same token order as df_train / df_test — confirm.
y_train_pred1 = np.hstack(predictions1_train)
y_test_pred1 = np.hstack(predictions1_test)

In [None]:
# Reference tags for both splits, taken straight from the label columns.
y_train, y_test = df_train["labels"].values, df_test["labels"].values

# Sorted distinct tags that occur in the test split, as a plain Python list.
classes = np.unique(y_test).tolist()

In [None]:
# Tags to show in the per-class reports: everything in `classes` except 'O'.
# The 'O' tag is removed by value rather than by dropping the last element, so
# the result stays correct even if 'O' is missing from `classes` or is not its
# final entry. A new list is built; `classes` itself is left unchanged.
new_classes = [tag for tag in classes if tag != 'O']
new_classes

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of model1 over every tag listed in `classes`.
# NOTE(review): `classes` holds every tag seen in y_test, whereas the
# classification reports use `new_classes` — confirm that is intended.
print('-' * 20, ' Train set quality: ', '-' * 20, sep='')
print(f1_score(y_train, y_train_pred1, labels=classes, average='micro'))
print('-' * 20, ' Test set quality: ', '-' * 20, sep='')
print(f1_score(y_test, y_test_pred1, labels=classes, average='micro'))

In [None]:
# Per-tag precision/recall/F1 for model1, restricted to the tags in `new_classes`.
print('-' * 20, ' Train set quality: ', '-' * 20, sep='')
print(classification_report(y_train, y_train_pred1, labels=new_classes))
print('-' * 20, ' Test set quality: ', '-' * 20, sep='')
print(classification_report(y_test, y_test_pred1, labels=new_classes))

In [None]:
# Evaluate the model2
result2_train, model_outputs2_train, predictions2_train = model2.eval_model(df_train)
result2_test, model_outputs2_test, predictions2_test = model2.eval_model(df_test)

In [None]:
# Join model2's predictions into a single array per split so they can be
# compared with the flat label columns.
# NOTE(review): assumes eval_model returns one list of tags per sentence, in
# the same token order as df_train / df_test — confirm.
y_train_pred2 = np.hstack(predictions2_train)
y_test_pred2 = np.hstack(predictions2_test)

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of model2 over every tag listed in `classes`.
# NOTE(review): `classes` holds every tag seen in y_test, whereas the
# classification reports use `new_classes` — confirm that is intended.
print('-' * 20, ' Train set quality: ', '-' * 20, sep='')
print(f1_score(y_train, y_train_pred2, labels=classes, average='micro'))
print('-' * 20, ' Test set quality: ', '-' * 20, sep='')
print(f1_score(y_test, y_test_pred2, labels=classes, average='micro'))

In [None]:
# Per-tag precision/recall/F1 for model2, restricted to the tags in `new_classes`.
print('-' * 20, ' Train set quality: ', '-' * 20, sep='')
print(classification_report(y_train, y_train_pred2, labels=new_classes))
print('-' * 20, ' Test set quality: ', '-' * 20, sep='')
print(classification_report(y_test, y_test_pred2, labels=new_classes))