## Library import

In [34]:
import pandas as pd
import os
import random

from tqdm import tqdm
import json, glob, os, random
import argparse
import logging
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import re
import emoji
from seqeval.metrics import f1_score, precision_score, recall_score

import sys
sys.argv=['']
del sys



## pretrained NER

In [35]:
!nvidia-smi

Wed Mar 23 06:21:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   34C    P0    57W / 300W |  19642MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   43C    P0    71W / 300W |  13700MiB / 32480MiB |     31%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [36]:
import torch 

# Pin this notebook to GPU 3 when CUDA is present; otherwise stay on the CPU.
# torch.cuda.set_device() raises on a host without CUDA, so it is only called
# inside the same availability check that selects the device.
if torch.cuda.is_available():
    torch.cuda.set_device(3)
    device = torch.device('cuda:3')
else:
    device = torch.device('cpu')
device_idx = "cuda:3"
# Echo the active CUDA device index as the cell output (nothing on CPU-only hosts).
torch.cuda.current_device() if torch.cuda.is_available() else None

3

In [37]:
# Prefix prepended to 'nerbertweet.pt' when train() saves a checkpoint.
PATH = ''

### Helper classes and training utilities

In [38]:
# fold here
import json, glob, os, random
import argparse
import logging
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import re
import emoji
from seqeval.metrics import f1_score, precision_score, recall_score

import sys
sys.argv=['']
del sys


# Module-level logger used by train() for progress messages.
logger = logging.getLogger(__name__)

# Short model name (value of --bert_model) -> Hugging Face model identifier.
model_dict = dict(
    indobertweet='indolem/indobertweet-base-uncased',
    indobert='indolem/indobert-base-uncased',
)


def find_url(string):
    """Return every URL-like substring of ``string``, in order of appearance.

    The pattern accepts http(s):// links, www. hosts and bare ``domain.tld/``
    forms; the outermost capture group holds the complete match.
    """
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = []
    for match in re.finditer(pattern, string):
        # Group 1 wraps the whole URL; the inner groups only serve the grammar.
        found.append(match.group(1))
    return found

def preprocess_tweet(tweet):
    """Normalise a tweet: textual emoji, lower case, masked mentions and URLs.

    Emoji are replaced via ``emoji.demojize`` and the text is lower-cased.
    Each whitespace-separated token is then mapped as follows:
    tokens starting with '@' become '@USER', tokens containing a URL (or the
    literal 'httpurl') become 'HTTPURL', everything else is kept unchanged.
    The tokens are re-joined with single spaces.
    """
    def _mask(token):
        if token[0] == '@':
            return '@USER'
        if find_url(token) != [] or token == 'httpurl':
            return 'HTTPURL'
        return token

    lowered = emoji.demojize(tweet).lower()
    return ' '.join(_mask(token) for token in lowered.split())

def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch from ``args.seed``.

    The CUDA generators are seeded as well when ``args.n_gpu`` is positive.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


class BertData():
    """Turns word-level (words, label-ids) examples into fixed-length BERT inputs.

    Every position that must not be scored (continuation word-pieces, [CLS],
    [SEP], padding) is labelled with ``args.vocab_label_size``, which serves as
    the "ignore" label id.
    """

    def __init__(self, args):
        """Load the tokenizer named by ``args.bert_model`` and cache special-token ids.

        Args:
            args: namespace providing ``bert_model`` (a key of ``model_dict``),
                ``max_token`` and ``vocab_label_size``.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_dict[args.bert_model], do_lower_case=True)
        self.sep_token = '[SEP]'
        self.cls_token = '[CLS]'
        self.pad_token = '[PAD]'
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]
        self.MAX_TOKEN = args.max_token
        self.args = args

    def preprocess_one(self, src_txt, label):
        """Encode one sentence.

        Args:
            src_txt: list of words.
            label: list of label ids, one per word of ``src_txt``.

        Returns:
            ``(token_ids, segment_ids, labels)``, three lists of length
            ``self.MAX_TOKEN``.
        """
        # Use the namespace stored on the instance rather than a module-level
        # ``args`` so the class works wherever it is constructed.
        ignore_label = self.args.vocab_label_size
        new_src = []
        new_label = []

        for idx, word in enumerate(src_txt):
            # If the tokenizer yields no sub-token for a word, substitute the
            # unknown token so the word still owns exactly one labelled
            # position and word/label alignment is preserved.
            subwords = self.tokenizer.tokenize(word) or [self.tokenizer.unk_token]
            new_src += subwords
            # Only the first word-piece carries the word's label.
            new_label += [label[idx]] + [ignore_label] * (len(subwords) - 1)

        src_subtokens = [self.cls_token] + new_src + [self.sep_token]
        new_label = [ignore_label] + new_label + [ignore_label]
        src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
        if len(src_subtoken_idxs) > self.MAX_TOKEN:
            src_subtoken_idxs = src_subtoken_idxs[:self.MAX_TOKEN]
            src_subtoken_idxs[-1] = self.sep_vid
            new_label = new_label[:self.MAX_TOKEN]
            # The last slot now holds [SEP]; it must not keep the label of the
            # word-piece it replaced.
            new_label[-1] = ignore_label
        else:
            len_to_add = self.MAX_TOKEN - len(src_subtoken_idxs)
            src_subtoken_idxs += [self.pad_vid] * len_to_add
            new_label += [ignore_label] * len_to_add

        segments_ids = [0] * len(src_subtoken_idxs)
        assert len(src_subtoken_idxs) == len(segments_ids) == len(new_label)
        return src_subtoken_idxs, segments_ids, new_label

    def preprocess(self, src_txts, labels):
        """Encode a list of sentences with :meth:`preprocess_one`.

        Args:
            src_txts: list of word lists.
            labels: list of label-id lists, parallel to ``src_txts``.

        Returns:
            List of ``(token_ids, segment_ids, labels)`` tuples.
        """
        assert len(src_txts) == len(labels)
        return [self.preprocess_one(words, tags) for words, tags in zip(src_txts, labels)]


class Batch():
    """One mini-batch of pre-processed examples moved onto ``device``."""

    def __init__(self, data, idx, batch_size, device):
        """Slice ``data[idx:idx+batch_size]`` and build the batch tensors.

        Each element of ``data`` is a ``(token_ids, segment_ids, labels)``
        triple. The attention mask is 1 where the token id is non-zero.
        """
        window = data[idx:idx+batch_size]
        src, seg, label = (torch.tensor([row[field] for row in window]) for field in range(3))
        # Comparison yields booleans; adding 0 turns them into integers.
        mask_src = src.ne(0) + 0

        self.src, self.seg, self.label, self.mask_src = (
            tensor.to(device) for tensor in (src, seg, label, mask_src)
        )

    def get(self):
        """Return ``(src, seg, label, mask_src)``."""
        return self.src, self.seg, self.label, self.mask_src


class Model(nn.Module):
    """BERT encoder followed by a per-token linear classifier for NER."""

    def __init__(self, args, device):
        """Build the encoder named by ``args.bert_model`` and the tag classifier.

        Args:
            args: namespace providing ``bert_model`` and ``vocab_label_size``.
            device: stored on the instance; the caller moves the module itself.
        """
        super(Model, self).__init__()
        self.args = args
        self.device = device
        self.bert = BertModel.from_pretrained(model_dict[args.bert_model], return_dict=False)
        self.linear = nn.Linear(self.bert.config.hidden_size, args.vocab_label_size)
        self.dropout = nn.Dropout(0.2)
        # Positions labelled with vocab_label_size are excluded from the loss.
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=args.vocab_label_size, reduction='sum')

    def forward(self, src, seg, mask_src):
        """Return per-token logits with one score per label for every position.

        The batch axis is always kept. The previous ``.squeeze()`` removed it
        for a batch of one, which made ``predict`` mistake the sequence axis
        for the batch axis.
        """
        top_vec, _ = self.bert(input_ids=src, token_type_ids=seg, attention_mask=mask_src)
        top_vec = self.dropout(top_vec)
        # Zero the hidden states of masked (padding) positions.
        top_vec = top_vec * mask_src.unsqueeze(dim=-1).float()
        return self.linear(top_vec)

    def get_loss(self, src, seg, label, mask_src):
        """Summed cross-entropy over all positions whose label is not the ignore id."""
        output = self.forward(src, seg, mask_src)
        return self.loss(output.reshape(-1, self.args.vocab_label_size), label.reshape(-1))

    def predict(self, src, seg, mask_src):
        """Return the arg-max label id per position as a nested Python list.

        The result has one inner list per example in the batch.
        """
        output = self.forward(src, seg, mask_src)
        return torch.argmax(output, dim=-1).detach().cpu().numpy().tolist()


def align (preds, golds, args):
    """Drop ignored positions and map label ids to label strings.

    A position is ignored when its gold id equals ``args.vocab_label_size``.
    For every remaining position the gold id and the prediction at the same
    index are looked up in ``args.id2label``.

    Returns:
        ``(new_preds, new_golds)``, two parallel lists of tag-string lists.
    """
    ignore_id = args.vocab_label_size
    to_tag = args.id2label
    new_preds, new_golds = [], []
    for row, gold in enumerate(golds):
        kept = [col for col in range(len(gold)) if gold[col] != ignore_id]
        new_golds.append([to_tag[gold[col]] for col in kept])
        new_preds.append([to_tag[preds[row][col]] for col in kept])
    return new_preds, new_golds

def prediction(dataset, model, args):
    """Run ``model`` over ``dataset`` and score it against the dataset labels.

    Args:
        dataset: list of ``(token_ids, segment_ids, labels)`` triples.
        model: object exposing ``eval()`` and ``predict(src, seg, mask_src)``.
        args: namespace providing ``batch_size``, ``device``,
            ``vocab_label_size`` and ``id2label``.

    Returns:
        ``(f1, preds)``: the seqeval F1 score and, per example, the predicted
        tag strings at every position whose dataset label is not the ignore id.

    Note:
        The model is switched to eval mode and left there.
    """
    preds = []
    golds = []
    model.eval()
    # Pure inference: do not record an autograd graph for these forward passes.
    with torch.no_grad():
        for j in range(0, len(dataset), args.batch_size):
            src, seg, label, mask_src = Batch(dataset, j, args.batch_size, args.device).get()
            preds += model.predict(src, seg, mask_src)
            golds += label.cpu().numpy().tolist()
    # align() indexes the nested lists directly, so no ndarray conversion is needed.
    preds, golds = align(preds, golds, args)
    return f1_score(golds, preds), preds

def train(args, train_dataset, dev_dataset, test_formal_dataset, test_informal_dataset, model, id2label):
    """Fine-tune ``model`` with early stopping on the dev-set F1.

    Args:
        args: namespace providing ``batch_size``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``max_grad_norm``, ``patience``, ``n_gpu``, ``seed`` and ``device``.
            ``args.warmup_steps`` is overwritten with 10% of the total steps.
        train_dataset: list of pre-processed examples; shuffled IN PLACE each epoch.
        dev_dataset: examples used for model selection.
        test_formal_dataset, test_informal_dataset: evaluated whenever the dev
            F1 improves.
        model: the network to train.
        id2label: unused here; kept for call compatibility.

    Returns:
        ``(global_step, tr_loss / global_step, best_f1_dev)``. ``global_step``
        starts at 1, so it is one more than the number of optimizer steps taken.

    Side effects:
        Writes ``PATH + "nerbertweet.pt"`` every time the dev F1 improves, so
        the file always holds the best-scoring weights.
    """
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    # One optimizer step is taken per batch, including the final partial batch,
    # so the per-epoch step count is rounded up. Rounding down made the
    # schedule hit zero before training finished.
    steps_per_epoch = (len(train_dataset) + args.batch_size - 1) // args.batch_size
    t_total = steps_per_epoch * args.num_train_epochs
    args.warmup_steps = int(0.1 * t_total)
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Warming up = %d", args.warmup_steps)
    logger.info("  Patience  = %d", args.patience)

    # Added here for reproducibility
    set_seed(args)
    tr_loss = 0.0
    global_step = 1
    best_f1_dev = 0; test_formal_f1 = 0; test_informal_f1 = 0
    cur_patience = 0
    for i in tqdm(range(int(args.num_train_epochs))):
        random.shuffle(train_dataset)
        epoch_loss = 0.0
        epoch_steps = 0
        for j in range(0, len(train_dataset), args.batch_size):
            src, seg, label, mask_src = Batch(train_dataset, j, args.batch_size, args.device).get()
            # prediction() leaves the model in eval mode, so re-enable training mode.
            model.train()
            loss = model.get_loss(src, seg, label, mask_src)
            loss = loss.sum()/args.batch_size
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            loss.backward()

            tr_loss += loss.item()
            epoch_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
            epoch_steps += 1
        # Average over the batches of THIS epoch, not the cumulative step count.
        logger.info("Finish epoch = %s, loss_epoch = %s", i+1, epoch_loss/max(epoch_steps, 1))
        dev_f1, _ = prediction(dev_dataset, model, args)
        if dev_f1 > best_f1_dev:
            best_f1_dev = dev_f1
            test_formal_f1, _ = prediction(test_formal_dataset, model, args)
            test_informal_f1, _ = prediction(test_informal_dataset, model, args)
            cur_patience = 0
            logger.info("Better, BEST F1 in DEV = %s, F1 in FORMAL_TEST = %s, F1 in INFORMAL_TEST = %s", best_f1_dev, test_formal_f1, test_informal_f1)
            torch.save(model.state_dict(), PATH+"nerbertweet.pt")
        else:
            cur_patience += 1
            if cur_patience == args.patience:
                logger.info("Early Stopping Not Better, BEST F1 in DEV = %s, F1 in FORMAL_TEST = %s, F1 in INFORMAL_TEST  = %s", best_f1_dev, test_formal_f1, test_informal_f1)
                # Do NOT save here: the current weights are worse than the
                # checkpoint written at the best epoch and would overwrite it.
                break
            else:
                logger.info("Not Better, BEST F1 in DEV = %s, F1 in FORMAL_TEST = %s, F1 in INFORMAL_TEST  = %s", best_f1_dev, test_formal_f1, test_informal_f1)

    return global_step, tr_loss / global_step, best_f1_dev


args_parser = argparse.ArgumentParser()
args_parser.add_argument('--bert_model', default='indobertweet', choices=['indobert', 'indobertweet'], help='select one of models')
args_parser.add_argument('--data_path', default='data/', help='path to all train/test/dev')
args_parser.add_argument('--max_token', type=int, default=256, help='maximum token allowed for 1 instance')
args_parser.add_argument('--batch_size', type=int, default=30, help='batch size')
args_parser.add_argument('--learning_rate', type=float, default=5e-5, help='learning rate')
args_parser.add_argument('--weight_decay', type=int, default=0, help='weight decay')
args_parser.add_argument('--adam_epsilon', type=float, default=1e-8, help='adam epsilon')
args_parser.add_argument('--max_grad_norm', type=float, default=1.0)
args_parser.add_argument('--num_train_epochs', type=int, default=20, help='total epoch')
args_parser.add_argument('--warmup_steps', type=int, default=242, help='warmup_steps, the default value is 10% of total steps')
args_parser.add_argument('--logging_steps', type=int, default=200, help='report stats every certain steps')
args_parser.add_argument('--seed', type=int, default=2020)
args_parser.add_argument('--local_rank', type=int, default=-1)
args_parser.add_argument('--patience', type=int, default=5, help='patience for early stopping')
args_parser.add_argument('--no_cuda', default=False)
args = args_parser.parse_args()


# Setup CUDA, GPU & distributed training.
# This rebinds `device` and stores it on `args` together with the GPU count.
if args.local_rank == -1 or args.no_cuda:
    # Single-process mode: use CUDA when available and not disabled.
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
args.device = device

# INFO-level logging for the single/first process (local_rank -1 or 0), WARN otherwise.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
)

set_seed(args)

# Load pretrained model and tokenizer
# NOTE(review): nothing is loaded between the two barrier blocks below; the
# tokenizer and model are created later in the notebook. Confirm the barriers
# are still placed where intended.
if args.local_rank not in [-1, 0]:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

if args.local_rank == 0:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

# The annotation procedure does not allow a true BIO scheme, so entity tags
# are simply prefixed with "B-" to make them acceptable to seqeval.
def standardize(tag):
    """Return ``'B-' + tag`` for the three entity tags, any other tag unchanged."""
    entity_tags = ('PERSON', 'ORGANIZATION', 'LOCATION')
    return 'B-' + tag if tag in entity_tags else tag

def read_data(fname):
    """Read a ``word/TAG word/TAG ...`` file, one sentence per line.

    Each space-separated item is split on '/'; the last piece is the tag
    (passed through ``standardize``) and everything before it is the word, so
    words may themselves contain '/'. The words of a line are then normalised
    with ``preprocess_tweet``.

    Args:
        fname: path of the annotated text file (read as UTF-8).

    Returns:
        ``(data, label)``: a list of word lists and a parallel list of tag lists.
    """
    # Context manager closes the handle; explicit encoding avoids depending on
    # the platform default for text that may contain emoji.
    with open(fname, encoding='utf-8') as handle:
        lines = [x.strip() for x in handle]
    data = []; label = []
    for line in lines:
        words = []; tags = []
        for pair in line.split(' '):
            items = pair.split('/')
            words.append(str('/'.join(items[:-1])))
            tags.append(standardize(items[-1]))
        # NOTE(review): re-splitting after normalisation drops empty words, so
        # a line containing an empty word ends up with fewer words than tags.
        words = preprocess_tweet(' '.join(words)).split()
        data.append(words)
        label.append(tags)
    return data, label

def create_vocab(labels):
    """Build the label vocabulary from a list of tag lists.

    The distinct tags are taken in the order returned by ``np.unique`` and
    numbered from 0.

    Returns:
        ``(label2id, id2label)``: tag -> id and id -> tag dictionaries.
    """
    flat = [tag for sentence in labels for tag in sentence]
    unique = np.unique(flat)
    label2id = {word: position for position, word in enumerate(unique)}
    id2label = {position: word for position, word in enumerate(unique)}
    return label2id, id2label

def convert_label2id(label2id, labels):
    """Map every tag in ``labels`` (a list of tag lists) to its id in ``label2id``."""
    return [[label2id[tag] for tag in sentence] for sentence in labels]


# Load the annotated NER corpus: train/dev plus a formal and an informal test set.
xtrain, ytrain = read_data(args.data_path+'train.txt')
xdev, ydev = read_data(args.data_path+'dev.txt')
xtest_formal, ytest_formal = read_data(args.data_path+'test_formal.txt')
xtest_informal, ytest_informal = read_data(args.data_path+'test_informal.txt')

# The label vocabulary comes from the training tags only. Its size fixes the
# width of the model's output layer and doubles as the "ignore" label id.
label2id, id2label = create_vocab (ytrain)
args.vocab_label_size = len(label2id)
args.label2id = label2id
args.id2label = id2label

# Replace tag strings by their ids in every split.
ytrain =  convert_label2id (label2id, ytrain)
ydev =  convert_label2id (label2id, ydev)
ytest_formal =  convert_label2id (label2id, ytest_formal)
ytest_informal =  convert_label2id (label2id, ytest_informal)

# Tokenizer wrapper reused by every inference cell further down.
bertdata = BertData(args)

# train_dataset = bertdata.preprocess(xtrain, ytrain)
# dev_dataset = bertdata.preprocess(xdev, ydev)
# test_formal_dataset = bertdata.preprocess(xtest_formal, ytest_formal)
# test_informal_dataset = bertdata.preprocess(xtest_informal, ytest_informal)
    
# model = Model(args, device)
# model.to(args.device)
# global_step, tr_loss, best_f1_dev= train(args, train_dataset, dev_dataset, test_formal_dataset, test_informal_dataset, model, id2label)

# print('Dev set F1', best_f1_dev)


### Download the pretrained NER checkpoint

In [None]:
import gdown 
# Local file name of the fine-tuned NER weights; downloaded only when missing.
pretrained_NER_path = 'nerbertweet.pt'
if not os.path.exists(pretrained_NER_path):
    drive_id = "1-1JJPOqmQ6ydxv-1vZ_mv911Ftxl71xQ"
    # Write to exactly the path that is checked above and loaded later,
    # instead of whatever file name the remote side reports.
    gdown.download(id=drive_id, output=pretrained_NER_path, quiet=False, fuzzy=True)

## Tweets data

In [48]:
# Folder (relative path, with trailing slash) holding the input CSV files.
folderPATH ='original_data/'

In [49]:
# User-level train/test/validation splits.
# NOTE(review): these three frames are not referenced again in the visible
# part of the notebook.
userlvltrain = pd.read_csv(folderPATH+'userlvltrain.csv')
userlvltest = pd.read_csv(folderPATH+'userlvltest.csv')
userlvlval = pd.read_csv(folderPATH+'userlvlval.csv')

In [51]:
# Tweet-level train/test/validation splits. Rows without tweet text are
# removed in place because the NER step needs a string in "text".
train_df = pd.read_csv(folderPATH+'train_df.csv')
train_df.dropna(subset = ["text"], inplace=True)
test_df = pd.read_csv(folderPATH+'test_df.csv')
test_df.dropna(subset = ["text"], inplace=True)
val_df = pd.read_csv(folderPATH+'val_df.csv')
val_df.dropna(subset = ["text"], inplace=True)

## Apply pretrained NER to data

In [52]:
# Rebuild the architecture, then load the fine-tuned weights into it.
myNERmodel = Model(args, device)

myNERmodel.to(args.device)

# map_location remaps the stored tensors onto the device in use, so the
# checkpoint also loads on a machine whose GPU layout differs from the one
# it was saved on (or on CPU).
myNERmodel.load_state_dict(torch.load(pretrained_NER_path, map_location=args.device))

myNERmodel.eval()

# Inference only from here on: switch autograd off globally.
torch.set_grad_enabled(False)

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<torch.autograd.grad_mode.set_grad_enabled at 0x7f726d46c760>

In [53]:
## Return, for every row, the words that the NER model tagged as a location.
def NER_text_to_location_features(text, NERprediction):
  """Collect the words tagged 'B-LOCATION' in each row.

  Args:
    text: list of word lists, one per row.
    NERprediction: pair as returned by ``prediction``; element 1 holds the
      predicted tag strings per row.

  Returns:
    List (one entry per row) of the words whose tag is 'B-LOCATION'.
  """
  data = []
  for words, tags in zip(text, NERprediction[1]):
    # zip stops at the shorter sequence: a row truncated to the model's
    # maximum length has fewer tags than words, which used to raise IndexError.
    data.append([word for word, tag in zip(words, tags) if tag == 'B-LOCATION'])
  return data

def NER_text_to_features(text, NERprediction):
  """Collect the words tagged as any entity (location, person, organization) in each row.

  Args:
    text: list of word lists, one per row.
    NERprediction: pair as returned by ``prediction``; element 1 holds the
      predicted tag strings per row.

  Returns:
    List (one entry per row) of the words whose tag is 'B-LOCATION',
    'B-PERSON' or 'B-ORGANIZATION'.
  """
  entity_tags = ('B-LOCATION', 'B-PERSON', 'B-ORGANIZATION')
  data = []
  for words, tags in zip(text, NERprediction[1]):
    # zip stops at the shorter sequence: a row truncated to the model's
    # maximum length has fewer tags than words, which used to raise IndexError.
    data.append([word for word, tag in zip(words, tags) if tag in entity_tags])
  return data

## Build model inputs (words) and placeholder labels for running the NER model on raw text.
def create_xy_for_NER(text):
  """Split each string into words and create one placeholder label per word.

  The texts have no gold annotation; the labels only reserve one labelled
  position per word for the downstream pre-processing. Label id 0 is used
  because it is valid for every label vocabulary, whereas a random id could
  coincide with the ignore id and silently drop words.

  Args:
    text: iterable of strings.

  Returns:
    ``(words, labels)``: a list of non-empty space-separated words per string
    and a parallel list of ``0`` labels.
  """
  xtrain_text_preprocessed = [[word for word in t.split(" ") if word] for t in text]
  ytrain_text_dummy = [[0] * len(words) for words in xtrain_text_preprocessed]
  return xtrain_text_preprocessed, ytrain_text_dummy

### only tweets

In [55]:
# Split every tweet into words and attach placeholder labels (one per word).
xtrain_text_preprocessed, ytrain_text_dummy = create_xy_for_NER(train_df['text'])
xtest_text_preprocessed, ytest_text_dummy = create_xy_for_NER(test_df['text']) 
xval_text_preprocessed, yval_text_dummy = create_xy_for_NER(val_df['text'])

In [56]:
# Encode words and placeholder labels as fixed-length BERT inputs.
train_text_dataset = bertdata.preprocess(xtrain_text_preprocessed, ytrain_text_dummy)
test_text_dataset = bertdata.preprocess(xtest_text_preprocessed, ytest_text_dummy)
val_text_dataset = bertdata.preprocess(xval_text_preprocessed, yval_text_dummy)

In [57]:
# Each result is an (f1, tags) pair. The F1 is scored against the placeholder
# labels and is not meaningful; only the predicted tags (element 1) are used below.
trainNER = prediction(train_text_dataset, myNERmodel, args)
testNER = prediction(test_text_dataset, myNERmodel, args)
valNER = prediction(val_text_dataset, myNERmodel, args)

In [58]:
# Words tagged as locations in each tweet.
locXtrain = NER_text_to_location_features(xtrain_text_preprocessed, trainNER)
locXtest = NER_text_to_location_features(xtest_text_preprocessed, testNER)
locXval = NER_text_to_location_features(xval_text_preprocessed, valNER)

# Words tagged as any entity type (location, person, organization) in each tweet.
nerXtrain =  NER_text_to_features(xtrain_text_preprocessed, trainNER)
nerXtest = NER_text_to_features(xtest_text_preprocessed, testNER)
nerXval = NER_text_to_features(xval_text_preprocessed, valNER)

In [82]:
# Store the per-tweet word lists as new columns (one list per row).
train_df['mentionedLocationTweets'] = locXtrain
test_df['mentionedLocationTweets'] = locXtest
val_df['mentionedLocationTweets'] = locXval

train_df['mentionedEntityTweets'] = nerXtrain
test_df['mentionedEntityTweets'] = nerXtest
val_df['mentionedEntityTweets'] = nerXval

### only username

In [61]:
# Display names as strings. Missing values become '' first: astype(str) alone
# would turn NaN into the literal word "nan", which would then be fed to the
# NER model as if it were text.
train_df_username_str = train_df['display_name'].fillna('').astype(str)
test_df_username_str = test_df['display_name'].fillna('').astype(str)
val_df_username_str = val_df['display_name'].fillna('').astype(str)

In [62]:
# Split every display name into words and attach placeholder labels.
xtrain_text_preprocessed_username, ytrain_text_dummy_username = create_xy_for_NER(train_df_username_str)
xtest_text_preprocessed_username, ytest_text_dummy_username = create_xy_for_NER(test_df_username_str) 
xval_text_preprocessed_username, yval_text_dummy_username = create_xy_for_NER(val_df_username_str)

In [64]:
# Encode display-name words and placeholder labels as fixed-length BERT inputs.
train_text_dataset_username = bertdata.preprocess(xtrain_text_preprocessed_username, ytrain_text_dummy_username)
test_text_dataset_username = bertdata.preprocess(xtest_text_preprocessed_username, ytest_text_dummy_username)
val_text_dataset_username = bertdata.preprocess(xval_text_preprocessed_username, yval_text_dummy_username)

In [65]:
# (f1, tags) pairs for the display names; only the tags (element 1) are used below.
trainNER_username = prediction(train_text_dataset_username, myNERmodel, args)
testNER_username = prediction(test_text_dataset_username, myNERmodel, args)
valNER_username = prediction(val_text_dataset_username, myNERmodel, args)

In [66]:
# Words tagged as locations in each display name.
locXtrain_username = NER_text_to_location_features(xtrain_text_preprocessed_username, trainNER_username)
locXtest_username = NER_text_to_location_features(xtest_text_preprocessed_username, testNER_username)
locXval_username = NER_text_to_location_features(xval_text_preprocessed_username, valNER_username)

# Words tagged as any entity type in each display name.
nerXtrain_username =  NER_text_to_features(xtrain_text_preprocessed_username, trainNER_username)
nerXtest_username = NER_text_to_features(xtest_text_preprocessed_username, testNER_username)
nerXval_username = NER_text_to_features(xval_text_preprocessed_username, valNER_username)

In [83]:
# Store the per-row display-name word lists as new columns.
train_df['mentionedLocationDisplayname'] = locXtrain_username
test_df['mentionedLocationDisplayname'] = locXtest_username
val_df['mentionedLocationDisplayname'] = locXval_username

train_df['mentionedEntityDisplayname'] = nerXtrain_username
test_df['mentionedEntityDisplayname'] = nerXtest_username
val_df['mentionedEntityDisplayname'] = nerXval_username

### only description

In [90]:
# train_df.description = train_df['description'].fillna('')

In [88]:
# Profile descriptions as strings. Missing values become '' first: astype(str)
# alone would turn NaN into the literal word "nan", which would then be fed to
# the NER model as if it were text.
train_df_userdesc_str = train_df['description'].fillna('').astype(str)
test_df_userdesc_str = test_df['description'].fillna('').astype(str)
val_df_userdesc_str = val_df['description'].fillna('').astype(str)

In [91]:
# Split every profile description into words and attach placeholder labels.
xtrain_text_preprocessed_userdesc, ytrain_text_dummy_userdesc = create_xy_for_NER(train_df_userdesc_str)
xtest_text_preprocessed_userdesc, ytest_text_dummy_userdesc = create_xy_for_NER(test_df_userdesc_str) 
xval_text_preprocessed_userdesc, yval_text_dummy_userdesc = create_xy_for_NER(val_df_userdesc_str)

In [92]:
# Encode description words and placeholder labels as fixed-length BERT inputs.
train_text_dataset_userdesc = bertdata.preprocess(xtrain_text_preprocessed_userdesc, ytrain_text_dummy_userdesc)
test_text_dataset_userdesc = bertdata.preprocess(xtest_text_preprocessed_userdesc, ytest_text_dummy_userdesc)
val_text_dataset_userdesc = bertdata.preprocess(xval_text_preprocessed_userdesc, yval_text_dummy_userdesc)

In [93]:
# (f1, tags) pairs for the descriptions; only the tags (element 1) are used below.
trainNER_userdesc = prediction(train_text_dataset_userdesc, myNERmodel, args)
testNER_userdesc = prediction(test_text_dataset_userdesc, myNERmodel, args)
valNER_userdesc = prediction(val_text_dataset_userdesc, myNERmodel, args)

In [94]:
# Words tagged as locations in each profile description.
locXtrain_userdesc = NER_text_to_location_features(xtrain_text_preprocessed_userdesc, trainNER_userdesc)
locXtest_userdesc = NER_text_to_location_features(xtest_text_preprocessed_userdesc, testNER_userdesc)
locXval_userdesc = NER_text_to_location_features(xval_text_preprocessed_userdesc, valNER_userdesc)

# Words tagged as any entity type in each profile description.
nerXtrain_userdesc =  NER_text_to_features(xtrain_text_preprocessed_userdesc, trainNER_userdesc)
nerXtest_userdesc = NER_text_to_features(xtest_text_preprocessed_userdesc, testNER_userdesc)
nerXval_userdesc = NER_text_to_features(xval_text_preprocessed_userdesc, valNER_userdesc)

In [95]:
# Store the per-row description word lists as new columns.
train_df['mentionedLocationUserdesc'] = locXtrain_userdesc
test_df['mentionedLocationUserdesc'] = locXtest_userdesc
val_df['mentionedLocationUserdesc'] = locXval_userdesc

train_df['mentionedEntityUserdesc'] = nerXtrain_userdesc
test_df['mentionedEntityUserdesc'] = nerXtest_userdesc
val_df['mentionedEntityUserdesc'] = nerXval_userdesc

In [96]:
# Display the enriched training frame, including the six new entity columns.
train_df

Unnamed: 0,Lvalue,isaPerson,screen_name,userLocation,userLocationCode,tweet_id,created_at,author_id,username,display_name,...,pl_country,pl_place_type,pl_geo_type,pl_geo_bbox,mentionedLocationTweets,mentionedEntityTweets,mentionedLocationDisplayname,mentionedEntityDisplayname,mentionedLocationUserdesc,mentionedEntityUserdesc
0,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525235083960321,2021-12-30 12:07:03+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,[],"[Mandiri, Sekuritas.]",[],"[mandiri, sekuritas]",[],[]
1,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525229258133504,2021-12-30 12:07:01+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,[],[],[],"[mandiri, sekuritas]",[],[]
2,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525223247704068,2021-12-30 12:07:00+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,[],[],[],"[mandiri, sekuritas]",[],[]
3,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525216951988230,2021-12-30 12:06:58+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,[],[],[],"[mandiri, sekuritas]",[],[]
4,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525210606063620,2021-12-30 12:06:57+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,[],"[Mandiri, Sekuritas]",[],"[mandiri, sekuritas]",[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85329,2,person,nurshe,jawa barat dan banten,JB&BT,1460848401692565509,2021-11-17 05:52:54+00:00,99895482,nurshe,hikari minori,...,,,,,[],[],[],[],[],[]
85330,2,person,nurshe,jawa barat dan banten,JB&BT,1460848330645245957,2021-11-17 05:52:37+00:00,99895482,nurshe,hikari minori,...,,,,,[],[],[],[],[],[]
85331,2,person,nurshe,jawa barat dan banten,JB&BT,1460846614671618049,2021-11-17 05:45:48+00:00,99895482,nurshe,hikari minori,...,,,,,[],[],[],[],[],[]
85332,2,person,nurshe,jawa barat dan banten,JB&BT,1460846186408005639,2021-11-17 05:44:06+00:00,99895482,nurshe,hikari minori,...,,,,,[],[nova],[],[],[],[]


In [104]:
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('DataWithNER', exist_ok=True)
train_df.to_csv('DataWithNER/train_text.csv')
test_df.to_csv('DataWithNER/test_text.csv')
val_df.to_csv('DataWithNER/val_text.csv')

In [107]:
import pandas as pd
from ast import literal_eval

Entitypath = 'DataWithNER'

# Columns that were written to CSV as Python list literals and must be parsed
# back into lists when the files are read.
ENTITY_COLUMNS = [
    'mentionedLocationTweets',
    'mentionedEntityTweets',
    'mentionedLocationDisplayname',
    'mentionedEntityDisplayname',
    'mentionedLocationUserdesc',
    'mentionedEntityUserdesc',
]


def load_entity_frame(csv_path):
    """Read one saved split and flatten its entity columns to strings.

    Every column in ``ENTITY_COLUMNS`` is parsed with ``literal_eval`` and its
    items are then joined with single spaces, so an empty list becomes ''.

    Args:
        csv_path: path of a CSV written by the saving cell.

    Returns:
        The DataFrame, indexed by the first CSV column.
    """
    frame = pd.read_csv(csv_path,
                        converters={column: literal_eval for column in ENTITY_COLUMNS},
                        index_col=[0])
    for column in ENTITY_COLUMNS:
        frame[column] = frame[column].apply(lambda x: ' '.join(map(str, x)))
    return frame


train = load_entity_frame(Entitypath+'/train_text.csv')
test = load_entity_frame(Entitypath+'/test_text.csv')
val = load_entity_frame(Entitypath+'/val_text.csv')


train.head(4)

Unnamed: 0,Lvalue,isaPerson,screen_name,userLocation,userLocationCode,tweet_id,created_at,author_id,username,display_name,...,pl_country,pl_place_type,pl_geo_type,pl_geo_bbox,mentionedLocationTweets,mentionedEntityTweets,mentionedLocationDisplayname,mentionedEntityDisplayname,mentionedLocationUserdesc,mentionedEntityUserdesc
0,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525235083960321,2021-12-30 12:07:03+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,Mandiri Sekuritas.,,mandiri sekuritas,,
1,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525229258133504,2021-12-30 12:07:01+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,
2,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525223247704068,2021-12-30 12:07:00+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,
3,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525216951988230,2021-12-30 12:06:58+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,


In [108]:
# Display the reloaded training frame; the entity columns now hold space-joined strings.
train

Unnamed: 0,Lvalue,isaPerson,screen_name,userLocation,userLocationCode,tweet_id,created_at,author_id,username,display_name,...,pl_country,pl_place_type,pl_geo_type,pl_geo_bbox,mentionedLocationTweets,mentionedEntityTweets,mentionedLocationDisplayname,mentionedEntityDisplayname,mentionedLocationUserdesc,mentionedEntityUserdesc
0,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525235083960321,2021-12-30 12:07:03+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,Mandiri Sekuritas.,,mandiri sekuritas,,
1,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525229258133504,2021-12-30 12:07:01+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,
2,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525223247704068,2021-12-30 12:07:00+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,
3,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525216951988230,2021-12-30 12:06:58+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,,,mandiri sekuritas,,
4,1,nonperson,Mandiri_OLT,jabodetabek,JBD,1476525210606063620,2021-12-30 12:06:57+00:00,148198201,Mandiri_OLT,mandiri sekuritas,...,,,,,,Mandiri Sekuritas,,mandiri sekuritas,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85329,2,person,nurshe,jawa barat dan banten,JB&BT,1460848401692565509,2021-11-17 05:52:54+00:00,99895482,nurshe,hikari minori,...,,,,,,,,,,
85330,2,person,nurshe,jawa barat dan banten,JB&BT,1460848330645245957,2021-11-17 05:52:37+00:00,99895482,nurshe,hikari minori,...,,,,,,,,,,
85331,2,person,nurshe,jawa barat dan banten,JB&BT,1460846614671618049,2021-11-17 05:45:48+00:00,99895482,nurshe,hikari minori,...,,,,,,,,,,
85332,2,person,nurshe,jawa barat dan banten,JB&BT,1460846186408005639,2021-11-17 05:44:06+00:00,99895482,nurshe,hikari minori,...,,,,,,nova,,,,
