In [1]:
!pip install -U spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
df_headline = pd.read_csv('task1_headline_ABSA_train.csv')

In [4]:
df_headline.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,1,Royal Mail chairman Donald Brydon set to step ...,['set to step down'],Royal Mail,-0.374,['Corporate/Appointment']
1,7,Stakes High for AstraZeneca Heart Drug Facing ...,['Facing Tough Competition'],AstraZeneca,-0.24,['Corporate/Risks']
2,8,UPDATE 1-Dairy Crest loses a third of Morrison...,['Crest loses a third of Morrisons milk contra...,Morrisons,-0.161,['Corporate/Sales/Failed Contract Discussion']
3,22,Insight hires Aviva's David Hillier for multi-...,['hires Aviva's David Hillier for multi-asset ...,Insight,0.137,['Corporate/Appointment/Executive Appointment']
4,30,Primark racks up a happy Christmas after stron...,['after strong sales'],Primark,0.704,['Corporate/Sales']


In [5]:
df_post = pd.read_csv('task1_post_ABSA_train.csv')

In [6]:
df_post.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,14860,Slowly adding some $FIO here but gotta be care...,['Slowly adding some $FIO here but gotta be ca...,0.459,FIO,['Stock/Price Action/Bullish/Bull Position']
1,14864,$TRX http://stks.co/1KkK Long setup. MACD cross.,['Long setup. MACD cross.'],0.438,TRX,['Stock/Technical Analysis']
2,14867,I am not optimistic about $amzn both fundement...,['both fundementals and charts look like poopo...,-0.506,AMZN,['Stock/Price Action/Bearish']
3,14875,$GRPN might be selling off ahead of $P earning...,['might be selling off ahead'],-0.202,P,['Stock/Price Action/Bearish/Bearish Behavior']
4,14876,$IACI http://stks.co/tJU Looks good on the wee...,['Looks good on the weekly chart.'],0.379,IACI,['Stock/Technical Analysis']


In [7]:
df_headline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   |                     498 non-null    int64  
 1   sentence              498 non-null    object 
 2   info_snippets         498 non-null    object 
 3   info_target           498 non-null    object 
 4   info_sentiment_score  498 non-null    float64
 5   info_aspects          498 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 23.5+ KB


In [8]:
df_headline.describe()

Unnamed: 0,|,info_sentiment_score
count,498.0,498.0
mean,836.34739,0.093129
std,518.625844,0.375713
min,1.0,-0.938
25%,375.75,-0.18475
50%,839.0,0.189
75%,1288.75,0.36475
max,1779.0,0.975


In [9]:
df_headline.shape,df_post.shape

((498, 6), (675, 6))

In [10]:
df_headline['info_sentiment_score'].value_counts(normalize = True)

 0.000    0.026104
-0.158    0.006024
 0.284    0.006024
 0.143    0.006024
 0.214    0.006024
            ...   
 0.076    0.002008
-0.530    0.002008
 0.433    0.002008
-0.364    0.002008
 0.435    0.002008
Name: info_sentiment_score, Length: 407, dtype: float64

In [11]:
# remove URL's from train and test
df_headline['sentence'] = df_headline['sentence'].apply(lambda x: re.sub(r'http\S+', '', x))
df_headline['info_snippets'] = df_headline['info_snippets'].apply(lambda x: re.sub(r'http\S+', '', x))

df_post['sentence'] = df_post['sentence'].apply(lambda x: re.sub(r'http\S+', '', x))
df_post['info_snippets'] = df_post['info_snippets'].apply(lambda x: re.sub(r'http\S+', '', x))

In [12]:
df_headline.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,1,Royal Mail chairman Donald Brydon set to step ...,['set to step down'],Royal Mail,-0.374,['Corporate/Appointment']
1,7,Stakes High for AstraZeneca Heart Drug Facing ...,['Facing Tough Competition'],AstraZeneca,-0.24,['Corporate/Risks']
2,8,UPDATE 1-Dairy Crest loses a third of Morrison...,['Crest loses a third of Morrisons milk contra...,Morrisons,-0.161,['Corporate/Sales/Failed Contract Discussion']
3,22,Insight hires Aviva's David Hillier for multi-...,['hires Aviva's David Hillier for multi-asset ...,Insight,0.137,['Corporate/Appointment/Executive Appointment']
4,30,Primark racks up a happy Christmas after stron...,['after strong sales'],Primark,0.704,['Corporate/Sales']


In [13]:
# remove punctuation marks
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_.`{|}~'

df_headline['sentence'] = df_headline['sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
df_post['sentence'] = df_post['sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

df_headline['info_snippets'] = df_headline['info_snippets'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
df_post['info_snippets'] = df_post['info_snippets'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

# convert text to lowercase
df_headline['sentence'] = df_headline['sentence'].str.lower()
df_post['sentence'] = df_post['sentence'].str.lower()

df_headline['info_snippets'] = df_headline['info_snippets'].str.lower()
df_post['info_snippets'] = df_post['info_snippets'].str.lower()
# remove numbers
df_headline['sentence'] = df_headline['sentence'].str.replace("[0-9]", " ")
df_post['sentence'] = df_post['sentence'].str.replace("[0-9]", " ")

df_headline['info_snippets'] = df_headline['info_snippets'].str.replace("[0-9]", " ")
df_post['info_snippets'] = df_post['info_snippets'].str.replace("[0-9]", " ")
# remove whitespaces
df_headline['sentence'] = df_headline['sentence'].apply(lambda x:' '.join(x.split()))
df_post['sentence'] = df_post['sentence'].apply(lambda x: ' '.join(x.split()))

df_headline['info_snippets'] = df_headline['info_snippets'].apply(lambda x:' '.join(x.split()))
df_post['info_snippets'] = df_post['info_snippets'].apply(lambda x: ' '.join(x.split()))



In [14]:
nlp = spacy.load("en_core_web_sm")

In [15]:
# import spaCy's language model
#nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

In [16]:
df_headline['sentence']  = lemmatization(df_headline['sentence'] )
df_post['sentence'] = lemmatization(df_post['sentence'] )

df_headline['info_snippets']  = lemmatization(df_headline['info_snippets'] )
df_post['info_snippets'] = lemmatization(df_post['info_snippets'] )

In [17]:
df_headline['info_sentiment_score'][df_headline['info_sentiment_score'] < 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_headline['info_sentiment_score'][df_headline['info_sentiment_score'] > 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
df_headline.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,1,royal mail chairman donald brydon set to step ...,' set to step down ',Royal Mail,0.0,['Corporate/Appointment']
1,7,stake high for astrazeneca heart drug face tou...,' face tough competition ',AstraZeneca,0.0,['Corporate/Risks']
2,8,update dairy crest lose a third of morrison mi...,' crest lose a third of morrison milk contract ',Morrisons,0.0,['Corporate/Sales/Failed Contract Discussion']
3,22,insight hire aviva 's david hillier for multia...,' hire aviva 's david hillier for multiasset t...,Insight,1.0,['Corporate/Appointment/Executive Appointment']
4,30,primark rack up a happy christma after strong ...,' after strong sale ',Primark,1.0,['Corporate/Sales']


In [20]:
df_headline.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   |                     498 non-null    int64  
 1   sentence              498 non-null    object 
 2   info_snippets         498 non-null    object 
 3   info_target           498 non-null    object 
 4   info_sentiment_score  498 non-null    float64
 5   info_aspects          498 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 23.5+ KB


In [21]:
df_headline[['info_sentiment_score']] = df_headline[['info_sentiment_score']].apply(pd.to_numeric) 

In [22]:
# using dictionary to convert specific columns 
convert_dict = {'info_sentiment_score': int
               } 
  
df_headline = df_headline.astype(convert_dict) 
print(df_headline.dtypes)

|                        int64
sentence                object
info_snippets           object
info_target             object
info_sentiment_score     int64
info_aspects            object
dtype: object


In [23]:
df_headline.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,1,royal mail chairman donald brydon set to step ...,' set to step down ',Royal Mail,0,['Corporate/Appointment']
1,7,stake high for astrazeneca heart drug face tou...,' face tough competition ',AstraZeneca,0,['Corporate/Risks']
2,8,update dairy crest lose a third of morrison mi...,' crest lose a third of morrison milk contract ',Morrisons,0,['Corporate/Sales/Failed Contract Discussion']
3,22,insight hire aviva 's david hillier for multia...,' hire aviva 's david hillier for multiasset t...,Insight,1,['Corporate/Appointment/Executive Appointment']
4,30,primark rack up a happy christma after strong ...,' after strong sale ',Primark,1,['Corporate/Sales']


In [24]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
df_headline['info_target']= label_encoder.fit_transform(df_headline['info_target']) 
  
df_headline['info_target'].unique()

array([171,  14, 142, 105, 161, 154, 199, 201, 212, 170, 187,  47, 203,
        68,  33, 150, 152, 204,  99,  95, 156,  24, 166, 139, 217,  36,
       131, 148,  15,   1,  54,  21, 133, 182, 209, 193, 194,  65, 140,
       164,  90, 123,  59,  72,  17, 169, 219,  31,  77,  76,  16,  56,
         8, 202, 177, 145,  13,  27, 197, 153, 190, 208, 163, 162,  41,
       120,  57, 144, 183, 186, 137, 138, 165, 207,  75, 192, 198,  62,
        82, 224, 216, 213,  85,  84, 103, 175, 189,  69, 151,  91,  20,
       159,  92,  45, 184, 106, 124,  28, 141, 118, 130,  81, 196,   5,
       185,  42,  10, 122, 191, 188, 128, 225, 115, 158,   7, 129, 206,
       195,  94,  35, 223, 147,  55, 211, 180,  19,  83, 167, 136,  79,
        70,  86,  38,   9, 104,  22,  40,   4, 121,  32,  12, 215,  37,
        48,  93,  98, 143,   6,   2, 100,  49, 174,  80, 146,  23, 134,
        39, 117,  73, 178, 222, 214, 155,  53,  64,  34, 116, 220,  52,
       102,  97,  58, 210,  96, 179,  29,  18, 176,  89, 108,  4

In [25]:
df_headline.head()

Unnamed: 0,|,sentence,info_snippets,info_target,info_sentiment_score,info_aspects
0,1,royal mail chairman donald brydon set to step ...,' set to step down ',171,0,['Corporate/Appointment']
1,7,stake high for astrazeneca heart drug face tou...,' face tough competition ',14,0,['Corporate/Risks']
2,8,update dairy crest lose a third of morrison mi...,' crest lose a third of morrison milk contract ',142,0,['Corporate/Sales/Failed Contract Discussion']
3,22,insight hire aviva 's david hillier for multia...,' hire aviva 's david hillier for multiasset t...,105,1,['Corporate/Appointment/Executive Appointment']
4,30,primark rack up a happy christma after strong ...,' after strong sale ',161,1,['Corporate/Sales']


In [26]:
!pip install tensorflow==1.13.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
import os
import collections
import tensorflow
print(tensorflow.__version__)
import tensorflow as tf
from datetime import datetime

1.13.2


In [28]:
#!pip install bert

In [29]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 76511521802174478
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 447115346426547023
physical_device_desc: "device: XLA_CPU device"
]


In [30]:
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

False

In [31]:
!pip install bert-tensorflow==1.0.1

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


구글의 BERT repository에 있는 파일을 다운로드 받으면 된다.

In [32]:
##use downloaded model, change path accordingly
# BERT_VOCAB= '/home/ashish/FiQA/uncased_L-12_H-768_A-12/vocab.txt'
# BERT_INIT_CHKPNT = '/home/ashish/FiQA/uncased_L-12_H-768_A-12/bert_model.ckpt'
# BERT_CONFIG = '/home/ashish/FiQA/uncased_L-12_H-768_A-12/bert_config.json'

BERT_VOCAB= 'vocab.txt'
BERT_INIT_CHKPNT = 'bert_model.ckpt'
BERT_CONFIG = 'bert_config.json'

In [33]:
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [34]:
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

In [35]:
ID = 'id'
DATA_COLUMN = 'sentence'
LABEL_COLUMNS = ['info_target','info_sentiment_score']

In [36]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            labels: (Optional) [string]. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids,
        self.is_real_example=is_real_example

In [37]:
df_headline.iloc[:,3:-1]

Unnamed: 0,info_target,info_sentiment_score
0,171,0
1,14,0
2,142,0
3,105,1
4,161,1
...,...,...
493,135,0
494,157,1
495,17,1
496,156,1


In [38]:
def create_examples(df, labels_available=True):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, row) in enumerate(df.values):
        guid = row[0]
        text_a = row[1]
        if labels_available:
            labels = row[3:-1]
        else:
            labels = [0,0]
        examples.append(
            InputExample(guid=guid, text_a=text_a, labels=labels))
    return examples

In [39]:
TRAIN_VAL_RATIO = 0.7
LEN = df_headline.shape[0]
SIZE_TRAIN = int(TRAIN_VAL_RATIO*LEN)

x_train = df_headline[:SIZE_TRAIN]
x_val = df_headline[SIZE_TRAIN:]

train_examples = create_examples(x_train)

In [40]:
def convert_examples_to_features(examples,  max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        print(example.text_a)
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        labels_ids = []
        for label in example.labels:
            labels_ids.append(int(label))

        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %s)" % (example.labels, labels_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=labels_ids))
    return features


In [41]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128

In [42]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500

In [43]:
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    battches could cause silent errors.
    """
    
    
def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_ids=0,
            is_real_example=False)

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    labels_ids = []
    for label in example.labels:
        labels_ids.append(int(label))


    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=labels_ids,
        is_real_example=True)
    return feature


def file_based_convert_examples_to_features(
        examples, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file."""

    writer = tf.python_io.TFRecordWriter(output_file)

    for (ex_index, example) in enumerate(examples):
        #if ex_index % 10000 == 0:
            #tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))

        feature = convert_single_example(ex_index, example,
                                         max_seq_length, tokenizer)

        def create_int_feature(values):
            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
            return f

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(feature.input_ids)
        features["input_mask"] = create_int_feature(feature.input_mask)
        features["segment_ids"] = create_int_feature(feature.segment_ids)
        features["is_real_example"] = create_int_feature(
            [int(feature.is_real_example)])
        if isinstance(feature.label_ids, list):
            label_ids = feature.label_ids
        else:
            label_ids = feature.label_ids[0]
        features["label_ids"] = create_int_feature(label_ids)

        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer.write(tf_example.SerializeToString())
    writer.close()


def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    """Creates an `input_fn` closure to be passed to TPUEstimator."""

    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([2], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.to_int32(t)
            example[name] = t

        return example

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]

        # For training, we want a lot of parallel reading and shuffling.
        # For eval, we want no shuffling and parallel reading doesn't matter.
        d = tf.data.TFRecordDataset(input_file)
        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.apply(
            tf.contrib.data.map_and_batch(
                lambda record: _decode_record(record, name_to_features),
                batch_size=batch_size,
                drop_remainder=drop_remainder))

        return d

    return input_fn
    
    
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


In [44]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_examples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [45]:
num_train_steps

43

### os.path란??
파일 경로를 생성 및 수정하고, 파일 정보를 쉽게 다룰 수 있게 해주는 모듈

In [70]:
# 현재 작업 폴더 얻기
os.getcwd()

'/content'

In [73]:
# 폴더 안에 파일 확인
os.listdir()

['.config',
 'vocab.txt',
 'bert_model.ckpt.index',
 'bert_model.ckpt.data-00000-of-00001',
 'task1_headline_ABSA_train.csv',
 'task1_post_ABSA_train.csv',
 '.ipynb_checkpoints',
 'bert_config.json',
 'Bert',
 'sample_data']

In [76]:
train_file = os.path.join('Bert/working', "train.tf_record")
#filename = Path(train_file)
if not os.path.exists(train_file):
    open(train_file, 'w').close()

In [78]:
file_based_convert_examples_to_features(
            train_examples, MAX_SEQ_LENGTH, tokenizer, train_file)
tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_examples))
tf.logging.info("  Batch size = %d", BATCH_SIZE)
tf.logging.info("  Num steps = %d", num_train_steps)

In [79]:
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

In [91]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # In the demo, we are doing a simple classification task on the entire
    # segment.
    #
    # If you want to use the token-level output, use model.get_sequence_output()
    # instead.
    output_layer = model.get_pooled_output()

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())
    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        
        # probabilities = tf.nn.softmax(logits, axis=-1) ### multiclass case
        probabilities = tf.nn.sigmoid(logits)#### multi-label case
        
        labels = tf.cast(labels, tf.float32)
        tf.logging.info("num_labels:{};logits:{};labels:{}".format(num_labels, logits, labels))
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example_loss)

        # probabilities = tf.nn.softmax(logits, axis=-1)
        # log_probs = tf.nn.log_softmax(logits, axis=-1)
        #
        # one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        #
        # per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        # loss = tf.reduce_mean(per_example_loss)

        return (loss, per_example_loss, logits, probabilities)


def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        #tf.logging.info("*** Features ***")
        #for name in sorted(features.keys()):
        #    tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_real_example = None
        if "is_real_example" in features:
             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        (total_loss, per_example_loss, logits, probabilities) = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
            num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            #tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, probabilities, is_real_example):

                logits_split = tf.split(probabilities, num_labels, axis=-1)
                label_ids_split = tf.split(label_ids, num_labels, axis=-1)
                # metrics change to auc of every class
                eval_dict = {}
                for j, logits in enumerate(logits_split):
                    label_id_ = tf.cast(label_ids_split[j], dtype=tf.int32)
                    current_auc, update_op_auc = tf.metrics.auc(label_id_, logits)
                    eval_dict[str(j)] = (current_auc, update_op_auc)
                eval_dict['eval_loss'] = tf.metrics.mean(values=per_example_loss)
                return eval_dict

                ## original eval metrics
                # predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                # accuracy = tf.metrics.accuracy(
                #     labels=label_ids, predictions=predictions, weights=is_real_example)
                # loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
                # return {
                #     "eval_accuracy": accuracy,
                #     "eval_loss": loss,
                # }

            eval_metrics = metric_fn(per_example_loss, label_ids, probabilities, is_real_example)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metric_ops=eval_metrics,
                scaffold=scaffold_fn)
        else:
            print("mode:", mode,"probabilities:", probabilities)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probabilities": probabilities},
                scaffold=scaffold_fn)
        return output_spec

    return model_fn

In [92]:
OUTPUT_DIR = "Bert/working/output"
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    keep_checkpoint_max=1,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [93]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= len(LABEL_COLUMNS),
  init_checkpoint=BERT_INIT_CHKPNT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [95]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!


Instructions for updating:
Use standard file APIs to delete files with this prefix.


Training took time  0:15:03.974373
