### Task 1: Aspect-based financial sentiment analysis

Given a text instance in the financial domain (microblog message, news statement or headline) in English, detect the 
* target aspects which are mentioned in the text (from a pre-defined list of aspect classes) and,
* predict the sentiment score for each of the mentioned targets. Sentiment scores will be defined using continuous numeric values ranged from -1(negative) to 1(positive). 

Systems will be evaluated with regard to aspect classification, sentiment classification and aspect-sentiment attachment. Participating systems will be evaluated with regard to precision, recall and F1-score for aspect classification approaches and regard to MSE and R Squared(R^2) metrics for sentiment prediction approaches.

In [1]:
import os
import re
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from string import punctuation,digits


In [2]:
train_file ="data/task1_post_ABSA_train.json"
test_file ="data/task1_post_ABSA_test.json"

In [3]:
def load_data(filename, train = True):
    df = pd.read_json(filename)
    
    aspect_pair = dict()
    ids = []
    aspects = []
    snippets = []
    sentence_targets = []
    sentences = []
    sentiment_scores = []
    
    def getAspects(aspect):
        aspect = aspect.replace('[', '')
        aspect = aspect.replace(']', '')
        aspect = aspect.replace('\'', '')
        return aspect.split('/')
    
    for id in df:
        ids.append(id)
        sentence_targets.append(df[id]['info'][0]['target'].lower())
        sentences.append(df[id]['sentence'].lower())
        snippets.append(df[id]['info'][0]['snippets'].lower())
        
        if (train):
            
            aspects.append(getAspects((df[id]['info'][0]['aspects'].lower())))
            sentiment_scores.append(float((df[id]['info'][0]['sentiment_score'])))
                                    
    if (train):
        return sentences, snippets, sentence_targets, aspects, np.asarray(sentiment_scores)
    return ids, sentences, snippets       

In [4]:
df = pd.read_json(train_file)
sentences, snippets, sentence_targets, aspects, sentiment_scores = load_data(train_file, True)

In [5]:
from scipy.interpolate import interp1d
def rescale(series,old_range,new_range):
    m = interp1d(old_range,new_range)
    return [float(m(x)) for x in series]
score = rescale(sentiment_scores,[-1,1],[0,1])
score = np.asarray(score)

In [6]:
df.T.sentence.apply(lambda x: len(x.split())).hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7f6c2793f668>

In [7]:
def remove_punctuation(s):
    list_punctuation = list(punctuation)
    for i in list_punctuation:
        s = s.replace(i,' ')
    return s
def clean_sentence(sentence):
    sentence = sentence.lower()
    #remove multiple repeat non num-aplha char !!!!!!!!!-->!
    sentence = re.sub(r'(\W)\1{2,}', r'\1', sentence) 
#     print(sentence)
    #removes alpha char repeating more than twice aaaa->aa
    sentence = re.sub(r'(\w)\1{2,}', r'\1\1', sentence)
#     print(sentence)
    #removes links
    sentence = re.sub(r'(?P<url>https?://[^\s]+)', r'', sentence)
#     print(sentence)
    # remove @usernames
    sentence = re.sub(r"\@(\w+)", "", sentence)
#     print(sentence)
    #removing stock names to see if it helps
#     sentence = re.sub(r"(?:\$|https?\://)\S+", "", sentence)
    #remove # from #tags
    sentence = sentence.replace('#','')
    sentence = sentence.replace("'s",'')
    sentence = sentence.replace("-",' ')
#     print(sentence)
    # split into tokens by white space
    tokens = sentence.split()
    # remove punctuation from each token
    tokens = [remove_punctuation(w) for w in tokens]
#     print(tokens)
    #     remove remaining tokens that are not alphabetic
#     tokens = [word for word in tokens if word.isalpha()]
#no removing non alpha words to keep stock names($ZSL)
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
#     for w in stop_words:
#         print(w)
#     print(tokens)
    # filter out short tokens
#     tokens = [word for word in tokens if len(word) > 1]
#     print(tokens)
    remove_digits = str.maketrans('', '', digits)
#     print(tokens)
    tokens = [w.translate(remove_digits) for w in tokens]
    tokens = [w.strip() for w in tokens]
    tokens = [w for w in tokens if w!=""]
#     print(tokens)
    tokens = ' '.join(tokens)
    return tokens

In [8]:
print('cleaning data set')
sentences = [clean_sentence(x) for x in sentences]

cleaning data set


In [9]:
sentences[:5]

['slowly adding fio gotta careful one biggest winners',
 'trx long setup macd cross',
 'optimistic amzn fundementals charts look like poopoo quarter',
 'grpn might selling ahead p earnings',
 'iaci looks good weekly chart']

In [10]:
print('cleaning targets')
sentence_targets = [clean_sentence(x) for x in sentence_targets]

cleaning targets


In [11]:
# sentence_targets

In [12]:
lengths = [len(s.split()) for s in sentences]
max_length = max(lengths)
max_length

19

In [13]:
import os
import h5py
import pprint
import pandas as pd 
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from bert.tokenization import FullTokenizer
import os 
import re
from tqdm import tqdm_notebook
import tqdm
sess=tf.Session()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
max_seq_length=max_length
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [15]:
# padding dummy - doesnt pad
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.
  We use this class instead of `None` because treating `None` as padding
  battches could cause silent errors.
  """


      
 
 # created bert tokenizer
def create_tokenizer_from_hub_module():
    bert_module =  hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)


# creates the formaat for input for BERT
class InputExample(object):
    def __init__(self, guid, text, labels):
        self.guid = guid
        self.text = text
        self.labels = labels
        
class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids,
        self.is_real_example=is_real_example  
        

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens = tokenizer.tokenize(example.text)
    if len(tokens) > max_seq_length - 2:
        tokens = tokens[0 : (max_seq_length - 2)]
    
    all_phrases = []
    all_tokens = []
    segment_ids = []
    all_tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens:
        all_tokens.append(token)
        segment_ids.append(1)
    all_tokens.append("[SEP]")
    segment_ids.append(1)

    all_phrases.append(example.text)
    input_ids = tokenizer.convert_tokens_to_ids(all_tokens)
    

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.labels, all_phrases

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels, all_phrases = [], [], [], [], []
    for i, example in enumerate(examples):
        input_id, input_mask, segment_id, label, phrases = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
        all_phrases.append(phrases)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels),
        np.array(all_phrases)
    )

def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    
    for text, label in zip(texts, labels):
#         print(text, label)
        InputExamples.append(
            InputExample(guid=None, text=" ".join(text), labels=label)
        )
#         print(InputExamples)
    return InputExamples

def input_fn_builder(features, seq_length, is_training, drop_remainder):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    all_label_ids = []

    for feature in features:
        all_input_ids.append(feature.input_ids)
        all_input_mask.append(feature.input_mask)
        all_segment_ids.append(feature.segment_ids)
        all_label_ids.append(feature.label_ids)

    def input_fn(params):
        batch_size = params["batch_size"]
        num_examples = len(features)

        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        
        d = tf.data.Dataset.from_tensor_slices({
            "input_ids":
                tf.constant(
                    all_input_ids, shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "input_mask":
                tf.constant(
                    all_input_mask,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "segment_ids":
                tf.constant(
                    all_segment_ids,
                    shape=[num_examples, seq_length],
                    dtype=tf.int32),
            "label_ids":
                tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32)
        })

        if is_training:
            d = d.repeat()
            d = d.shuffle(buffer_size=100)

        d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
        return d

    return input_fn

In [16]:
tokenizer = create_tokenizer_from_hub_module() 

W0921 06:31:59.632857 140103382267712 deprecation_wrapper.py:119] From /home/pakhi/.local/lib/python3.6/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [17]:
# score

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sentences, score, test_size=0.2, random_state=42)

In [19]:
train_examples = convert_text_to_examples(X_train, y_train)
test_examples = convert_text_to_examples(X_test, y_test)

In [20]:
(train_input_ids, train_input_masks, train_segment_ids, train_labels, train_examples) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
(test_input_ids, test_input_masks, test_segment_ids, test_labels, test_examples) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)

In [21]:
from tensorflow.keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [22]:
class BertLayer(tf.keras.layers.Layer):
                    def __init__(self, n_fine_tune_layers, **kwargs):
                        self.n_fine_tune_layers = n_fine_tune_layers
                        self.trainable = True
                        self.output_size = 768
                        super(BertLayer, self).__init__(**kwargs)

                    def build(self, input_shape):
                        self.bert = hub.Module(
                        bert_path,
                        trainable=True,# did this in place of self.trainable
                        name="{}_module".format(self.name)
                      )

                        trainable_vars = self.bert.variables


                        trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
                        #print("--------------------------len=",len(trainable_vars))
                        # Select how many layers to fine tune
                        trainable_vars = trainable_vars[-self.n_fine_tune_layers:]

                        # Add to trainable weights
                        for var in trainable_vars:
                            self._trainable_weights.append(var)

                        for var in self.bert.variables:
                            if var not in self._trainable_weights:
                                self._non_trainable_weights.append(var)

                        super(BertLayer, self).build(input_shape)

                    def call(self, inputs):
                        inputs = [K.cast(x, dtype="int32") for x in inputs]
                        input_ids, input_mask, segment_ids = inputs
                        bert_inputs = dict(
                          input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
                      )
                        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                          "pooled_output"
                      ]
                        return result

                    def compute_output_shape(self, input_shape):
                        return (input_shape[0], self.output_size)

In [23]:
total_epochs = 50

In [24]:

def sk_mse(y_true,y_pred):
     return K.mean(K.square(y_pred - y_true), axis=-1)
from sklearn.metrics import mean_squared_error as mse

for lr in [3e-4]:
    for epochs in [total_epochs]:
        for dropout in [0.3]:
            for layers in [2]: 
                sess=tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
                from tensorflow.keras.layers import Input,Dense
                #in_id=Input(shape=(max_seq_length,),)
                #in_mask=Input(shape=(max_seq_length,),)
                #in_segment=Input(shape=(max_seq_length,),)
                in_id = tf.keras.layers.Input(shape=(max_seq_length,))
                in_mask = tf.keras.layers.Input(shape=(max_seq_length,))
                in_segment = tf.keras.layers.Input(shape=(max_seq_length,))
                bert_inputs=[in_id,in_mask,in_segment]
                bert_outputs=BertLayer(n_fine_tune_layers=5)(bert_inputs)
                step=bert_outputs
                if layers>=3:
                    step=tf.keras.layers.Dense(512,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                if layers>=2:
                    step=tf.keras.layers.Dense(256,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                if layers>=1:    
                    step=tf.keras.layers.Dense(64,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                        
                        
#                 model.add(Dense(y_train.shape[1], activation='softmax'))

# sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='categorical_crossentropy',
#               optimizer=sgd,
#               metrics=['accuracy',])

                pred=tf.keras.layers.Dense(1,activation='sigmoid')(step)
                
#                 pred=tf.keras.layers.Dense(512,activation='relu')(step)
                model=tf.keras.Model(inputs=bert_inputs,outputs=pred)
    
                model.compile(loss='binary_crossentropy',
                        optimizer=tf.keras.optimizers.SGD(lr = lr),
                        metrics=[sk_mse])
                sess.run(tf.local_variables_initializer())
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                K.set_session(sess)

           
                model.fit([train_input_ids, train_input_masks, train_segment_ids],
                        train_labels,
                        epochs=epochs,
                        batch_size=64,
                        validation_data=([test_input_ids, test_input_masks, test_segment_ids],test_labels))
                print(lr," ",epochs," ",dropout," " ,layers)
                
#                 from sklearn.metrics import f1_score,accuracy_score
#                 predict=model.predict([test_input_ids, test_input_masks, test_segment_ids])>0.5
#                 print("task=",f1_score(test_labels,predict),"  acc=",accuracy_score(test_labels,predict))
                
                
                model.save('model-{}-{}-{}-{}.h5'.format(lr,epochs,dropout,layers))

W0921 06:32:10.637991 140103382267712 deprecation.py:506] From /home/pakhi/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0921 06:32:10.728289 140103382267712 deprecation.py:323] From /home/pakhi/.local/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 540 samples, validate on 135 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
0.0003   50   0.3   2


In [None]:

def sk_mse(y_true,y_pred):
     return K.mean(K.square(y_pred - y_true), axis=-1)
from sklearn.metrics import mean_squared_error as mse

for lr in [3e-4]:
    for epochs in [total_epochs]:
        for dropout in [0.3]:
            for layers in [3]: 
                sess=tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
                from tensorflow.keras.layers import Input,Dense
                #in_id=Input(shape=(max_seq_length,),)
                #in_mask=Input(shape=(max_seq_length,),)
                #in_segment=Input(shape=(max_seq_length,),)
                in_id = tf.keras.layers.Input(shape=(max_seq_length,))
                in_mask = tf.keras.layers.Input(shape=(max_seq_length,))
                in_segment = tf.keras.layers.Input(shape=(max_seq_length,))
                bert_inputs=[in_id,in_mask,in_segment]
                bert_outputs=BertLayer(n_fine_tune_layers=10)(bert_inputs)
                step=bert_outputs
                if layers>=3:
                    step=tf.keras.layers.Dense(512,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                if layers>=2:
                    step=tf.keras.layers.Dense(256,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                if layers>=1:    
                    step=tf.keras.layers.Dense(64,activation='relu')(step)
                    if dropout!=0:
                        step=tf.keras.layers.Dropout(rate=dropout)(step)
                        
                        
#                 model.add(Dense(y_train.shape[1], activation='softmax'))

# sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='categorical_crossentropy',
#               optimizer=sgd,
#               metrics=['accuracy',])

                pred=tf.keras.layers.Dense(1,activation='sigmoid')(step)
                
#                 pred=tf.keras.layers.Dense(512,activation='relu')(step)
                model=tf.keras.Model(inputs=bert_inputs,outputs=pred)
    
                model.compile(loss='binary_crossentropy',
                        optimizer=tf.keras.optimizers.Adam(),
                        metrics=[sk_mse])
                sess.run(tf.local_variables_initializer())
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                K.set_session(sess)

           
                model.fit([train_input_ids, train_input_masks, train_segment_ids],
                        train_labels,
                        epochs=epochs,
                        batch_size=64,
                        validation_data=([test_input_ids, test_input_masks, test_segment_ids],test_labels))
                print(lr," ",epochs," ",dropout," " ,layers)
                
#                 from sklearn.metrics import f1_score,accuracy_score
#                 predict=model.predict([test_input_ids, test_input_masks, test_segment_ids])>0.5
#                 print("task=",f1_score(test_labels,predict),"  acc=",accuracy_score(test_labels,predict))
                
                
                model.save('model-{}-{}-{}-{}.h5'.format(lr,epochs,dropout,layers))

In [None]:
from keras.models import load_model
model.load_weights("model-0.0003-50-0.3-2.h5")

In [69]:
# sentences, snippets, sentence_targets = load_data(test_file,False)

In [108]:
import tensorflow as tf
from keras import backend as K

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    print(x.eval())

[0.04228542 0.04635481 0.04639622 0.04229137 0.0419168  0.08598778
 0.05294004 0.04566521 0.05538781 0.04919351 0.05847561 0.08086222
 0.04201136 0.04231504 0.07805026 0.04793698 0.04709451 0.04434207
 0.05671643 0.06963301 0.0447852  0.04693359 0.06271897 0.04186552
 0.07006823 0.08654392 0.04632385 0.05089989 0.06610586 0.07212201
 0.05023433 0.04173227 0.04693127 0.04175879 0.04446321 0.07106253
 0.05510156 0.0447852  0.04193075 0.0442617  0.05080562 0.04205786
 0.04177163 0.04626166 0.04174399 0.04192484 0.04172815 0.05371148
 0.06341856 0.04280535 0.06103785 0.05276686 0.04256677 0.08942407
 0.04173762 0.04176524 0.08477106 0.07470819 0.0430429  0.04373793
 0.04173628 0.0418129  0.06306099 0.04719155 0.06942317 0.04174575
 0.04193601 0.05030504 0.05649036 0.05343602 0.04174253 0.04204969
 0.04896932 0.04356697 0.05262467 0.0485943  0.04220891 0.04194627
 0.04602109 0.04564696 0.04613045 0.07988592 0.04738835 0.05239591
 0.04766217 0.07470819 0.04177954 0.04225017]


In [80]:
ss= sk_mse(y_test, preds)

In [107]:
import numpy
x = (K.mean(K.square(preds - y_test), axis=-1))

In [72]:
# # train_examples = convert_text_to_examples(X_train,y_train)
# test_examples = convert_text_to_examples(sentences, sentence_targets)
# # (train_input_ids, train_input_masks, train_segment_ids, train_labels, train_examples) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
# (test_input_ids, test_input_masks, test_segment_ids, test_labels, test_examples) = convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)
# # train_examples = [str(tr).replace(" ","-").replace("--"," ").replace("-","") for tr in train_examples]
# # test_examples = [str(tr).replace(" ","-").replace("--"," ").replace("-","") for tr in test_examples]

preds = model.predict([test_input_ids, 
                       test_input_masks, 
                      test_segment_ids]) 

In [42]:
pred_val = [x[0] for x in preds]
pred_val = rescale(pred_val,[-1,1],[0,1])

In [73]:
y_test[:10]

array([0.356 , 0.607 , 0.209 , 0.6435, 0.6815, 0.7595, 0.2015, 0.3445,
       0.6695, 0.6825])

In [102]:
preds.shape

(88, 1)

In [114]:
X_test

['berkshire discloses unit ties iran opens probe',
 'kraft heinz merger came together speedy weeks',
 'us dollar wipes sales gains sabmiller',
 'shire says internal synergy goals baxalta deal higher',
 'greene king third quarter sales boosted festive season',
 'astrazeneca chases acerta secure next cancer drug winner',
 'ftse drops pct glencore metals price fears',
 'kingfisher takeover mr bricolage could hit brick wall',
 'uk housing market steadies brexit dip persimmon says',
 'whitbread buys stake pure food chain',
 'companiestesco bad start xmas â€“ kantar',
 'british american tobacco drops sues pwc pollution scandal',
 'ab inbev looks win sabmiller investors',
 'retail giant kingfisher reports olid start year',
 'ftse falls month low greek debt concerns easyjet skids',
 'uk ftse worst day far bg prudential fall',
 'aviva m g suspend property funds investors panic',
 'rio tinto ceo says iron ore market equilibrium',
 'citigroup sell onemain springleaf billion',
 'update stifel buy 

In [100]:
y_test

array([0.356 , 0.607 , 0.209 , 0.6435, 0.6815, 0.7595, 0.2015, 0.3445,
       0.6695, 0.6825, 0.198 , 0.2815, 0.638 , 0.684 , 0.293 , 0.1745,
       0.0965, 0.5   , 0.6415, 0.582 , 0.825 , 0.376 , 0.0865, 0.612 ,
       0.8885, 0.779 , 0.757 , 0.7775, 0.7135, 0.1625, 0.6405, 0.251 ,
       0.642 , 0.7865, 0.569 , 0.695 , 0.301 , 0.6295, 0.343 , 0.655 ,
       0.613 , 0.7745, 0.586 , 0.682 , 0.607 , 0.5945, 0.5995, 0.625 ,
       0.421 , 0.508 , 0.5925, 0.666 , 0.795 , 0.243 , 0.313 , 0.6175,
       0.7145, 0.526 , 0.172 , 0.6185, 0.533 , 0.477 , 0.686 , 0.6405,
       0.131 , 0.429 , 0.641 , 0.5   , 0.454 , 0.6175, 0.299 , 0.3335,
       0.5995, 0.626 , 0.5675, 0.3495, 0.107 , 0.6225, 0.7475, 0.5125,
       0.62  , 0.7295, 0.937 , 0.4625, 0.318 , 0.8595, 0.4435, 0.758 ,
       0.5185, 0.3355, 0.7005, 0.5265, 0.6675, 0.625 , 0.3275, 0.2145,
       0.703 , 0.607 , 0.6635, 0.5385, 0.9235, 0.723 , 0.5645, 0.739 ,
       0.1105, 0.6065, 0.7755, 0.5855, 0.564 ])

In [101]:
preds

array([[0.5893113 ],
       [0.6145039 ],
       [0.5230806 ],
       [0.56306386],
       [0.54047114],
       [0.64143586],
       [0.6093089 ],
       [0.6004259 ],
       [0.5357079 ],
       [0.6039903 ],
       [0.60341454],
       [0.5169948 ],
       [0.5572565 ],
       [0.5291268 ],
       [0.57132906],
       [0.5365583 ],
       [0.34788162],
       [0.51631945],
       [0.58267576],
       [0.561509  ],
       [0.5421865 ],
       [0.53402257],
       [0.5311986 ],
       [0.51927257],
       [0.26019973],
       [0.63256836],
       [0.56974584],
       [0.5873601 ],
       [0.5836445 ],
       [0.57729363],
       [0.59763235],
       [0.53369987],
       [0.5459679 ],
       [0.55213094],
       [0.5272893 ],
       [0.5981059 ],
       [0.57120275],
       [0.5421865 ],
       [0.54254436],
       [0.5999911 ],
       [0.6611622 ],
       [0.6079808 ],
       [0.5244283 ],
       [0.6314275 ],
       [0.6043837 ],
       [0.52255017],
       [0.5557072 ],
       [0.528

In [102]:
result = {'results':[]}

In [103]:
for s_id,s_sn,s_score in zip(sentences,snippets,score):
    result['results'].append({'id':s_id,'snippet':s_sn,'sentiment_scores':str(s_score)})

In [104]:
result = {'results':[]}

for s_id,s_sn,s_score in zip(sentences,snippets,pred_val):
    result['results'].append({'id':s_id,'snippet':s_sn,'sentiment_scores':str(s_score)})

In [105]:
result

{'results': [{'id': '0_Cuadrilla',
   'snippet': 'cuadrilla files to delay application to frack in lancashire',
   'sentiment_scores': '0.313'},
  {'id': '1001_Sainsbury',
   'snippet': 'sainsbury chief warns of squeeze on high street retailers',
   'sentiment_scores': '0.38'},
  {'id': '1006_Barclays',
   'snippet': 'barclays fined for anti-money-laundering failings',
   'sentiment_scores': '0.4195'},
  {'id': '1007_Barclays',
   'snippet': "update 3-barclays fined for lax crime checks in 'deal of century'",
   'sentiment_scores': '0.5685'},
  {'id': '1014_GSK',
   'snippet': 'gsk aims to file up to 20 new drugs for approval by 2020',
   'sentiment_scores': '0.852'},
  {'id': '1031_National Grid',
   'snippet': 'companiesnational grid lines up sale of gas business',
   'sentiment_scores': '0.723'},
  {'id': '1034_British American Tobacco',
   'snippet': 'british american tobacco accused of bribing senior politicians to sabotage ...',
   'sentiment_scores': '0.5680000000000001'},
  {'i