In [1]:
# %load main.py
import os
import re
import shutil
from tqdm import tqdm

import numpy  as np
import pandas as pd
import joblib
import nltk
import ekphrasis
from collections import Counter

### Step1: Data Pre-processing

In [2]:
# Locate the Task A training file. `Path` is the parent of the current working
# directory, so the kernel must be started in a folder that sits next to
# `Datasets/`.
Path =os.path.dirname(os.getcwd())
data_pathA=os.path.join(Path,'Datasets/A/twitter-2016train-A.txt')

In [3]:
# Load the tab-separated training file (no header row) into a DataFrame and
# name its four columns.
# NOTE(review): pandas' default quote handling is active here, so a tweet with
# an unbalanced '"' could swallow the rows that follow it — consider
# quoting=csv.QUOTE_NONE and confirm the row count against the raw file.
dataA = pd.read_table(data_pathA,sep='\t',header=None)
dataA.columns = ['ID','Sentiment','Text','Nan']
def add_label(sentiment):
    """Translate a sentiment string into its integer class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
    Any other value yields None.
    """
    label_ids = {'negative': 0, 'neutral': 1, 'positive': 2}
    return label_ids.get(sentiment)

# Attach the integer class id to every row, then display the frame.
dataA['label'] = dataA.Sentiment.apply(add_label)
dataA

Unnamed: 0,ID,Sentiment,Text,Nan,label
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",,1
1,619969366986235905,neutral,Order Go Set a Watchman in store or through ou...,,1
2,619971047195045888,negative,If these runway renovations at the airport pre...,,0
3,619974445185302528,neutral,If you could ask an onstage interview question...,,1
4,619987808317407232,positive,A portion of book sales from our Harper Lee/Go...,,2
...,...,...,...,...,...
20627,681877834982232064,neutral,@ShaquilleHoNeal from what I think you're aski...,,1
20628,681879579129200640,positive,"Iran ranks 1st in liver surgeries, Allah bless...",,2
20629,681883903259357184,neutral,Hours before he arrived in Saudi Arabia on Tue...,,1
20630,681904976860327936,negative,@VanityFair Alex Kim Kardashian worth how to ...,,0


In [4]:
# Number of tweets per class id (0 = negative, 1 = neutral, 2 = positive)
dataA['label'].value_counts()

1    10342
2     7059
0     3231
Name: label, dtype: int64

1. Case conversion
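If lowercasing is not applied, a corpus that contains both “India” and “india” leads the machine to treat them as two separate terms, even though they are different forms of the same word and refer to the same country. After lowercasing, only one form remains — “india” — which simplifies the task of finding every mention of India in the corpus.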

In [5]:
#import ekphrasis library
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pipeline; Tokenize() and tokenizer_lstm() both call
# text_processor.pre_process_doc() on every tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term is listed exactly once)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [6]:
# Make sure the NLTK English stop-word list is available locally, then keep it
# as a set for the membership tests used by the token filters.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HCY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def Tokenize(Texts):
    """Tokenize every text and build a token -> integer-id vocabulary.

    Each text is run through the module-level ``text_processor``. The tokens
    ``'s'`` and ``'`` and every token found in the module-level ``stop`` set
    are dropped. Corpus statistics (total tokens, unique tokens and the 30
    most common tokens) are printed as a side effect.

    :param Texts: iterable of raw text strings
    :return: ``(token, vocab)``; ``token`` is a list holding one token list
        per input text, ``vocab`` maps each distinct token to an id that
        starts at 1 and follows order of first appearance
    """
    token = []
    for Text in Texts:
        kept = [
            word for word in text_processor.pre_process_doc(Text)
            if word != 's' and word != '\'' and word not in stop
        ]
        token.append(kept)

    # Counter preserves insertion order, so iterating it below yields tokens
    # in order of first appearance.
    counts = Counter(word for kept in token for word in kept)
    print("All words: {}".format(sum(counts.values())))
    print("Unique words: {}".format(len(counts)))

    # Ask for the top 30 directly instead of sorting the whole counter.
    Most_common = counts.most_common(30)
    print("Top 30 most common words: {}".format(Most_common))

    # Ids start at 1; 0 is never assigned to a token.
    vocab = {word: num for num, word in enumerate(counts, 1)}
    return token, vocab

In [8]:
# Tokenize all training tweets: `token` holds one token list per tweet and
# `vocab` maps each distinct token to its integer id.
token,vocab=Tokenize(dataA.Text)

All words: 333435
Unique words: 21914
Top 30 most common words: [('.', 18786), (',', 9728), ('<user>', 7027), ('<url>', 6377), ('<number>', 5262), ('<repeated>', 5142), ('<hashtag>', 5092), ('</hashtag>', 5092), ('!', 4391), ('<allcaps>', 4306), ('</allcaps>', 4306), ('may', 3563), ('-', 3550), ('tomorrow', 2953), ('?', 2787), ('"', 2731), (':', 2682), ('th', 2161), ('1', 2106), ('day', 1714), ('<date>', 1701), ('2', 1379), ('st', 1336), ('friday', 1328), ('see', 1305), ('sunday', 1285), ('&', 1282), ('night', 1267), ('like', 1207), ('going', 1162)]


### Step2: Word2Vec Pretraining

In [9]:
from gensim.models import Word2Vec, KeyedVectors
from nltk import word_tokenize
import multiprocessing
import tensorboard 

In [10]:
# Create the Word2Vec model from the tokenized tweets: context window of 5,
# every token kept (min_count=1), one worker per CPU core.
# NOTE(review): per the gensim docs, passing the corpus to the constructor
# already builds the vocabulary and runs a first training pass — confirm.
word2vec_model=Word2Vec(token,window=5, min_count=1,workers = multiprocessing.cpu_count())

In [11]:
# Train on the same corpus for 100 epochs (on top of whatever the constructor
# call already did).
# NOTE(review): no seed is set and several workers are used, so the learned
# vectors presumably differ between runs — confirm whether that matters.
word2vec_model.train(token, total_examples = len(token), epochs = 100)

(26101584, 33343500)

In [12]:
# Show the model's one-line description.
print(f'This is summary of Word2Vec: {word2vec_model}')

This is summary of Word2Vec: Word2Vec(vocab=21914, vector_size=100, alpha=0.025)


In [13]:
# Token -> row-index mapping of the trained vectors.
index=word2vec_model.wv.key_to_index

In [14]:
# Write the learned vectors to 'Word2Vec.vector' (relative path, so the file
# lands in the current working directory).
word2vec_model.wv.save_word2vec_format('Word2Vec.vector')

In [15]:
# Path to an external embedding file — presumably pre-trained 100-d Twitter
# vectors, judging by the file name; verify.
word_path=os.path.join(Path,'Datasets/datastories.twitter.100d.txt')

In [16]:
# Collect the trained vectors in two forms: a matrix whose row i is the vector
# of the token with index i, and a token -> vector dictionary.
# Every key of `index` comes from the model itself, so no membership test is
# needed, and the width is read from the model rather than hard-coded.
embed_matrix = np.zeros((len(index), word2vec_model.wv.vector_size))
embed_dict={}
for word, i in index.items():
    vector = word2vec_model.wv[word]
    embed_matrix[i] = vector
    embed_dict[word] = vector

In [17]:
# Drop the model object; its vectors were already copied into
# embed_matrix / embed_dict and written to 'Word2Vec.vector'.
del word2vec_model

In [18]:
# Load the space-separated embedding file (no header row) into a DataFrame.
# NOTE(review): default quoting and NA parsing are in effect, so tokens such as
# '"' or 'null' may be mis-read — confirm, e.g. with quoting=csv.QUOTE_NONE and
# keep_default_na=False.
word = pd.read_table(word_path,sep=' ',header=None)

In [19]:
# Turn column 0 into the row index, modifying `word` in place.
# NOTE(review): re-running this cell without reloading `word` raises KeyError,
# because column 0 no longer exists after the first run.
word.set_index(0,inplace=True)

In [20]:
# Map every index label of `word` to its row (a pandas Series of the remaining
# columns).
Embed_dict = {
    word.index[row]: word.iloc[row, :]
    for row in range(word.shape[0])
}

In [21]:
# Embed_path=os.path.join(Path,'Datasets/Embed_dict')


### Step3: Training

In [22]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D, Bidirectional

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from IPython import display
import transformers
import tensorflow_hub as hub
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
display.set_matplotlib_formats('svg')

  from .autonotebook import tqdm as notebook_tqdm


Version:  2.6.0
Eager mode:  True
GPU is available


  display.set_matplotlib_formats('svg')


In [23]:
def tokenizer_lstm(X, vocab, seq_len):
    '''
    Convert raw texts into a fixed-width matrix of vocabulary ids.

    Each text is run through the module-level ``text_processor``. The tokens
    ``'s'`` and ``'`` and every token in the module-level ``stop`` set are
    dropped, and tokens that are not keys of ``vocab`` are skipped. Rows
    shorter than ``seq_len`` are left-padded with 0; longer ones keep only
    their first ``seq_len`` ids.

    :param X: sized iterable of raw text strings
    :param vocab: dict mapping token -> integer id (0 is used for padding, so
        ids are expected to be non-zero)
    :param seq_len: number of columns of the returned matrix
    :return: ``np.int64`` array of shape ``(len(X), seq_len)``
    '''
    X_tmp = np.zeros((len(X), seq_len), dtype=np.int64)
    for i, text in enumerate(X):
        tokens = [word for word in text_processor.pre_process_doc(text) if (word!='s' and word!='\'')]
        # Membership is checked against the vocabulary that is actually
        # indexed, so an unknown token can never raise KeyError and the
        # function no longer depends on a module-level embedding dict.
        token_ids = [vocab[word] for word in tokens if word not in stop and word in vocab]
        kept = token_ids[:seq_len]
        if kept:
            # Right-align the ids; the untouched leading cells stay 0.
            X_tmp[i, seq_len - len(kept):] = kept

    return X_tmp

In [24]:
# Encode every training tweet as a row of 100 vocabulary ids.
X=tokenizer_lstm(dataA.Text, vocab, 100)###

In [25]:
# Spot-check the integer id assigned to the token 'rock'.
vocab['rock']

1654

In [26]:
# One-hot encode the class ids into 3 columns.
Y = tf.one_hot(dataA.label, depth=3)

In [27]:
# Convert the one-hot tensor to a NumPy array and hold out 10% of the rows as a
# test split; random_state is fixed so the split is repeatable.
Y= np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size=0.1, random_state=1000) 

In [28]:
def build_embedding_layer(vocab, embeddings_index, embed_dim=100, seq_len=100):
    """
    Build a frozen Keras Embedding layer initialised from existing vectors.

    :param vocab: dict mapping token -> integer row id; ids are expected to
        lie in ``1..len(vocab)`` (row 0 stays all-zero)
    :param embeddings_index: dict-like mapping token -> vector of length
        ``embed_dim``; tokens without an entry keep an all-zero row
    :param embed_dim: width of every embedding vector
    :param seq_len: ``input_length`` passed to the Embedding layer
    :return: ``(embedding_layer, embedding_matrix)``
    """
    # One extra row: ids start at 1, row 0 is left as zeros.
    vocab_size = len(vocab) + 1
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, i in vocab.items():
        # Read from the argument, not from a module-level dict, so the
        # function does not depend on notebook state.
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Row i of the matrix must match the id of `word`.
            embedding_matrix[i] = embedding_vector

    # trainable=False keeps the supplied vectors fixed during training.
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = embed_dim, weights = [embedding_matrix], input_length = seq_len, trainable=False)
    return embedding_layer,embedding_matrix

In [29]:
# def build_embedding_layer(vocab, embeddings_index):
#     """
#     Build embedding matrix and embedding layer
#     :param vocab_size: vocabulary size
#     :param tok: tokenizer
#     :param embeddings_index: embedding index
#     :return: embedding matrix and embedding layer
#     """
#     #Build embedding matrix
#     vocab_size=len(vocab)+1
#     embedding_matrix = np.zeros((vocab_size, 100))
#     for word, i in vocab.items():
#         try:
#             # Vector corresponds to word
#             embedding_vector = embeddings_index.get(word)
#         except:
#             embedding_vector = embeddings_index['<unk>']#['unknown']#['<unk>']
#         if embedding_vector is not None:
#             # Ensure vector of embedding_matrix row matches word index
#             embedding_matrix[i] = embedding_vector
#     # Build embedding layer
#     embedding_layer = Embedding(input_dim = vocab_size, output_dim = 100, weights = [embedding_matrix], input_length = 100, trainable=False)
#     return embedding_layer,embedding_matrix

In [30]:
# Number of distinct tokens in the vocabulary.
len(vocab)

21914

In [31]:
# Build the frozen embedding layer from the Word2Vec vectors held in embed_dict.
embedding_layer,embedding_matrix=build_embedding_layer(vocab, embed_dict) ###

In [32]:
# Row count of the embedding matrix (vocabulary size plus one).
len(embedding_matrix)

21915

In [33]:
# Release the embedding containers.
# NOTE(review): `embedding_matrix` is read again by model_Train and by the
# attention cell further down, so those fail with NameError after this cell
# unless the matrix is rebuilt.
del Embed_dict,embedding_matrix,embed_dict,embed_matrix

In [34]:
def model_train(X_train, y_train, embedding_layer, epochs=26, batch_size=256,
                validation_split=0.2, model_path='taskA.h5'):
    """
    Build, train and save the BiLSTM sentiment classifier.

    Architecture: ``embedding_layer`` -> SpatialDropout1D(0.2) ->
    Bidirectional(LSTM(128)) -> Dense(3, softmax), compiled with Adam and
    categorical cross-entropy.

    :param X_train: integer id matrix fed to the embedding layer
    :param y_train: one-hot labels with 3 columns
    :param embedding_layer: Keras Embedding layer used as the first layer
    :param epochs: number of training epochs
    :param batch_size: mini-batch size
    :param validation_split: fraction of the training data held out for
        validation by ``model.fit``
    :param model_path: file the trained model is saved to
    :return: ``(train_acc, val_acc, history)`` — last-epoch training
        accuracy, last-epoch validation accuracy and the Keras History object
    """
    # Process-wide TensorFlow setting: log the device each op is placed on.
    tf.debugging.set_log_device_placement(True)

    model = Sequential()
    model.add(embedding_layer)
    model.add(SpatialDropout1D(0.2))
    # NOTE(review): per the Keras LSTM docs a non-zero recurrent_dropout rules
    # out the cuDNN kernel, so training uses the slower generic implementation
    # — confirm this trade-off is intended.
    model.add(Bidirectional(LSTM(128,dropout = 0.2,recurrent_dropout = 0.5)))
    model.add(Dense(3, activation = 'softmax'))
    model.summary()

    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    history = model.fit(X_train, y_train, validation_split = validation_split,
                        epochs = epochs, batch_size = batch_size)
    model.save(model_path)

    # Report the metrics of the final epoch only.
    train_acc = history.history['accuracy'][-1]
    val_acc = history.history['val_accuracy'][-1]
    return train_acc, val_acc, history

In [35]:
# Train the BiLSTM on the training split and keep the final-epoch accuracies
# together with the Keras history.
acc_A_train, acc_A_val, history = model_train(X_train, Y_train, embedding_layer)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          2191500   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 3)                 771       
Total params: 2,426,767
Trainable params: 235,267
Non-trainable params: 2,191,500
_________________________________________________________________
Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 2

KeyboardInterrupt: 

In [None]:
def model_Train(X_train, y_train):
    """
    Experimental trainer that mixes a Sequential BiLSTM with attention stacks.

    What the code does, in order:
      * creates an empty ``Sequential`` model and adds ``SpatialDropout1D(0.2)``;
      * builds two integer ``Input`` tensors, a shared ``Embedding`` seeded
        from the module-level ``embedding_matrix``, a positional encoding and
        encoder/decoder ``Attention`` stacks that end in ``input_layer``;
      * adds ``Bidirectional(LSTM(128))`` to the Sequential model, compiles
        and fits it, then prints the summary.

    NOTE(review): ``input_layer`` and everything leading to it is never
    attached to ``model``; the fitted network consists only of the dropout and
    BiLSTM layers, with no Embedding and no Dense output layer. This
    presumably fails at fit time — confirm and wire the pieces together
    before relying on this function.
    NOTE(review): ``vocab`` and ``embedding_matrix`` are read from notebook
    globals; ``embedding_matrix`` is deleted by an earlier cell.

    :param X_train: training inputs passed to ``model.fit``
    :param y_train: training targets passed to ``model.fit``
    :return: ``(train_acc, val_acc, history)`` — last-epoch training accuracy,
        last-epoch validation accuracy and the Keras History object
    """
    
    
    model = Sequential()
    class PositionalEncoding(keras.layers.Layer):
        """Adds a fixed sine/cosine table to its input."""
        def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
            super().__init__(dtype=dtype, **kwargs)
            if max_dims % 2 == 1: max_dims += 1 # max_dims must be even
            # p holds the step positions, i the index of each sin/cos pair.
            p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
            pos_emb = np.empty((1, max_steps, max_dims))
            # Even feature indices get the sine term, odd ones the cosine term.
            pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T
            pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T
            self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))
        def call(self, inputs):
            # Slice the table to the input's last two dimensions before adding.
            shape = tf.shape(inputs)
            return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]


    model.add(SpatialDropout1D(0.2))   
    
    embed_size = 100; max_steps = 500; vocab_size = len(vocab)+1

    encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

    # One Embedding layer shared by both inputs, initialised from the
    # module-level embedding_matrix.
    embeddings = keras.layers.Embedding(vocab_size, embed_size,weights = [embedding_matrix])

    encoder_embeddings = embeddings(encoder_inputs)
    decoder_embeddings = embeddings(decoder_inputs)

    positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)

    encoder_in = positional_encoding(encoder_embeddings)
    decoder_in = positional_encoding(decoder_embeddings)
    
    # Encoder: six stacked self-attention layers, each fed the previous output.
    Z = encoder_in
    for N in range(6):
        Z = keras.layers.Attention(use_scale=True)([Z, Z])

    encoder_outputs = Z
    Z = decoder_in
    # NOTE(review): Z is not reassigned inside this loop, so all six iterations
    # attend over the same decoder_in and only the last pair of outputs is
    # used afterwards.
    for N in range(6):
        query_seq_encoding = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
        query_value_attention_seq = keras.layers.Attention(use_scale=True)([query_seq_encoding, encoder_outputs])

#     outputs = keras.layers.TimeDistributed(
#         keras.layers.Dense(vocab_size, activation="softmax"))(Z)


    # Reduce over the sequence axis to produce encodings of shape
    # [batch_size, filters].
    query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
        query_seq_encoding)
    query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
        query_value_attention_seq)

    # Concatenate query and document encodings to produce a DNN input layer.
    # NOTE(review): input_layer is not used after this point.
    input_layer = tf.keras.layers.Concatenate()(
        [query_encoding, query_value_attention])
    

    #         LSTM(128, dropout = 0.2, recurrent_dropout = 0.5)

    #         LSTM(128,activation='tanh', recurrent_activation='sigmoid',
    #              use_bias=True,dropout=0.5,recurrent_dropout=0.0)

    model.add(Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.5)))
     
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    history = model.fit(X_train, y_train, validation_split = 0.2, epochs = 26, batch_size = 64)
    model.summary()
#     model.save('taskA.h5')
    # Report the metrics of the final epoch only.
    train_acc = history.history['accuracy'][-1]
    val_acc = history.history['val_accuracy'][-1]
    return train_acc, val_acc, history

In [None]:
# Run the experimental trainer; this rebinds acc_A_train, acc_A_val and history.
acc_A_train, acc_A_val, history = model_Train(X_train, Y_train)

In [None]:
    class PositionalEncoding(keras.layers.Layer):
        """Layer that adds a fixed sine/cosine position table to its input."""

        def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
            """Precompute the (1, max_steps, max_dims) table as a constant."""
            super().__init__(dtype=dtype, **kwargs)
            # The table needs an even width: sine and cosine columns alternate.
            if max_dims % 2 == 1:
                max_dims += 1
            steps, pair_idx = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
            angles = (steps / 10000**(2 * pair_idx / max_dims)).T
            table = np.empty((1, max_steps, max_dims))
            table[0, :, ::2] = np.sin(angles)    # even feature indices
            table[0, :, 1::2] = np.cos(angles)   # odd feature indices
            self.positional_embedding = tf.constant(table.astype(self.dtype))

        def call(self, inputs):
            """Add the table, sliced to the input's last two dimensions."""
            dims = tf.shape(inputs)
            sliced = self.positional_embedding[:, :dims[-2], :dims[-1]]
            return inputs + sliced
    
    # NOTE(review): every line of this cell is indented and it repeats the
    # graph-building statements of model_Train; run as a standalone cell the
    # leading indentation presumably raises IndentationError — confirm.
    # NOTE(review): `embedding_matrix` is deleted by an earlier cell, so it has
    # to be rebuilt before this code can run.
    embed_size = 100; max_steps = 500; vocab_size = len(vocab)+1

    encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

    # One Embedding layer shared by both inputs.
    embeddings = keras.layers.Embedding(vocab_size, embed_size,weights = [embedding_matrix])

    encoder_embeddings = embeddings(encoder_inputs)
    decoder_embeddings = embeddings(decoder_inputs)

    positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)

    encoder_in = positional_encoding(encoder_embeddings)
    decoder_in = positional_encoding(decoder_embeddings)
    
    # Encoder: six stacked self-attention layers, each fed the previous output.
    Z = encoder_in
    for N in range(6):
        Z = keras.layers.Attention(use_scale=True)([Z, Z])

    encoder_outputs = Z
    Z = decoder_in
    # NOTE(review): Z is not reassigned inside this loop, so all six iterations
    # attend over the same decoder_in and only the last pair of outputs is
    # used afterwards.
    for N in range(6):
        query_seq_encoding = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
        query_value_attention_seq = keras.layers.Attention(use_scale=True)([query_seq_encoding, encoder_outputs])

#     outputs = keras.layers.TimeDistributed(
#         keras.layers.Dense(vocab_size, activation="softmax"))(Z)


    # Reduce over the sequence axis to produce encodings of shape
    # [batch_size, filters].
    query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
        query_seq_encoding)
    query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
        query_value_attention_seq)

    # Concatenate query and document encodings to produce a DNN input layer.
    input_layer = tf.keras.layers.Concatenate()(
        [query_encoding, query_value_attention])

In [None]:
# Display the encoder Input tensor created in the previous cell.
encoder_inputs