In [1]:
# %load main.py
import math
import os
import re
import shutil
from collections import Counter

import ekphrasis
import joblib
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm

### Step1: Data Pre-processing

In [2]:
# Locate the project root (the parent of the current working directory) and
# the Task A training file that lives beneath it.
Path = os.path.split(os.getcwd())[0]
data_pathA = os.path.join(Path, 'Datasets/A/twitter-2016train-A.txt')

In [3]:
# Read the tab-separated tweet file (it has no header row) into a DataFrame
# and give its three columns readable names.
dataA = pd.read_table(data_pathA, header=None, sep='\t')
dataA.columns = ['ID', 'Sentiment', 'Text']
def add_label(sentiment):
    """Translate a sentiment name into its integer class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
    Any other value produces None.
    """
    for class_id, name in enumerate(('negative', 'neutral', 'positive')):
        if sentiment == name:
            return class_id
    return None

dataA['label'] = dataA.Sentiment.apply(add_label)
dataA

Unnamed: 0,ID,Sentiment,Text,label
0,628949369883000832,negative,dear @Microsoft the newOoffice for Mac is grea...,0
1,628976607420645377,negative,@Microsoft how about you make a system that do...,0
2,629023169169518592,negative,I may be ignorant on this issue but... should ...,0
3,629179223232479232,negative,"Thanks to @microsoft, I just may be switching ...",0
4,629186282179153920,neutral,If I make a game as a #windows10 Universal App...,1
...,...,...,...,...
5863,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLif...,2
5864,639979760735662080,neutral,everyone who sat around me at metlife was so a...,1
5865,640196838260363269,neutral,what giants or niners fans would wanna go to t...,1
5866,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs...,2


In [4]:
# sentiment distribution of data
dataA.loc[:,'label'].value_counts()

2    3017
1    2001
0     850
Name: label, dtype: int64

1. Case conversion
包含“India”和“india”的语料库如果不应用小写化，机器会把它们识别为两个独立的术语，而实际上它们都是同一个单词的不同形式，并且对应于同一个国家。小写化后，仅存在一种“India”实例，即“india”，简化了在语料库中找到所有提到印度时的任务。

In [5]:
#import ekphrasis library
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Tweet pre-processing pipeline built on ekphrasis: normalises special terms,
# annotates Twitter-specific constructs, segments hashtags and tokenises.
text_processor = TextPreProcessor(
    # Terms replaced by a placeholder tag such as <url>, <user> or <number>.
    # Each term is listed once ('url' used to appear twice, which only
    # repeated the same substitution pass).
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # Terms wrapped in / followed by an annotation tag such as <hashtag>,
    # <allcaps> or <repeated>.
    annotate={"hashtag", "allcaps", "elongated", "repeated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # Corpus whose word statistics drive word segmentation.
    segmenter="twitter",

    # Corpus whose word statistics drive spell correction.
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # unpack contractions (can't -> can not)
    spell_correct_elong=False,  # no spell correction for elongated words

    # Tokenizer callable: takes a string and returns a list of tokens.
    # SocialTokenizer is configured here to lowercase its output.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # Dictionaries used to replace tokens extracted from the text with other
    # expressions; more than one dictionary may be passed.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [6]:
    # Make sure the NLTK English stop-word corpus is available locally, then
    # keep the stop words in a set so the `word not in stop` filters used by
    # the tokenisation functions are cheap membership tests.
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HCY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def Tokenize(Texts):
    token=[]
    for Text in Texts:
        words = [sentence for sentence in text_processor.pre_process_doc(Text) if (sentence!='s' and sentence!='\'')]
        words = [word for word in words if (word not in stop)]
        token.append(words)
    words=[word for words in token for word in words]
    
    print("All words: {}".format(len(words)))
    # Create Counter
    counts = Counter(words)
    print("Unique words: {}".format(len(counts)))

    Most_common= counts.most_common()[:30]
    print("Top 30 most common words: {}".format(Most_common))
    
    vocab = {word: num for num, word in enumerate(counts, 1)}
    id2vocab = {v: k for k, v in vocab.items()}
    return token,vocab

In [8]:
token,vocab=Tokenize(dataA.Text)

All words: 96351
Unique words: 11133
Top 30 most common words: [('.', 5384), (',', 2844), ('<user>', 2245), ('<url>', 2206), ('<number>', 1556), ('<hashtag>', 1482), ('</hashtag>', 1482), ('<allcaps>', 1395), ('</allcaps>', 1395), ('<repeated>', 1381), ('!', 1313), ('-', 1094), ('may', 1087), ('tomorrow', 897), (':', 888), ('?', 859), ('"', 634), ('th', 600), ('day', 572), ('1', 565), ('<date>', 526), ('going', 406), ('st', 383), ('apple', 381), ('&', 370), ('2', 369), ('see', 359), ('like', 347), ('friday', 344), ('amazon', 342)]


### Step2: Word2Vec Pretraining

In [9]:
from gensim.models import Word2Vec, KeyedVectors
from nltk import word_tokenize
import multiprocessing

In [10]:
word2vec_model=Word2Vec(token,window=5, min_count=1,workers = multiprocessing.cpu_count())

In [11]:
word2vec_model.train(token, total_examples = len(token), epochs = 100)

(7432904, 9635100)

In [12]:
print('This is summary of Word2Vec: {}'.format(word2vec_model))

This is summary of Word2Vec: Word2Vec(vocab=11133, vector_size=100, alpha=0.025)


In [13]:
index=word2vec_model.wv.key_to_index

In [14]:
word2vec_model.wv.save_word2vec_format('Word2Vec.vector')

In [15]:
word_path=os.path.join(Path,'Datasets/datastories.twitter.100d.txt')

In [16]:
# Copy the trained Word2Vec vectors into two structures:
#   embed_matrix - dense array with one 100-d row per word, rows ordered by
#                  the model's key_to_index mapping held in `index`
#   embed_dict   - word -> vector lookup
embed_matrix = np.zeros((len(index), 100))
embed_dict = {}
for word, i in index.items():
    if word not in word2vec_model.wv:
        continue
    embed_matrix[i] = word2vec_model.wv[word]
    embed_dict[word] = word2vec_model.wv[word]

In [17]:
# Load the space-separated pretrained embedding file (no header row; the first
# column becomes the token index in the following cell).
#   quoting=3 (csv.QUOTE_NONE): a token such as '"' must be read literally,
#       otherwise the parser treats it as the start of a quoted field and
#       swallows or misaligns the lines that follow.
#   keep_default_na=False: tokens such as "null", "nan" or "NA" stay strings
#       instead of being converted to missing values.
word = pd.read_table(word_path, sep=' ', header=None, quoting=3, keep_default_na=False)

In [18]:
word.set_index(0,inplace=True)

In [19]:
# Token -> embedding row lookup built from the indexed embedding table.
# Each value is the pandas Series holding that token's row.
Embed_dict = {word.index[row]: word.iloc[row, :] for row in range(word.shape[0])}

### Step3: Training

In [21]:
from torch.nn import LSTM
import torch.nn as nn
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


import seaborn as sns
import matplotlib.pyplot as plt
# from transformers import AdamW as AdamW_HF#, get_linear_schedule_with_warmup


In [22]:
class Transformer(nn.Module):
    """Transformer-encoder text classifier.

    Token ids are embedded, combined with positional encodings, projected to
    the encoder width when needed, passed through a ``nn.TransformerEncoder``
    and classified from the output at the first sequence position.

    Args:
        vocab_size: number of rows of the embedding table (used when
            ``weight`` is None).
        embedding_dim: size of each word vector.
        hidden_dim: model width (``d_model``) of the encoder layers.
        num_class: number of output classes.
        weight: optional 2-D array/tensor of pretrained embeddings with
            ``embedding_dim`` columns. When given it initialises the
            embedding table, which stays trainable. When None the table is
            randomly initialised.
        dim_feedforward, num_head, num_layers, dropout, activation: forwarded
            to ``nn.TransformerEncoderLayer`` / ``nn.TransformerEncoder``.
        max_len: maximum sequence length handled by the positional encoding.

    Raises:
        ValueError: if ``weight`` is not 2-D with ``embedding_dim`` columns.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, weight, dim_feedforward=512,
                 num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"):
        super(Transformer, self).__init__()
        self.embedding_dim = embedding_dim

        # Word-embedding layer. Previously the pretrained weights were loaded
        # into a throw-away local and the model always used a random table.
        if weight is None:
            self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        else:
            weight = torch.as_tensor(weight, dtype=torch.float)
            if weight.dim() != 2 or weight.size(1) != embedding_dim:
                raise ValueError(
                    "weight must be 2-D with {} columns, got shape {}".format(
                        embedding_dim, tuple(weight.shape)))
            # freeze=False keeps the table trainable, as nn.Embedding was.
            self.embeddings = nn.Embedding.from_pretrained(weight, freeze=False)

        # Positional-encoding layer.
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)

        # The encoder works at width hidden_dim; bridge the gap when the
        # embeddings have a different width (no-op when they match).
        if embedding_dim != hidden_dim:
            self.input_projection = nn.Linear(embedding_dim, hidden_dim)
        else:
            self.input_projection = nn.Identity()

        # Encoder: a stack of TransformerEncoder layers.
        encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # Output (classification) layer.
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        """Return per-class log-probabilities for a batch of token-id sequences.

        Args:
            inputs: integer tensor whose first dimension is the batch and
                second dimension the (padded) sequence.
            lengths: 1-D integer tensor with the unpadded length of each
                sequence in the batch.

        Returns:
            Tensor of shape (batch, num_class) holding log-softmax scores.
        """
        # The batch comes first in the input; the encoder expects the
        # sequence dimension first and the batch dimension second.
        inputs = torch.transpose(inputs, 0, 1)
        hidden_states = self.embeddings(inputs)
        hidden_states = self.position_embedding(hidden_states)
        hidden_states = self.input_projection(hidden_states)
        # Padding mask (True = ignore) built from the sequence lengths. It is
        # sized to the padded width of the batch so it always matches the
        # input, even when no sequence reaches that width.
        attention_mask = ~self.length_to_mask(lengths, max_len=inputs.size(0))
        hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask)
        # The output at the first token feeds the classification layer.
        hidden_states = hidden_states[0, :, :]
        output = self.output(hidden_states)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

    @staticmethod
    def length_to_mask(lengths, max_len=None):
        """Build a boolean mask that is True at valid (non-padding) positions.

        Args:
            lengths: 1-D integer tensor of sequence lengths.
            max_len: width of the mask; defaults to the largest length.

        Returns:
            Boolean tensor of shape (len(lengths), max_len).
        """
        if max_len is None:
            max_len = int(torch.max(lengths))
        positions = torch.arange(max_len, device=lengths.device)
        mask = positions.expand(lengths.shape[0], max_len) < lengths.unsqueeze(1)
        return mask
    
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Args:
        d_model: size of each embedding vector.
        dropout: accepted so existing callers keep working; this module does
            not apply dropout.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=512):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even feature indices
        # Odd feature indices. For an odd d_model there is one fewer odd
        # column than even columns, so trim the frequencies to fit.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)  # -> (max_len, 1, d_model)
        # A buffer, not a parameter: no gradient is computed for it.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the positional encodings to ``x``.

        Args:
            x: tensor of shape (seq_len, batch, d_model).

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: if seq_len exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                "sequence length {} exceeds max_len {}".format(seq_len, self.pe.size(0)))
        # Broadcast the (seq_len, 1, d_model) encodings over the batch.
        return x + self.pe[:seq_len, :]

In [None]:
# Hyper-parameters for the Transformer classifier.
vocab_size = len(vocab) + 1   # vocab ids start at 1, so row 0 is never a word
embedding_dim = 100
hidden_dim = 256
num_class = 3                 # labels are 0 (negative), 1 (neutral), 2 (positive)

# Initial embedding table: the Word2Vec vector of each word, stored at the
# row given by its vocab id; rows without a vector stay zero.
# NOTE(review): the assignment was left empty in the original cell. The
# Word2Vec vectors in `embed_dict` are used here; swap in `Embed_dict` if the
# pretrained datastories vectors were the intended source.
weight = torch.zeros(vocab_size, embedding_dim)
for vocab_word, vocab_id in vocab.items():
    if vocab_word in embed_dict:
        weight[vocab_id] = torch.as_tensor(embed_dict[vocab_word], dtype=torch.float)

Transformer(vocab_size, embedding_dim, hidden_dim, num_class, weight)

In [None]:
class BiLSTM_Attention(nn.Module):
    """Bidirectional LSTM with additive attention pooling and a 3-way output.

    Args:
        vocab_size: number of rows of the embedding table (used when
            ``weight`` is None).
        embedding_dim: size of each word vector.
        hidden_dim: hidden size of the LSTM in each direction.
        n_layers: number of stacked LSTM layers.
        weight: optional 2-D array/tensor of pretrained embeddings. When given
            it initialises the embedding table, which stays trainable. When
            None the table is randomly initialised.
    """

    # Standard deviation of the Gaussian noise added to the LSTM outputs
    # while training.
    NOISE_STD = 0.2 ** 0.5

    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers, weight):

        super(BiLSTM_Attention, self).__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        # Previously the pretrained weights were loaded into a throw-away
        # local and the model always used a randomly initialised table.
        if weight is None:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        else:
            self.embedding = nn.Embedding.from_pretrained(
                torch.as_tensor(weight, dtype=torch.float), freeze=False)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=True, dropout=0.5)
        self.fc = nn.Linear(hidden_dim * 2, 3)
        self.dropout = nn.Dropout(0.5)

        # Attention parameters: projection matrix and context vector.
        self.w_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, hidden_dim * 2))
        self.u_omega = nn.Parameter(torch.Tensor(hidden_dim * 2, 1))

        nn.init.uniform_(self.w_omega, -0.1, 0.1)
        nn.init.uniform_(self.u_omega, -0.1, 0.1)

    def attention_net(self, x):
        """Pool a sequence into one vector using learned attention weights.

        Args:
            x: tensor of shape [batch, seq_len, hidden_dim*2].

        Returns:
            Tensor of shape [batch, hidden_dim*2].
        """
        u = torch.tanh(torch.matmul(x, self.w_omega))         # [batch, seq_len, hidden_dim*2]
        att = torch.matmul(u, self.u_omega)                   # [batch, seq_len, 1]
        att_score = F.softmax(att, dim=1)                     # normalise over seq_len

        scored_x = x * att_score                              # [batch, seq_len, hidden_dim*2]

        context = torch.sum(scored_x, dim=1)                  # [batch, hidden_dim*2]
        return context

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of shape [seq_len, batch].

        Returns:
            Tensor of shape [batch, 3] with unnormalised class scores.
        """
        embedding = self.dropout(self.embedding(x))       # [seq_len, batch, embedding_dim]

        # output: [seq_len, batch, hidden_dim*2]
        output, _ = self.rnn(embedding)
        output = output.permute(1, 0, 2)                  # [batch, seq_len, hidden_dim*2]

        # Noise is a training-time regulariser only; adding it in eval mode
        # made predictions non-deterministic. randn_like keeps the noise on
        # the same device and dtype as the activations.
        if self.training:
            output = output + self.NOISE_STD * torch.randn_like(output)

        attn_output = self.attention_net(output)
        logit = self.fc(attn_output)
        return logit
    
    
# Define LSTM Tokenizer
def tokenizer_lstm(X, vocab, seq_len):
    """Convert raw texts into a left-padded matrix of vocabulary ids.

    Each text is tokenised with the module-level ``text_processor``; the
    tokens ``'s'`` and ``'`` and words in the module-level ``stop`` set are
    removed. A token is kept only when it is present in both ``vocab`` and
    the module-level ``Embed_dict``.

    Args:
        X: sized iterable of raw text strings.
        vocab: mapping from token to integer id.
        seq_len: number of columns of the result. Longer sequences keep their
            first ``seq_len`` ids; shorter ones are padded with 0 on the left.

    Returns:
        ``numpy.ndarray`` of shape (len(X), seq_len) and dtype int64.
    """
    X_tmp = np.zeros((len(X), seq_len), dtype=np.int64)
    for i, text in enumerate(X):
        tokens = [
            word for word in text_processor.pre_process_doc(text)
            if word != 's' and word != '\'' and word not in stop
        ]
        # Require membership in vocab as well: a word that has a pretrained
        # vector but never occurred in the vocabulary used to raise KeyError.
        token_ids = [vocab[word] for word in tokens if word in vocab and word in Embed_dict]
        token_ids = token_ids[:seq_len]
        if token_ids:
            X_tmp[i, seq_len - len(token_ids):] = token_ids

    return X_tmp

In [None]:
# def tokenizer_lstm(X, vocab, seq_len, padding):
#     '''
#     Returns tokenized tensor with left/right padding at the specified sequence length
#     '''
#     X_tmp = np.zeros((len(X), seq_len), dtype=np.int64)
#     for i, text in enumerate(X):
#         tokens = tokenize_text(text, 3) 
#         token_ids = [vocab[word] for word in tokens if word in word2idx.keys()]
#         end_idx = min(len(token_ids), seq_len)
#         if padding == 'right':
#             X_tmp[i,:end_idx] = token_ids[:end_idx]
#         elif padding == 'left':
#             start_idx = max(seq_len - len(token_ids), 0)
#             X_tmp[i,start_idx:] = token_ids[:end_idx]

#     return torch.tensor(X_tmp, dtype=torch.int64)

In [None]:
X=tokenizer_lstm(dataA.Text, vocab, 100)###

In [None]:
# One-hot encode the integer class labels (3 classes) as float32 rows.
# Done with NumPy: row k of the identity matrix is the one-hot vector of
# class k, and `tf` is not imported anywhere in this notebook.
Y = np.eye(3, dtype=np.float32)[dataA.label.to_numpy()]

In [None]:
Y= np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size=0.1, random_state=1000) 

In [None]:
def build_embedding_layer(vocab, embeddings_index, embedding_dim=100, input_length=100):
    """Build an embedding matrix and a frozen embedding layer from it.

    :param vocab: mapping from word to integer id; row ``id`` of the matrix
        receives that word's vector, and row 0 is left as zeros
    :param embeddings_index: mapping from word to its embedding vector
    :param embedding_dim: length of every embedding vector
    :param input_length: sequence length passed to the embedding layer
    :return: tuple ``(embedding_layer, embedding_matrix)``
    """
    vocab_size = len(vocab) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Words without their own vector fall back to the '<unk>' vector when the
    # index provides one. The old try/except around .get() could never reach
    # that fallback because .get() returns None instead of raising.
    unknown_vector = embeddings_index.get('<unk>')
    for word, i in vocab.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = unknown_vector
        if embedding_vector is not None:
            # Row index must match the word's vocabulary id.
            embedding_matrix[i] = embedding_vector

    # Non-trainable layer initialised with the matrix built above.
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = embedding_dim, weights = [embedding_matrix], input_length = input_length, trainable=False)
    return embedding_layer,embedding_matrix

In [None]:
embedding_layer,embedding_matrix=build_embedding_layer(vocab, Embed_dict) ###

In [None]:
def model_train(X_train, y_train, embedding_layer):
        """
        Build, train and save a two-layer bidirectional LSTM classifier.

        The model is: embedding_layer -> SpatialDropout1D(0.2) -> BiLSTM(128,
        returning sequences) -> BiLSTM(128) -> Dense(3, softmax). It is fitted
        for 26 epochs with batch size 256, holding out 20% of the training
        data for validation, and then written to 'taskA.h5'.

        :param X_train: training inputs passed to ``model.fit``
        :param y_train: training targets passed to ``model.fit``; the
            categorical cross-entropy loss and Dense(3) output imply 3-column
            one-hot rows
        :param embedding_layer: first layer added to the model
        :return: tuple ``(train_acc, val_acc, history)`` - the last-epoch
            training accuracy, last-epoch validation accuracy and the object
            returned by ``model.fit``
        """
        # NOTE(review): tf, Sequential, SpatialDropout1D, Bidirectional and
        # Dense are not imported anywhere in the visible notebook, and the
        # only visible import of LSTM is `from torch.nn import LSTM`, which
        # does not accept these Keras-style arguments - confirm the intended
        # Keras imports.
        # Logs the device each op is placed on (verbose debugging aid).
        tf.debugging.set_log_device_placement(True)
        model = Sequential()
        model.add(embedding_layer)
        model.add(SpatialDropout1D(0.2))
        
#         LSTM(128, dropout = 0.2, recurrent_dropout = 0.5)

#         LSTM(128,activation='tanh', recurrent_activation='sigmoid',
#              use_bias=True,dropout=0.5,recurrent_dropout=0.0)
    
        # First recurrent layer returns the full sequence so that the second
        # recurrent layer receives one vector per time step.
        model.add(Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.5,return_sequences=True)))
        model.add(Bidirectional(LSTM(128,dropout = 0.2,recurrent_dropout = 0.5)))
        
        # Three output units, one per sentiment class.
        model.add(Dense(3, activation = 'softmax'))
        model.summary()
        model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
        history = model.fit(X_train, y_train, validation_split = 0.2, epochs = 26, batch_size = 256)
        model.save('taskA.h5')
        # Report the metrics of the final epoch only.
        train_acc = history.history['accuracy'][-1]
        val_acc = history.history['val_accuracy'][-1]
        return train_acc, val_acc, history

In [None]:
acc_A_train, acc_A_val, history = model_train(X_train, Y_train, embedding_layer)

In [None]:
def model_Train(X_train, y_train):
    """Experimental attention-based variant of the training routine.

    Builds (a) a Keras ``Sequential`` model and (b) a separate functional
    graph of embeddings, positional encodings and attention layers, then
    compiles and fits the ``Sequential`` model for 26 epochs with batch size
    64 and a 20% validation split.

    Reads the module-level names ``vocab`` and ``embedding_matrix``.

    NOTE(review): tf, keras, Sequential, SpatialDropout1D and Bidirectional
    are not imported anywhere in the visible notebook, and the only visible
    import of LSTM is ``from torch.nn import LSTM`` - confirm the intended
    imports.
    NOTE(review): the functional graph (encoder/decoder inputs through
    ``input_layer``) is never connected to ``model``; the model that is
    trained contains only SpatialDropout1D and one bidirectional LSTM, with
    no embedding and no output layer. This looks unfinished - confirm intent.

    :param X_train: training inputs passed to ``model.fit``
    :param y_train: training targets passed to ``model.fit``
    :return: tuple ``(train_acc, val_acc, history)`` - last-epoch training
        accuracy, last-epoch validation accuracy and the ``model.fit`` result
    """
    
    
    model = Sequential()
    # Local Keras layer; inside this function it shadows any module-level
    # class of the same name.
    class PositionalEncoding(keras.layers.Layer):
        # Precomputes a (1, max_steps, max_dims) table of sinusoidal values
        # and adds the matching slice to its input.
        def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
            super().__init__(dtype=dtype, **kwargs)
            if max_dims % 2 == 1: max_dims += 1 # max_dims must be even
            # p: position grid, i: frequency-index grid (one per sin/cos pair)
            p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
            pos_emb = np.empty((1, max_steps, max_dims))
            pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T
            pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T
            self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))
        def call(self, inputs):
            shape = tf.shape(inputs)
            # Slice the table to the input's last two dimensions before adding.
            return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]


    # First layer added to the otherwise empty Sequential model.
    model.add(SpatialDropout1D(0.2))   
    
    embed_size = 100; max_steps = 500; vocab_size = len(vocab)+1

    encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)

    # One embedding layer, initialised from the global embedding_matrix, is
    # shared by the encoder and decoder inputs.
    embeddings = keras.layers.Embedding(vocab_size, embed_size,weights = [embedding_matrix])

    encoder_embeddings = embeddings(encoder_inputs)
    decoder_embeddings = embeddings(decoder_inputs)

    positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)

    encoder_in = positional_encoding(encoder_embeddings)
    decoder_in = positional_encoding(decoder_embeddings)
    
    # Encoder: six stacked self-attention layers, each fed the previous output.
    Z = encoder_in
    for N in range(6):
        Z = keras.layers.Attention(use_scale=True)([Z, Z])

    encoder_outputs = Z
    Z = decoder_in
    # NOTE(review): Z is not reassigned inside this loop, so every iteration
    # starts again from decoder_in and only the tensors from the last
    # iteration are used below - confirm whether stacking was intended.
    for N in range(6):
        query_seq_encoding = keras.layers.Attention(use_scale=True, causal=True)([Z, Z])
        query_value_attention_seq = keras.layers.Attention(use_scale=True)([query_seq_encoding, encoder_outputs])

#     outputs = keras.layers.TimeDistributed(
#         keras.layers.Dense(vocab_size, activation="softmax"))(Z)


    # Reduce over the sequence axis to produce encodings of shape
    # [batch_size, filters].
    query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
        query_seq_encoding)
    query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
        query_value_attention_seq)

    # Concatenate query and document encodings to produce a DNN input layer.
    # (input_layer is not used by any later statement in this function.)
    input_layer = tf.keras.layers.Concatenate()(
        [query_encoding, query_value_attention])
    

    #         LSTM(128, dropout = 0.2, recurrent_dropout = 0.5)

    #         LSTM(128,activation='tanh', recurrent_activation='sigmoid',
    #              use_bias=True,dropout=0.5,recurrent_dropout=0.0)

    model.add(Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.5)))
     
    model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    history = model.fit(X_train, y_train, validation_split = 0.2, epochs = 26, batch_size = 64)
    # The summary is printed after fitting.
    model.summary()
#     model.save('taskA.h5')
    # Report the metrics of the final epoch only.
    train_acc = history.history['accuracy'][-1]
    val_acc = history.history['val_accuracy'][-1]
    return train_acc, val_acc, history

In [None]:
acc_A_train, acc_A_val, history = model_Train(X_train, Y_train)