In [1]:
# %load main.py
import os
import re
import shutil

import numpy  as np
import pandas as pd
import joblib
import nltk
import ekphrasis
from collections import Counter

### Step1: Data Pre-processing

In [2]:
# read data
Path =os.path.dirname(os.getcwd())
data_pathA=os.path.join(Path,'Datasets/A/twitter-2016train-A.txt')

In [3]:
# transform data into df form
dataA = pd.read_table(data_pathA,sep='\t',header=0)
dataA.columns = ['ID','Sentiment','Text']
def add_label(sentiment):
    if sentiment == 'negative':
        return 0
    elif sentiment == 'neutral':
        return 1
    elif sentiment == 'positive':
        return 2

dataA['label'] = dataA.Sentiment.apply(add_label)
dataA

Unnamed: 0,ID,Sentiment,Text,label
0,628976607420645377,negative,@Microsoft how about you make a system that do...,0
1,629023169169518592,negative,I may be ignorant on this issue but... should ...,0
2,629179223232479232,negative,"Thanks to @microsoft, I just may be switching ...",0
3,629186282179153920,neutral,If I make a game as a #windows10 Universal App...,1
4,629226490152914944,positive,"Microsoft, I may not prefer your gaming branch...",2
...,...,...,...,...
5862,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLif...,2
5863,639979760735662080,neutral,everyone who sat around me at metlife was so a...,1
5864,640196838260363269,neutral,what giants or niners fans would wanna go to t...,1
5865,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs...,2


In [4]:
# sentiment distribution of data
dataA.loc[:,'label'].value_counts()

2    3017
1    2001
0     849
Name: label, dtype: int64

1Case conversion
包含“India”和“india”的语料库如果不应用小写化，机器会把它们识别为两个独立的术语，而实际上它们都是同一个单词的不同形式，并且对应于同一个国家。小写化后，仅存在一种“India”实例，即“india”，简化了在语料库中找到所有提到印度时的任务。

In [5]:
#import ekphrasis library
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [6]:
def Tokenize(Texts):
    token=[]
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop = set(stopwords.words('english'))
    
    for Text in Texts:
        words = [sentence for sentence in text_processor.pre_process_doc(Text) if (sentence!='s' and sentence!='\'')]
        words = [word for word in words if (word not in stop)]
        token.append(words)
    words=[word for words in token for word in words]
    
    print("All words: {}".format(len(words)))
    # Create Counter
    counts = Counter(words)
    print("Unique words: {}".format(len(counts)))

    Most_common= counts.most_common()[:30]
    print("Top 30 most common words: {}".format(Most_common))
    
    vocab = {word: num for num, word in enumerate(counts, 1)}
    id2vocab = {v: k for k, v in vocab.items()}
    return token,vocab

In [7]:
token,vocab=Tokenize(dataA.Text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HCY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All words: 96339
Unique words: 11131
Top 30 most common words: [('.', 5383), (',', 2843), ('<user>', 2244), ('<url>', 2206), ('<number>', 1556), ('<hashtag>', 1482), ('</hashtag>', 1482), ('<allcaps>', 1395), ('</allcaps>', 1395), ('<repeated>', 1381), ('!', 1313), ('-', 1094), ('may', 1087), ('tomorrow', 897), (':', 888), ('?', 858), ('"', 634), ('th', 600), ('day', 572), ('1', 565), ('<date>', 526), ('going', 406), ('st', 383), ('apple', 381), ('&', 370), ('2', 369), ('see', 359), ('like', 347), ('friday', 344), ('amazon', 342)]


### Step2: Word2Vec Pretraining

In [8]:
from gensim.models import Word2Vec, KeyedVectors
from nltk import word_tokenize
import multiprocessing
import tensorboard 

In [9]:
word2vec_model=Word2Vec(token,window=5, min_count=1,workers = multiprocessing.cpu_count())

In [10]:
word2vec_model.train(token, total_examples = len(token), epochs = 100)

(7432664, 9633900)

In [11]:
print('This is summary of Word2Vec: {}'.format(word2vec_model))

This is summary of Word2Vec: Word2Vec(vocab=11131, vector_size=100, alpha=0.025)


In [12]:
index=word2vec_model.wv.key_to_index

In [13]:
word2vec_model.wv.save_word2vec_format('Word2Vec.vector')

### Step3: Training

In [14]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D, Bidirectional

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from IPython import display
import transformers
import tensorflow_hub as hub
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
display.set_matplotlib_formats('svg')

  from .autonotebook import tqdm as notebook_tqdm


Version:  2.6.0
Eager mode:  True
GPU is available


  display.set_matplotlib_formats('svg')


In [15]:
embed_matrix = np.zeros((len(index), 100))
embed_dict={}
for word, i in index.items():
    if word in word2vec_model.wv:
        embed_matrix[i] = word2vec_model.wv[word]
        embed_dict[word] = word2vec_model.wv[word]

In [16]:
# Vectorize texts
def tokenize(tweet):
    """
    Vectorize texts
    :param df_tweet: The tweet text df['tweet']
    :return: Tweet texts after vectorizing, vocabulary size
    """
    tok = Tokenizer()
    # Create vocabulary index based on word frequency
    tok.fit_on_texts(tweet)
    # Convert each text to a sequence of integers
    X = pad_sequences(tok.texts_to_sequences(tweet), maxlen=100)
    # Vocabulary size
    vocab_size = len(tok.word_index) + 1
    return X, vocab_size, tok

In [17]:
X, vocab_size, tok = tokenize(dataA.Text)

In [18]:
token,vocab=Tokenize(dataA.Text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HCY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


All words: 96339
Unique words: 11131
Top 30 most common words: [('.', 5383), (',', 2843), ('<user>', 2244), ('<url>', 2206), ('<number>', 1556), ('<hashtag>', 1482), ('</hashtag>', 1482), ('<allcaps>', 1395), ('</allcaps>', 1395), ('<repeated>', 1381), ('!', 1313), ('-', 1094), ('may', 1087), ('tomorrow', 897), (':', 888), ('?', 858), ('"', 634), ('th', 600), ('day', 572), ('1', 565), ('<date>', 526), ('going', 406), ('st', 383), ('apple', 381), ('&', 370), ('2', 369), ('see', 359), ('like', 347), ('friday', 344), ('amazon', 342)]


In [19]:
def tokenizer_lstm(X, vocab, seq_len, padding):
    '''
    Returns tokenized tensor with left/right padding at the specified sequence length
    '''
    X_tmp = np.zeros((len(X), seq_len), dtype=np.int64)
    for i, text in enumerate(X):
        tokens = [word for word in text_processor.pre_process_doc(text) if (word!='s' and word!='\'')]
        token_ids = [vocab[word] for word in tokens if word in embed_dict.keys()]
        end_idx = min(len(token_ids), seq_len)
        if padding == 'right':
            X_tmp[i,:end_idx] = token_ids[:end_idx]
        elif padding == 'left':
            start_idx = max(seq_len - len(token_ids), 0)
            X_tmp[i,start_idx:] = token_ids[:end_idx]

    return X_tmp

In [20]:
X=tokenizer_lstm(dataA.Text, vocab, 100, 'left')

In [21]:
vocab_size=len(vocab)+1

In [22]:
Y = tf.one_hot(dataA.label, depth=3)

In [23]:
Y= np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size=0.1, random_state=1000) 

In [24]:
def build_embedding_layer(vocab_size, tok, embeddings_index):
    """
    Build embedding matrix and embedding layer
    :param vocab_size: vocabulary size
    :param tok: tokenizer
    :param embeddings_index: embedding index
    :return: embedding matrix and embedding layer
    """
    #Build embedding matrix
    embedding_matrix = np.zeros((vocab_size, 100))
    for word, i in tok.word_index.items():
        try:
            # Vector corresponds to word
            embedding_vector = embeddings_index.get(word)
        except:
            embedding_vector = embeddings_index['unknown']
        if embedding_vector is not None:
            # Ensure vector of embedding_matrix row matches word index
            embedding_matrix[i] = embedding_vector
    # Build embedding layer
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = 100, weights = [embedding_matrix], input_length = 100, trainable=False)
    return embedding_layer

In [25]:
def build_embedding_layer(vocab_size, tok, embeddings_index):
    """
    Build embedding matrix and embedding layer
    :param vocab_size: vocabulary size
    :param tok: tokenizer
    :param embeddings_index: embedding index
    :return: embedding matrix and embedding layer
    """
    #Build embedding matrix
    embedding_matrix = np.zeros((vocab_size, 100))
    for word, i in vocab.items(): #tok.word_index.items():
        try:
            # Vector corresponds to word
            embedding_vector = embeddings_index.get(word)
        except:
            embedding_vector = embeddings_index['unknown']
        if embedding_vector is not None:
            # Ensure vector of embedding_matrix row matches word index
            embedding_matrix[i] = embedding_vector
    # Build embedding layer
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = 100, weights = [embedding_matrix], input_length = 100, trainable=False)
    return embedding_layer

In [26]:
#Build embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tok.word_index.items():
    try:
        # Vector corresponds to word
        embedding_vector = embed_dict.get(word)
    except:
        embedding_vector = embed_dict['unknown']
    if embedding_vector is not None:
        # Ensure vector of embedding_matrix row matches word index
        embedding_matrix[i] = embedding_vector
# Build embedding layer

IndexError: index 11132 is out of bounds for axis 0 with size 11132

In [None]:
vocab_size

In [None]:
embedding_layer=build_embedding_layer(vocab_size, tok, embed_dict)

In [None]:
def model_train(X_train, y_train, embedding_layer):
        """
        Train, validate and test BiLSTM model, calculate accuracy of training and validation set
        :param X_train: tweet train data
        :param y_train: sentiment label train data
        :param embedding_layer: embedding layer
        :param X_test: tweet test data
        :param y_test: sentiment label test data
        :return: accuracy, recall, precision, F1 score and history
        """
        model = Sequential()
        model.add(embedding_layer)
        model.add(SpatialDropout1D(0.2))
        
#         model.add(Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.5)))
        forward_layer = LSTM(10, return_sequences=True,dropout = 0.2, recurrent_dropout = 0.5)
        backward_layer = LSTM(10, activation='relu', return_sequences=True,dropout = 0.2, recurrent_dropout = 0.5,
                           go_backwards=True)
        model.add(Bidirectional(forward_layer, backward_layer=backward_layer))
        
        model.add(Dense(3, activation = 'softmax'))
        model.summary()
        model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
        
        history = model.fit(X_train, y_train, validation_split = 0.2, epochs = 26, batch_size = 256)
        model.save('taskA.h5')
        train_acc = history.history['accuracy'][-1]
        val_acc = history.history['val_accuracy'][-1]
        return train_acc, val_acc, history

In [None]:
def model_train(X_train, y_train, embedding_layer):
        """
        Train, validate and test BiLSTM model, calculate accuracy of training and validation set
        :param X_train: tweet train data
        :param y_train: sentiment label train data
        :param embedding_layer: embedding layer
        :param X_test: tweet test data
        :param y_test: sentiment label test data
        :return: accuracy, recall, precision, F1 score and history
        """
        model = Sequential()
        model.add(embedding_layer)
        model.add(SpatialDropout1D(0.2))
        
        model.add(Bidirectional(LSTM(128, dropout = 0.5, recurrent_dropout = 0.0,activation='tanh',recurrent_activation='sigmoid')))
#         forward_layer = LSTM(10, return_sequences=True)
#         backward_layer = LSTM(10, activation='relu', return_sequences=True,
#                            go_backwards=True)
#         model.add(Bidirectional(forward_layer, backward_layer=backward_layer,input_shape=(5, 10)))
        
        model.add(Dense(3, activation = 'softmax'))
        model.summary()
        model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
        
        history = model.fit(X_train, y_train, validation_split = 0.2, epochs = 26, batch_size = 256)
        model.save('taskA.h5')
        train_acc = history.history['accuracy'][-1]
        val_acc = history.history['val_accuracy'][-1]
        return train_acc, val_acc, history

In [None]:
# model = Sequential()
# model.add(Bidirectional(LSTM(10, return_sequences=True), input_shape=(5, 10)))
# model.add(Bidirectional(LSTM(10)))
# model.add(Dense(5))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# # With custom backward layer
# model = Sequential()
# forward_layer = LSTM(10, return_sequences=True)
# backward_layer = LSTM(10, activation='relu', return_sequences=True,
#                    go_backwards=True)
# model.add(Bidirectional(forward_layer, backward_layer=backward_layer,
#                      input_shape=(5, 10)))
# model.add(Dense(5))
# model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
acc_A_train, acc_A_val, history = model_train(X_train, Y_train, embedding_layer)

In [None]:
from transformers import AdamW as AdamW_HF, get_linear_schedule_with_warmup