In [1]:
# %load main.py
import os
import re
import shutil
from tqdm import tqdm

import numpy  as np
import pandas as pd
import joblib
import nltk
import ekphrasis
from collections import Counter

### Step1: Data Pre-processing

In [2]:
# read data
# The dataset lives under <parent of the working directory>/Datasets/B.
# Each path component is passed separately so os.path.join uses the platform's
# separator throughout instead of mixing it with hard-coded "/" characters.
Path = os.path.dirname(os.getcwd())
data_pathB = os.path.join(Path, 'Datasets', 'B', 'twitter-2016train-BD.txt')

In [3]:
data_pathB

'D:\\UCL-ELEC0135\\Assignments\\Datasets/B/twitter-2016train-BD.txt'

In [4]:
# transform data into df form
# The file is read as tab-separated text without a header row, so the column
# names are assigned by hand right after loading.
# NOTE(review): assigning five names assumes the file has exactly five columns;
# the 'label' column named here is overwritten from 'Sentiment' further down.
dataB = pd.read_table(data_pathB,sep='\t',header=None)
dataB.columns = ['ID','Topic','Sentiment','Text','label']
def add_label(sentiment):
    """Map a sentiment string to its integer class id.

    'negative' maps to 0 and 'positive' maps to 1; any other value
    yields None.
    """
    if sentiment == 'negative':
        return 0
    return 1 if sentiment == 'positive' else None

# Replace the 'label' column with the integer class id derived from the
# 'Sentiment' column via add_label, then display the resulting frame.
dataB['label'] = dataB.Sentiment.apply(add_label)
dataB

Unnamed: 0,ID,Topic,Sentiment,Text,label
0,681563394940473347,amy schumer,negative,@MargaretsBelly Amy Schumer is the stereotypic...,0
1,675847244747177984,amy schumer,negative,@dani_pitter I mean I get the hype around JLaw...,0
2,672827854279843840,amy schumer,negative,Amy Schumer at the #GQmenoftheyear2015 party i...,0
3,662755012129529858,amy schumer,negative,Amy Schumer is on Sky Atlantic doing one of th...,0
4,671502639671042048,amy schumer,negative,"Amy Schumer may have brought us Trainwreck, bu...",0
...,...,...,...,...,...
10546,638032969383309312,zayn,positive,tomorrow I've to wake up early so Zayn's erfo...,1
10547,634711870570500096,zayn,positive,with Zayn gone I can now definitively say that...,1
10548,637134671797690368,zayn,positive,yo don't ever say that! god forbid! may it not...,1
10549,636413565780557824,zayn,positive,you may call me a bad fan but I sobbed so hard...,1


In [5]:
# sentiment distribution of data
dataB.loc[:,['Sentiment','label']].value_counts().to_dict()

{('positive', 1): 8212, ('negative', 0): 2339}

1. Case conversion
包含“India”和“india”的语料库如果不应用小写化，机器会把它们识别为两个独立的术语，而实际上它们都是同一个单词的不同形式，并且对应于同一个国家。小写化后，仅存在一种“India”实例，即“india”，简化了在语料库中找到所有提到印度时的任务。

In [6]:
#import ekphrasis library
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

# Shared ekphrasis pre-processing pipeline; Tokenize and tokenizer_lstm both
# call text_processor.pre_process_doc on each raw text.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice below; the repeat looks redundant —
    # confirm against the ekphrasis documentation before removing it.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    # (lowercase=True is passed to the tokenizer here)
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [7]:
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop = set(stopwords.words('english'))

In [8]:
def Tokenize(Texts):
    """Tokenize every text and build an integer vocabulary over the corpus.

    Each text is passed through the module-level ``text_processor``; the
    tokens 's' and "'" are dropped from its output. Three corpus statistics
    are printed: total token count, number of distinct tokens, and the 30
    most frequent tokens with their counts.

    :param Texts: iterable of raw text strings
    :return: ``(token, vocab)`` where ``token`` is a list holding one token
        list per input text, and ``vocab`` maps each distinct token to an
        integer id, numbered from 1 in order of first appearance
    """
    dropped = ('s', '\'')
    token = [
        [piece for piece in text_processor.pre_process_doc(Text) if piece not in dropped]
        for Text in Texts
    ]

    # Count straight from the nested lists; no flattened copy of the corpus
    # is needed just to obtain its length.
    counts = Counter(word for words in token for word in words)
    print("All words: {}".format(sum(counts.values())))
    print("Unique words: {}".format(len(counts)))

    # most_common(30) yields the same leading entries as sorting everything
    # and slicing, without sorting the full vocabulary.
    Most_common = counts.most_common(30)
    print("Top 30 most common words: {}".format(Most_common))

    # Ids start at 1, so id 0 never refers to a real token.
    vocab = {word: num for num, word in enumerate(counts, 1)}
    return token, vocab

In [9]:
token,vocab=Tokenize(dataB.Text)

All words: 252636
Unique words: 14251
Top 30 most common words: [('.', 9717), ('the', 8614), ('to', 4925), (',', 4803), ('i', 4709), ('<user>', 3385), ('a', 3298), ('and', 3226), ('!', 3152), ('in', 2985), ('<url>', 2814), ('is', 2712), ('of', 2673), ('on', 2633), ('<repeated>', 2578), ('<hashtag>', 2537), ('</hashtag>', 2537), ('<number>', 2336), ('it', 2291), ('for', 2245), ('<allcaps>', 2193), ('</allcaps>', 2193), ('you', 2183), ('be', 1883), ('may', 1849), ('tomorrow', 1819), ('not', 1771), ('with', 1726), ('-', 1565), ('my', 1308)]


### Step2: Word2Vec Pretraining

In [10]:
from gensim.models import Word2Vec, KeyedVectors
from nltk import word_tokenize
import multiprocessing
import tensorboard 

In [11]:
# Build a Word2Vec model over the tokenised tweets: context window of 5,
# min_count=1 so no token is discarded for being rare, and one worker thread
# per CPU core.
# NOTE(review): no seed is passed, so the learned vectors presumably differ
# between runs — confirm whether reproducibility matters here.
word2vec_model=Word2Vec(token,window=5, min_count=1,workers = multiprocessing.cpu_count())

In [12]:
# Train on the same tokenised corpus for 100 epochs.
# NOTE(review): per the gensim documentation, passing the corpus to the
# Word2Vec constructor already performs an initial training pass — confirm
# that this additional training is intended.
word2vec_model.train(token, total_examples = len(token), epochs = 100)

(18710948, 25263600)

In [13]:
print('This is summary of Word2Vec: {}'.format(word2vec_model))

This is summary of Word2Vec: Word2Vec(vocab=14251, vector_size=100, alpha=0.025)


In [14]:
index=word2vec_model.wv.key_to_index

In [15]:
word2vec_model.wv.save_word2vec_format('Word2Vec.vector')

In [16]:
# Gather the learned vector of every word known to the model, both as a row
# of a dense matrix (row number = the word's entry in `index`) and as a
# word -> vector dictionary used by the later cells.
embed_matrix = np.zeros((len(index), 100))
embed_dict = {}
for word, i in index.items():
    if word not in word2vec_model.wv:
        continue
    embed_dict[word] = word2vec_model.wv[word]
    embed_matrix[i] = embed_dict[word]

In [17]:
del word2vec_model

In [18]:
# word = pd.read_table(word_path,sep=' ',header=None)
# word.set_index(0,inplace=True)
# Embed_dict={}
# for i in range(word.shape[0]):
#     Embed_dict[word.index[i]]=word.iloc[i,:]

In [19]:
# Embed_path=os.path.join(Path,'Datasets/Embed_dict')


### Step3: Training

In [20]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D, Bidirectional

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import datetime
import keras
import seaborn as sns
import matplotlib.pyplot as plt
from IPython import display
import transformers
import tensorflow_hub as hub
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")
display.set_matplotlib_formats('svg')

  from .autonotebook import tqdm as notebook_tqdm


Version:  2.6.0
Eager mode:  True
GPU is available


  display.set_matplotlib_formats('svg')


In [21]:
def tokenizer_lstm(X, vocab, seq_len):
    '''
    Encode texts as fixed-length rows of vocabulary ids.

    Every text is run through the module-level ``text_processor``. The tokens
    's' and "'" are discarded, and so is any token that is not a key of the
    module-level ``embed_dict``; the remaining tokens are looked up in
    ``vocab``. A row with fewer than ``seq_len`` ids is zero-padded on the
    left; a longer row keeps only its first ``seq_len`` ids.

    Returns an int64 array of shape (len(X), seq_len).
    '''
    encoded = np.zeros((len(X), seq_len), dtype=np.int64)
    for row, text in enumerate(X):
        ids = []
        for word in text_processor.pre_process_doc(text):
            if word == 's' or word == '\'':
                continue
            if word in embed_dict:
                ids.append(vocab[word])
        # Right-align the (possibly truncated) ids so the padding zeros sit
        # at the start of the row.
        kept = ids[:seq_len]
        encoded[row, seq_len - len(kept):] = kept

    return encoded

In [22]:
X=tokenizer_lstm(dataB.Text, vocab, 100)###

In [23]:
# One-hot encode the integer labels, then convert the tensor to a NumPy array.
# NOTE(review): depth=3 produces three columns, but add_label only emits 0 and
# 1, so the third column is always zero. It does match the 3-unit softmax in
# model_train — confirm whether a two-class setup was intended for this task.
Y = tf.one_hot(dataB.label, depth=3)
Y= np.array(Y)

In [24]:
X_train, X_test,Y_train, Y_test = train_test_split (X, Y, test_size=0.1, random_state=1000) 

In [25]:
def build_embedding_layer(vocab, embed_dict, embed_dim=100, seq_len=100):
    """
    Build a frozen Keras Embedding layer initialised from pretrained vectors.

    A ``(len(vocab) + 1, embed_dim)`` matrix of zeros is created and row ``i``
    is filled with the vector of the word whose vocabulary id is ``i``. Words
    that have no entry in ``embed_dict`` keep an all-zero row, and so does any
    row that no vocabulary id points to (row 0 when ids start at 1).

    :param vocab: dict mapping word -> integer id (row of the matrix)
    :param embed_dict: dict mapping word -> embedding vector of length ``embed_dim``
    :param embed_dim: width of each embedding vector (default 100)
    :param seq_len: ``input_length`` given to the Embedding layer (default 100)
    :return: a non-trainable Embedding layer carrying the matrix as its weights
    """
    # One extra row so that the largest id in `vocab` is a valid row index.
    vocab_size = len(vocab) + 1
    embedding_matrix = np.zeros((vocab_size, embed_dim))
    for word, i in vocab.items():
        embedding_vector = embed_dict.get(word)
        if embedding_vector is not None:
            # Keep the matrix row aligned with the word's vocabulary id.
            embedding_matrix[i] = embedding_vector

    # trainable=False keeps the pretrained vectors fixed during training.
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        weights=[embedding_matrix],
        input_length=seq_len,
        trainable=False,
    )
    return embedding_layer

In [26]:
embedding_layer=build_embedding_layer(vocab, embed_dict) ###

In [27]:
def model_train(X_train, y_train, embedding_layer):
    """
    Build and fit a stacked BiLSTM classifier, save it, and report final metrics.

    Architecture: ``embedding_layer`` -> SpatialDropout1D(0.2) ->
    Bidirectional LSTM(128, returning sequences) -> Bidirectional LSTM(64) ->
    Dense(3, softmax). The model is compiled with Adam and categorical
    cross-entropy, fitted for 50 epochs with batch size 128 while holding out
    20% of the training data for validation, and written to 'taskA.h5'.

    :param X_train: tweet train data
    :param y_train: sentiment label train data (one column per output unit)
    :param embedding_layer: layer used as the first layer of the model
    :return: tuple ``(train_acc, val_acc, train_rec, val_rec, train_pre,
        val_pre, history)`` — the last-epoch accuracy, recall and precision on
        the training and validation splits, followed by the object returned
        by ``model.fit``
    """
    # Logs the device each op is placed on (verbose; kept from the original).
    tf.debugging.set_log_device_placement(True)

    model = Sequential()
    model.add(embedding_layer)
    model.add(SpatialDropout1D(0.2))

    # Experiment log kept from the original comments (the leading number is
    # reproduced as written; its meaning is not stated):
    #   BiLSTM(128, dropout=0.5)                                   26  val_accuracy 0.6532
    #   BiLSTM(128, dropout=0.5, seq) + BiLSTM(128, dropout=0.0)   28  val_accuracy 0.6610
    #   BiLSTM(128, dropout=0.5, seq) + BiLSTM(64, dropout=0.5)    27  val_accuracy 0.6634
    #   BiLSTM(128, dropout=0.5, seq) + BiLSTM(64, dropout=0.5)    29  val_accuracy 0.6653
    #   BiLSTM(128, dropout=0.5, seq) + BiLSTM(64, dropout=0.5)    27  val_accuracy 0.6669
    # Also noted but without a recorded result: LSTM(128) with
    # recurrent_dropout=0.5, and BiLSTM(128, dropout=0.3, recurrent_dropout=0.5).
    model.add(Bidirectional(LSTM(128, dropout=0.5, return_sequences=True)))
    model.add(Bidirectional(LSTM(64, dropout=0.5)))

    model.add(Dense(3, activation='softmax'))
    model.summary()

    batch_size = 128
    epochs = 50

    # Recall and Precision are created with explicit names. When they are
    # given as strings, Keras auto-numbers every new instance ('recall_1',
    # 'precision_1', ...), so the history keys read below would be missing on
    # a second call within the same kernel. 'Accuracy' stays a string so its
    # history key keeps that exact spelling.
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=[
            tf.keras.metrics.Recall(name='recall'),
            'Accuracy',
            tf.keras.metrics.Precision(name='precision'),
        ],
    )
    history = model.fit(
        X_train,
        y_train,
        validation_split=0.2,
        epochs=epochs,
        batch_size=batch_size,
    )
    # Same file name as before; later cells load the model from this path.
    model.save('taskA.h5')

    # Report the value each metric reached in the final epoch.
    train_acc = history.history['Accuracy'][-1]
    val_acc = history.history['val_Accuracy'][-1]

    train_rec = history.history['recall'][-1]
    val_rec = history.history['val_recall'][-1]

    train_pre = history.history['precision'][-1]
    val_pre = history.history['val_precision'][-1]
    return train_acc, val_acc, train_rec, val_rec, train_pre, val_pre, history

In [28]:
train_acc, val_acc, train_rec, val_rec, train_pre, val_pre,history= model_train(X_train, Y_train, embedding_layer)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1425200   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 100, 100)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          234496    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               164352    
_________________________________________________________________
dense (Dense)                (None, 3)                 387       
Total params: 1,824,435
Trainable params: 399,235
Non-trainable params: 1,425,200
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 1

Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
# Display names for the three output units of the classifier.
class_names = ['0: Negative','1: Neutral', '2: Positive']

# Reload the model written by model_train and score it on the held-out split:
# raw predictions for every test row plus the compiled metrics as a dict.
model = load_model('taskA.h5')
Y_pred = model.predict(X_test)
Metrcs = model.evaluate(X_test, Y_test, return_dict=True)



In [30]:
Metrcs

{'loss': 0.5304638743400574,
 'recall': 0.8125,
 'Accuracy': 0.8125,
 'precision': 0.8125}

In [None]:
def pred_eval(model):
    """Predict on and evaluate ``model`` against the module-level test split.

    Uses the module-level ``X_test`` and ``Y_test``.

    :param model: object exposing ``predict(X)`` and
        ``evaluate(X, Y, return_dict=True)``
    :return: ``(Y_pred, Metrcs)`` — the value returned by ``model.predict``
        and the value returned by ``model.evaluate``
    """
    Y_pred = model.predict(X_test)
    Metrcs = model.evaluate(X_test, Y_test, return_dict=True)

    # Hand both results back to the caller; previously they were computed
    # and then discarded.
    return Y_pred, Metrcs