<a href="https://colab.research.google.com/github/ksb25395/CAP6779/blob/master/Pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We continue with the previous neural network model for sentiment analysis. We previously obtained an F-score of 0.617 with an LSTM-based deep neural network that uses a layer of pretrained embeddings. Here, we pretrain the same model on a related task: 5-class fine-grained sentiment analysis. We use the Stanford Sentiment Treebank (SST-5) dataset for this fine-grained task.

We hope that pretraining the model on fine-grained sentiment analysis will improve its performance on 3-class sentiment analysis. The intuition is that, by training on a fine-grained sentiment analysis task, the model learns better sentiment features from the input text, and we can reuse what it has learned for better Twitter sentiment analysis.

In [0]:
# Mount Google Drive at /content/gdrive; the dataset, embedding and model paths
# used in the cells below all live under this mount point (Colab-specific).
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
from gensim.models import KeyedVectors
# Path of the GoogleNews word2vec file on the mounted Drive (gzipped binary format).
EMBEDDING_FILE = '/content/gdrive/My Drive/Colab Notebooks/datasets/Google/GoogleNews-vectors-negative300.bin.gz'
# `word2vec` is indexed by word (word2vec[word]) when the embedding matrices are built later.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Loading SST data. We concatenate the train, test and dev datasets
import pandas as pd
import glob

path = r'/content/gdrive/My Drive/Colab Notebooks/datasets/SST-5'
# glob.glob returns matches in arbitrary order; sorting keeps the row order of
# sst_df (and everything derived from it) identical from one run to the next.
all_files = sorted(glob.glob(path + "/*.txt"))

# Each file is tab-separated with no header row; read them all and stack the
# rows into a single frame with a fresh 0..n-1 index.
sst_df = pd.concat(
    [pd.read_csv(filename, sep="\t", index_col=None, header=None) for filename in all_files],
    axis=0,
    ignore_index=True,
)

In [0]:
# The files were read without a header; name the two columns (label first, sentence second).
sst_df.columns = ['sentiment', 'text']

In [0]:
sst_df.head()

Unnamed: 0,sentiment,text
0,__label__3,Effective but too-tepid biopic
1,__label__4,If you sometimes like to go to the movies to h...
2,__label__5,"Emerges as something rare , an issue movie tha..."
3,__label__3,The film provides some great insight into the ...
4,__label__5,Offers that rare combination of entertainment ...


In [0]:
import numpy as np
import re
import nltk
import spacy
import string

In [0]:
# preprocessing methods
# Removal of URLs
def remove_urls(text):
  """Return `text` with every `http(s)://...` or `www. ...` run of non-space characters deleted."""
  return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove @ mentions
def remove_mentions(text):
  """Delete every '@' (or `http(s)://`) together with the non-space characters that follow it."""
  mention_or_url = re.compile(r"(?:\@|https?\://)\S+")
  return mention_or_url.sub("", text)

# strip links
def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

# strip entities
def strip_all_entities(text):
    """Drop @mentions and #hashtags and turn all other punctuation into spaces.

    Every `string.punctuation` character except '@' and '#' becomes a space;
    the text is then split on whitespace, words beginning with '@' or '#' are
    discarded, and the remaining words are joined with single spaces.
    """
    entity_prefixes = ('@', '#')
    # Translation table: punctuation (other than the entity prefixes) -> space.
    to_space = {ord(ch): ' ' for ch in string.punctuation if ch not in entity_prefixes}
    spaced = text.translate(to_space)
    kept = [token for token in spaced.split() if not token.startswith(entity_prefixes)]
    return ' '.join(kept)

# convert emojis
!pip install emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
def convert_emoticons(text, emoticons=None):
  """Replace emoticons in `text` with an underscore-joined description.

  Args:
    text: input string.
    emoticons: optional mapping of regex pattern -> description. Defaults to
      the module-level `EMOTICONS` table imported from `emot`.

  Returns:
    A new string in which every match of each pattern is replaced by its
    description with commas removed and words joined by '_'
    (e.g. "Happy face, smiley" -> "Happy_face_smiley").
  """
  if emoticons is None:
    emoticons = EMOTICONS
  for pattern, description in emoticons.items():
    replacement = "_".join(description.replace(",", "").split())
    # re.sub returns a new string; it must be assigned back, otherwise the
    # function returns its input unchanged.
    text = re.sub(u'(' + pattern + ')', replacement, text)
  return text

# convert emoji to word
!pip install emoji
import emoji
def convert_emojis(text):
    """Return the result of `emoji.demojize` applied to `text`."""
    demojized = emoji.demojize(text)
    return demojized

# expand contractions
!pip install -q contractions
import contractions
def expand_contractions(text):
  """Return the result of `contractions.fix` applied to `text`."""
  expanded = contractions.fix(text)
  return expanded

# remove punctuations
PUNCT = string.punctuation
def remove_punctuation(text):
  """Return `text` with every character listed in PUNCT deleted."""
  # Mapping a character to None in a translation table deletes it.
  deletion_table = str.maketrans(dict.fromkeys(PUNCT))
  return text.translate(deletion_table)

# remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
  """Return `text` (converted with str()) without the whitespace-separated words found in STOPWORDS."""
  kept = [token for token in str(text).split() if token not in STOPWORDS]
  return " ".join(kept)

# remove frequent words
def remove_freqwords(text, FREQ_WORDS):
  """Return `text` (converted with str()) without the whitespace-separated words contained in FREQ_WORDS."""
  kept = filter(lambda token: token not in FREQ_WORDS, str(text).split())
  return " ".join(kept)

# remove rare words
def remove_rarewords(text, RAREWORDS):
    """Return `text` (converted with str()) without the whitespace-separated words contained in RAREWORDS."""
    kept = []
    for token in str(text).split():
        if token in RAREWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

# lemmatize words
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# First letter of the nltk POS tag -> WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of `text` using its POS tag.

    Words are tagged with `nltk.pos_tag`; the first letter of each tag selects
    the WordNet part of speech through `wordnet_map`, and tags that are not in
    the map are lemmatized as nouns. The lemmas are joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# spelling correction

Collecting emot
  Downloading https://files.pythonhosted.org/packages/49/07/20001ade19873de611b7b66a4d5e5aabbf190d65abea337d5deeaa2bc3de/emot-2.1-py3-none-any.whl
Installing collected packages: emot
Successfully installed emot-2.1
Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/40/8d/521be7f0091fe0f2ae690cc044faf43e3445e0ff33c574eae752dd7e39fa/emoji-0.5.4.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 2.5MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.5.4-cp36-none-any.whl size=42175 sha256=66abc5511ad0faf48ae2e337faa2d6c243acec2a01e23b472dcda9cf3599bd64
  Stored in directory: /root/.cache/pip/wheels/2a/a9/0a/4f8e8cce8074232aba240caca3fade315bb49fac68808d1a9c
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4
[K     |████████████████████████████████| 317kB 6.5MB/s 
[K     |███████████

In [0]:
def preprocess(df, n_freq_words=10, n_rare_words=10):
  """Clean the "text" column of `df` and return the result as a new DataFrame.

  Steps, in order: lower-casing, URL removal, mention removal, link stripping,
  entity/punctuation stripping, emoticon and emoji conversion, contraction
  expansion, punctuation removal, stop-word removal, removal of the most
  frequent and the rarest words, and lemmatization.

  The input frame is not modified; callers must use the returned frame
  (the existing `df = preprocess(df)` call pattern keeps working).

  Args:
    df: DataFrame with a string column named "text".
    n_freq_words: how many of the most common words to remove (default 10).
    n_rare_words: how many of the least common words to remove (default 10).

  Returns:
    A new DataFrame with the same columns as `df` and a cleaned "text" column.
  """
  from collections import Counter

  text = df["text"].str.lower()

  # NOTE(review): strip_all_entities replaces punctuation with spaces before
  # convert_emoticons runs, so punctuation-based emoticons can no longer
  # match at that point -- confirm this ordering is intended.
  cleaning_steps = (
      remove_urls,
      remove_mentions,
      strip_links,
      strip_all_entities,
      convert_emoticons,
      convert_emojis,
      expand_contractions,
      remove_punctuation,
      remove_stopwords,
  )
  for step in cleaning_steps:
    text = text.apply(step)

  # Word frequencies over the cleaned corpus; both the frequent and the rare
  # word sets are taken from this single count (made before either removal).
  cnt = Counter(word for doc in text.values for word in doc.split())

  FREQ_WORDS = set(word for (word, _) in cnt.most_common(n_freq_words))
  text = text.apply(lambda doc: remove_freqwords(doc, FREQ_WORDS))

  # most_common() is sorted by descending count, so the reversed tail slice
  # yields the n_rare_words least frequent words.
  RAREWORDS = set(word for (word, _) in cnt.most_common()[:-n_rare_words-1:-1])
  text = text.apply(lambda doc: remove_rarewords(doc, RAREWORDS))

  text = text.apply(lemmatize_words)

  # assign() returns a new frame, leaving the caller's frame untouched.
  return df.assign(text=text)

In [0]:
# Run the full text-cleaning pipeline on the SST sentences and preview the result.
sst_df = preprocess(sst_df)
sst_df.head()

Unnamed: 0,sentiment,text
0,__label__3,effective tepid biopic
1,__label__4,sometimes go movie fun wasabi place start
2,__label__5,emerges something rare issue honest keenly obs...
3,__label__3,provide great insight neurotic mindset comic r...
4,__label__5,offer rare combination entertainment education


In [0]:
# Convert the string labels to a pandas categorical so they can be mapped to integer codes.
sst_df.sentiment = sst_df.sentiment.astype('category')

In [0]:
dict( enumerate(sst_df['sentiment'].cat.categories ) )

{0: '__label__1',
 1: '__label__2',
 2: '__label__3',
 3: '__label__4',
 4: '__label__5'}

In [0]:
# Replace each label by its category code ('__label__1' -> 0 ... '__label__5' -> 4, per the mapping shown above).
# NOTE: this overwrites the categorical column, so the cell relies on `.cat` and is not meant to be run twice.
sst_df["sentiment"] = sst_df["sentiment"].cat.codes

In [0]:
sst_df.head()

Unnamed: 0,sentiment,text
0,2,effective tepid biopic
1,3,sometimes go movie fun wasabi place start
2,4,emerges something rare issue honest keenly obs...
3,2,provide great insight neurotic mindset comic r...
4,4,offer rare combination entertainment education


In [0]:
# Preprocessed SST sentences; used to fit the tokenizer in the next cell.
sst_texts = sst_df["text"]

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Build the SST vocabulary and turn each preprocessed sentence into a list of word indices.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(sst_texts)
sequences = tokenizer.texts_to_sequences(sst_texts)
# word -> integer index mapping; its length is the number of distinct words seen.
word_index = tokenizer.word_index
print(len(word_index))

Using TensorFlow backend.


14740


In [0]:
# Pad the index sequences into one 2-D array (pad_sequences called with its default arguments).
X_train = pad_sequences(sequences)
# One-hot encode the integer sentiment codes; y_train.shape[1] later sets the size of the output layer.
y_train = to_categorical(list(sst_df["sentiment"]))

In [0]:
import numpy as np
EMBED_DIM = 300
# One row per tokenizer index plus one extra row for index 0.
vocab_size = len(word_index) + 1
# Every row starts as Gaussian noise (mean 0, std sqrt(0.25) = 0.5). Rows of words
# known to word2vec are overwritten below, so words without a pretrained vector
# simply keep this random initialisation.
embedding_matrix = np.random.normal(0, np.sqrt(0.25), [vocab_size, EMBED_DIM])

# Words from word_index that have no word2vec vector (kept for inspection).
count = []
for word, i in word_index.items():
    try:
        embedding_matrix[i] = word2vec[word]
    except KeyError:
        count.append(word)

In [0]:
count
len(count)

1554

In [0]:
from keras.layers import Embedding
# Embedding layer initialised from embedding_matrix; trainable=True lets the vectors be updated during training.
embedding_layer = Embedding(vocab_size,
                            EMBED_DIM,
                            weights=[embedding_matrix],
                            trainable=True)
# Padded sequence length of the SST inputs.
seq_len = X_train.shape[1]
from keras.layers import Input
from keras.layers import Flatten
from keras.layers import Dense
from keras.models import Model
from keras.layers import LSTM
from keras.layers import Dropout
# Architecture: Input -> Embedding -> LSTM(128) -> Dense(16, relu) -> Dropout -> softmax.
# NOTE: the order of these layers matters -- a later cell reads model.layers[2] and
# model.layers[3] to copy weights into the Twitter model.
inputs = Input(shape=(seq_len,))
embeddings = embedding_layer(inputs)
# flattened = Flatten()(embeddings)
lstm = LSTM(128, dropout=0.5)(embeddings)
dense = Dense(16, activation="relu")(lstm)
dropout = Dropout(rate=0.5)(dense)
# One softmax unit per one-hot label column of y_train.
output = Dense(y_train.shape[1], activation='softmax')(dropout)
model = Model(inputs, output)











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
from keras.optimizers import Adam
# Adam optimiser with learning rate 1e-3 for the SST-5 pretraining run.
adam = Adam(lr=1e-3)

# Categorical cross-entropy matches the one-hot y_train; accuracy is reported as 'acc'.
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])





In [0]:
# Pretrain on the whole SST-5 frame; no validation data is passed, so only training metrics are reported.
model.fit(X_train, y_train, batch_size=1000, epochs=20, verbose=1, shuffle=True)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa87333ad30>

In [0]:
# Persist the pretrained model to Drive.
# NOTE(review): the later weight-transfer cells use the in-memory `model`, not this saved file.
model.save('/content/gdrive/My Drive/Colab Notebooks/pretrained.h5')

We have now trained the model on the SST-5 dataset and saved it. Next, we use it to initialise a model for Twitter sentiment analysis.

In [0]:
# Tab-separated SemEval-2017 Task 4A file, read without a header row, so the columns get integer names.
semeval_df=pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/datasets/Semeval-2017-4A-English/SemEval2017-task4-dev.subtask-A.english.INPUT.txt', sep="\t", header=None)
semeval_df.head()

Unnamed: 0,0,1,2,3
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",
1,619969366986235905,neutral,Order Go Set a Watchman in store or through ou...,
2,619971047195045888,negative,If these runway renovations at the airport pre...,
3,619974445185302528,neutral,If you could ask an onstage interview question...,
4,619987808317407232,positive,A portion of book sales from our Harper Lee/Go...,


In [0]:
# Column 3 shows no values in the preview above; drop it so only id, label and text remain.
semeval_df = semeval_df.drop(columns=3)

In [0]:
# Name the remaining columns; 'text' is the column name that preprocess() operates on.
semeval_df.columns = ['id', 'sentiment', 'text']

In [0]:
# Apply the same cleaning pipeline that was used for SST to the tweets, then preview.
semeval_df = preprocess(semeval_df)
semeval_df.head()

Unnamed: 0,id,sentiment,text
0,619950566786113536,neutral,picturehouse pink floyd roger water walll open...
1,619969366986235905,neutral,order go set watchman store website tuesday ge...
2,619971047195045888,negative,runway renovation airport prevent see taylor s...
3,619974445185302528,neutral,could ask onstage interview question miss usa ...
4,619987808317407232,positive,portion book sale harper lee go set watchman r...


In [0]:
# Make the labels categorical and display the code -> label mapping used by the next cell.
semeval_df.sentiment = semeval_df.sentiment.astype('category')
dict( enumerate(semeval_df['sentiment'].cat.categories ) )

{0: 'negative', 1: 'neutral', 2: 'positive'}

In [0]:
# Replace each label by its category code (0 = negative, 1 = neutral, 2 = positive, per the mapping above).
# NOTE: this overwrites the categorical column, so the cell relies on `.cat` and is not meant to be run twice.
semeval_df["sentiment"] = semeval_df["sentiment"].cat.codes
semeval_df.head()

Unnamed: 0,id,sentiment,text
0,619950566786113536,1,picturehouse pink floyd roger water walll open...
1,619969366986235905,1,order go set watchman store website tuesday ge...
2,619971047195045888,0,runway renovation airport prevent see taylor s...
3,619974445185302528,1,could ask onstage interview question miss usa ...
4,619987808317407232,2,portion book sale harper lee go set watchman r...


In [0]:
# Preprocessed tweets; used to fit the second tokenizer in the next cell.
semeval_texts = semeval_df["text"]

In [0]:
# Fit a fresh tokenizer on the tweets.
# NOTE: this rebinds `tokenizer`, `sequences` and `word_index` from the SST section,
# so the SST vocabulary is no longer available after this cell runs.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
tokenizer.fit_on_texts(semeval_texts)
sequences = tokenizer.texts_to_sequences(semeval_texts)
word_index = tokenizer.word_index
print(len(word_index))

18610


In [0]:
# NOTE: X_train / y_train are rebound here and now hold the SemEval data instead of the SST data.
X_train = pad_sequences(sequences)
# One-hot encode the three sentiment codes.
y_train = to_categorical(list(semeval_df["sentiment"]))

In [0]:
# Padded sequence length of the tweets; used as the input length of the new model.
seq_len_new = X_train.shape[1]
print(seq_len_new)

29


In [0]:
EMBED_DIM = 300
# One row per tokenizer index of the tweet vocabulary plus one extra row for index 0.
vocab_size = len(word_index) + 1
# Every row starts as Gaussian noise (mean 0, std sqrt(0.25) = 0.5). Rows of words
# known to word2vec are overwritten below, so words without a pretrained vector
# simply keep this random initialisation.
embedding_matrix_new = np.random.normal(0, np.sqrt(0.25), [vocab_size, EMBED_DIM])

# Words from the tweet vocabulary that have no word2vec vector (kept for inspection).
count = []
for word, i in word_index.items():
    try:
        embedding_matrix_new[i] = word2vec[word]
    except KeyError:
        count.append(word)

In [0]:
# Build the Twitter model with the same layer stack as the SST model, but with an
# embedding layer sized for the tweet vocabulary and a new softmax output layer.
embedding_layer = Embedding(vocab_size,
                            EMBED_DIM,
                            weights=[embedding_matrix_new],
                            trainable=True)
inputs = Input(shape=(seq_len_new,))
embeddings = embedding_layer(inputs)
# Weights of layer index 2 of the pretrained model (the LSTM, given the order in which `model` was built).
lstm_weights = model.layers[2].get_weights()
lstm = LSTM(128, dropout=0.5)(embeddings)
# Weights of layer index 3 of the pretrained model (the 16-unit Dense layer, given the same build order).
dense_weights = model.layers[3].get_weights()
dense = Dense(16, activation="relu")(lstm)
dropout = Dropout(rate=0.5)(dense)
# prev_output_weights = model.layers[5].get_weights()
# prev_output = Dense(prev_output_weights[1].shape[0], activation='relu')(dropout)
# New output layer: one softmax unit per one-hot label column of the SemEval y_train.
output = Dense(y_train.shape[1], activation='softmax')(dropout)
final_model = Model(inputs, output)

In [0]:
# Copy the pretrained LSTM and hidden Dense weights into the matching layers of the new model.
# The embedding and output layers are not copied and keep the initialisation they were built with.
final_model.layers[2].set_weights(lstm_weights)
final_model.layers[3].set_weights(dense_weights)
# final_model.layers[5].set_weights(prev_output_weights)
# final_model.layers[5].set_weights(prev_output_weights)

In [0]:
X_train.shape

(20632, 29)

In [0]:
y_train.shape

(20632, 3)

In [0]:
# New Adam optimiser for fine-tuning, with learning rate 2e-3 (the pretraining run used 1e-3).
adam = Adam(lr=2e-3)

final_model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['acc'])

In [0]:
# Fine-tune on the SemEval tweets, with 10% of the rows set aside for validation (validation_split=0.1).
final_model.fit(X_train, y_train, batch_size=1000, epochs=10, verbose=1, validation_split=0.1, shuffle=True)

Train on 18568 samples, validate on 2064 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa86bda8f98>

NameError: ignored