In [None]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras import optimizers

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log: INFO and above go to the root logger as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
# DATASET
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["sentiment", "ids", "date", "flag", "user", "text"]

# Fraction of the rows used for training; the rest becomes the test split.
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string: `\S` is a regex class, not a Python string escape. Without the `r`
# prefix Python reports an invalid escape sequence warning for this literal.
# Alternatives, in order: @mentions, http(s) links, a leftover "htt:"/"http:" prefix
# plus one character, and any run of non-alphanumeric characters.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
# NOTE(review): the Word2Vec cell in this notebook passes literal values for the
# vector size and min_count; W2V_SIZE and W2V_MIN_COUNT are not read there.
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
# NOTE(review): padding and training cells use literal 128 / 25 rather than
# SEQUENCE_LENGTH / EPOCHS; only BATCH_SIZE is read (by model.evaluate).
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 128

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "teacher_model.h5"
WORD2VEC_MODEL = "w2v_model.w2v"
TOKENIZER_MODEL = "teacher_tokenizer.pkl"
ENCODER_MODEL = "teacher_encoder.pkl"

In [None]:
import pandas as pd
# Read the labelled hotel tweets from the Drive path below.
# NOTE(review): the preview of this frame shows sequences such as "â" inside the
# text, which may mean the file is not really ISO-8859-1 -- confirm the encoding.
dataset_path = "/content/drive/MyDrive/Data science/cp_dataset/tweet_hotel.csv"
df = pd.read_csv(dataset_path, encoding=DATASET_ENCODING)

In [None]:
# Preview the loaded frame (the rendering below is truncated by pandas).
df

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,âvirus leak from hotel quarantine which be m...,1
1,1,the doctor and a sexy fish stay at a hotel in ...,4
2,2,hotel emporium have create the perfect replace...,5
3,3,sunainapatnaik sloganmurugan you have to pay r...,1
4,4,worldâs first hotel which run on electricity...,5
...,...,...,...
101155,101155,drnaumanniaz bhattimajid during practice match...,1
101156,101156,iâm annoy the ac die a few day ago iâm thi...,3
101157,101157,misfitpoise i legit have see this movie time a...,1
101158,101158,anthonyhotels have you see the indoor waterpar...,1


In [None]:
# Merge label 2 into 1 and label 4 into 5 in a single pass.
df['sentiment'] = df['sentiment'].replace({2: 1, 4: 5})
# Rows labelled 3 are removed; the frame is modified in place.
rows_labelled_3 = df.loc[df['sentiment'] == 3].index
df.drop(index=rows_labelled_3, inplace=True)

2021-09-15 10:00:40,420 : INFO : NumExpr defaulting to 2 threads.


In [None]:
# Fetch the NLTK resources used by the lemmatizer, the POS tagger and the stop-word list.
for resource in ('wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Go through nltk.corpus explicitly: a later cell rebinds the bare name `stopwords`
# to a plain list, after which `stopwords.words(...)` fails if this cell is re-run.
stop_words = nltk.corpus.stopwords.words("english")
# Kept as the last expression so the cell still displays the download result.
nltk.download('sentiwordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tag import pos_tag

def preprocess(text, lem=True):
    """Clean one tweet and, optionally, lemmatize its words.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a space. The result is split on whitespace.

    Args:
        text: Raw tweet; any object is accepted because it is passed through ``str()``.
        lem: When True (default) each word is POS-tagged and lemmatized.
            When False the cleaned words are returned unchanged.

    Returns:
        The processed words joined by single spaces.
    """
    # Remove link,user and special characters
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    words = cleaned.split()

    if not lem:
        # The previous code appended the whole word list as ONE element, so the
        # final join raised TypeError; the words themselves are what must be joined.
        return " ".join(words)

    lemmatizer = WordNetLemmatizer()
    tokens = []
    for word, tag in pos_tag(words):
        # Map the Penn tag onto the WordNet POS letter the lemmatizer expects;
        # anything that is neither a noun nor a verb is lemmatized as an adjective.
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        tokens.append(lemmatizer.lemmatize(word, pos))
    return " ".join(tokens)

In [None]:
%%time
# Clean and lemmatize every tweet (preprocess is called with its default lem=True).
df.text = df.text.apply(preprocess)

CPU times: user 1min 36s, sys: 1.24 s, total: 1min 37s
Wall time: 1min 39s


In [None]:
def preprocessno(text):
    """Lower-case ``str(text)``, replace every ASCII digit with a space, and strip the ends."""
    digits_to_spaces = str.maketrans("0123456789", " " * 10)
    return str(text).lower().translate(digits_to_spaces).strip()

In [None]:
%%time
# Blank out the digits in every tweet.
df.text = df.text.apply(preprocessno)

CPU times: user 215 ms, sys: 971 µs, total: 216 ms
Wall time: 216 ms


In [None]:
# Hold out the share of rows not covered by TRAIN_SIZE; the seed is fixed.
# The two results are separate frames: later edits made to `df` alone do not reach them.
test_fraction = 1-TRAIN_SIZE
df_train, df_test = train_test_split(df, test_size=test_fraction, random_state=42)
for split_name, split_frame in (("TRAIN", df_train), ("TEST", df_test)):
    print(split_name + " size:", len(split_frame))

TRAIN size: 70918
TEST size: 17730


In [None]:
# Show which sentiment labels are left in the frame.
df["sentiment"].unique()

array([1, 5])

In [None]:
# English stop words plus a handful of extra words added for this dataset.
# NOTE: from here on the name `stopwords` is a plain list and no longer the
# `nltk.corpus.stopwords` reader imported at the top of the notebook.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(["hotel", "room", "get", "go", "stay", "one"])

In [None]:
# Word tokenizer: every maximal run of `\w` characters is one token.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

def remove_stopwords(text):
    """Return `text` with every token found in the global `stopwords` list removed.

    The text is tokenized with the global `tokenizer`; surviving tokens are
    joined with single spaces.
    """
    blocked = set(stopwords)
    kept = (token for token in tokenizer.tokenize(text) if token not in blocked)
    return ' '.join(kept)

df.text = df.text.apply(str).apply(remove_stopwords)
# df_train / df_test were split off from df in an earlier cell, so cleaning df alone
# never reaches the data the models are trained and evaluated on. Apply the same
# stop-word removal to both splits; `assign` returns new frames instead of writing
# into the slices produced by train_test_split.
df_train = df_train.assign(text=df_train.text.apply(str).apply(remove_stopwords))
df_test = df_test.assign(text=df_test.text.apply(str).apply(remove_stopwords))

In [None]:
# Row count of the full (unsplit) frame.
print("Dataset size:", df.shape[0])

Dataset size: 88648


In [None]:
# Longest tweet in `df`, measured in whitespace-separated words.
word_counts = df['text'].str.split().str.len()
res = word_counts.max()

print(f"The maximum length in words are : {res}")

The maximum length in words are : 22


In [None]:
%%time
# One list of whitespace-separated words per training tweet (input for Word2Vec).
documents = [tweet.split() for tweet in df_train.text]

CPU times: user 337 ms, sys: 38.1 ms, total: 375 ms
Wall time: 375 ms


In [None]:
# 92-dimensional vectors: the embedding-matrix cell further down appends 36 sentiment
# values to each vector, giving the 128 columns the Embedding layer is built with.
w2v_params = dict(
    size=92,
    window=W2V_WINDOW,
    min_count=50,
    workers=8,
)
w2v_model = gensim.models.word2vec.Word2Vec(**w2v_params)

In [None]:
# Build the Word2Vec vocabulary from the tokenised training tweets.
w2v_model.build_vocab(documents)

2021-09-15 10:02:26,299 : INFO : collecting all words and their counts
2021-09-15 10:02:26,301 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-09-15 10:02:26,350 : INFO : PROGRESS: at sentence #10000, processed 155279 words, keeping 20439 word types
2021-09-15 10:02:26,397 : INFO : PROGRESS: at sentence #20000, processed 310700 words, keeping 32642 word types
2021-09-15 10:02:26,443 : INFO : PROGRESS: at sentence #30000, processed 465561 words, keeping 42757 word types
2021-09-15 10:02:26,494 : INFO : PROGRESS: at sentence #40000, processed 620132 words, keeping 52095 word types
2021-09-15 10:02:26,539 : INFO : PROGRESS: at sentence #50000, processed 775991 words, keeping 60881 word types
2021-09-15 10:02:26,583 : INFO : PROGRESS: at sentence #60000, processed 929890 words, keeping 68740 word types
2021-09-15 10:02:26,631 : INFO : PROGRESS: at sentence #70000, processed 1086082 words, keeping 76409 word types
2021-09-15 10:02:26,636 : INFO : collected 77

In [None]:
# Words present in the Word2Vec vocabulary.
words = w2v_model.wv.vocab.keys()
# NOTE: `vocab_size` is assigned again further down from the Keras tokenizer's word index.
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 1900


In [None]:
%%time
# Train the word vectors for W2V_EPOCH epochs over the tokenised training tweets.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

2021-09-15 10:02:27,165 : INFO : training model with 8 workers on 1900 vocabulary and 92 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2021-09-15 10:02:28,195 : INFO : EPOCH 1 - PROGRESS: at 61.78% examples, 371813 words/s, in_qsize 14, out_qsize 4
2021-09-15 10:02:28,600 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-09-15 10:02:28,619 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-09-15 10:02:28,642 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-09-15 10:02:28,649 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-09-15 10:02:28,651 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-09-15 10:02:28,658 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-09-15 10:02:28,661 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-09-15 10:02:28,663 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-09-15

CPU times: user 1min 26s, sys: 715 ms, total: 1min 27s
Wall time: 46.5 s


(19468926, 35205312)

In [None]:
%%time
# NOTE: rebinds `tokenizer` -- until this line it referred to the NLTK RegexpTokenizer
# used by remove_stopwords; from here on it is the Keras Tokenizer.
tokenizer = Tokenizer()
# The word index is learned from the training split only.
tokenizer.fit_on_texts(df_train.text)

# One more than the number of indexed words -- presumably to leave room for index 0
# (padding); the embedding matrix below is allocated with this many rows.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 77123
CPU times: user 1.65 s, sys: 28 ms, total: 1.68 s
Wall time: 1.69 s


In [None]:
%%time
# Turn each split's tweets into integer sequences and pad them to 128 positions,
# the same length the Embedding layer is declared with.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=128)
    for split in (df_train, df_test)
)

CPU times: user 1.89 s, sys: 36.9 ms, total: 1.93 s
Wall time: 1.93 s


In [None]:
# Fit the label encoder on the training labels only, then encode both splits
# as column vectors of shape (n_samples, 1).
encoder = LabelEncoder()
encoder.fit(df_train.sentiment.tolist())

y_train = encoder.transform(df_train.sentiment.tolist()).reshape(-1,1)
y_test = encoder.transform(df_test.sentiment.tolist()).reshape(-1,1)

for label_name, label_array in (("y_train", y_train), ("y_test", y_test)):
    print(label_name, label_array.shape)

y_train (70918, 1)
y_test (17730, 1)


In [None]:
def listToString(s):
    """Concatenate the string items of `s`, in order, with no separator."""
    return "".join(s)

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

def senti(word):
    """Return a one-element array with a signed SentiWordNet polarity for `word`.

    `word` is POS-tagged, the tag(s) are concatenated and mapped to a WordNet POS
    with `penn_to_wn`, and the first WordNet synset found for that POS is looked
    up in SentiWordNet.

    Returns:
        ``np.array([pos_score])`` when the positive score is at least the negative
        score, ``np.array([-neg_score])`` otherwise, and ``np.array([0])`` when
        the word has no synset or the synset has no SentiWordNet entry.

    A missing NLTK corpus is no longer swallowed: the resulting LookupError
    propagates instead of silently turning every score into 0.
    """
    tags = [tag for (_, tag) in nltk.pos_tag(str.split(word))]
    wn_tag = penn_to_wn(listToString(tags))

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        # Unknown word: neutral (previously reached through the IndexError raised by temp[0]).
        return np.array([0])

    swn_synset = swn.senti_synset(synsets[0].name())
    if swn_synset is None:
        # The SentiWordNet reader returns None when it has no entry for the synset.
        return np.array([0])

    positive = swn_synset.pos_score()
    negative = swn_synset.neg_score()
    if positive >= negative:
        return np.array([positive])
    return np.array([-negative])

In [None]:
def penn_to_wn(tag):
    """Map a Penn Treebank tag to the matching WordNet POS constant.

    Tags starting with J, N, R or V map to wn.ADJ, wn.NOUN, wn.ADV and wn.VERB
    respectively; every other tag (including the empty string) yields None.
    """
    prefix_to_attr = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Resolved lazily, only for the branch that matches.
            return getattr(wn, attr)
    return None

import numpy as np

# One row per tokenizer index. Rows of words missing from Word2Vec stay all-zero.
embedding_matrix = np.zeros((vocab_size, 128))

for word, i in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue

    # The single sentiment score is repeated 36 times (9 copies x 4 in the original
    # concatenations) and appended to the Word2Vec vector of the word.
    v_senti2 = senti(word)
    v_w2v = w2v_model.wv[word]
    embedding_matrix[i] = np.concatenate((v_w2v, np.tile(v_senti2, 36)))
print(embedding_matrix.shape)

(77123, 128)


In [None]:
# Sanity check: take the first adjective synset of "bad" and display its
# negated SentiWordNet negative score.
temp = wn.synsets("bad", pos="a")
synset = temp[0]
swn_synset = swn.senti_synset(synset.name())
-swn_synset.neg_score()

-0.625

In [None]:
# Embedding initialised from the Word2Vec+sentiment matrix; the weights stay trainable.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=128,
    weights=[embedding_matrix],
    input_length=128,
    trainable=True,
)

In [None]:
model = Sequential()
model.add(embedding_layer)
# Use only the final output of each LSTM direction. With return_sequences=True the
# network produced one sigmoid score per time step (output shape (None, 128, 1)),
# while the labels hold a single value per tweet (shape (n_samples, 1)).
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
# One sigmoid unit: a single score per tweet.
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 128)          9871744   
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 256)          263168    
_________________________________________________________________
dropout (Dropout)            (None, 128, 256)          0         
_________________________________________________________________
dense (Dense)                (None, 128, 1)            257       
Total params: 10,135,169
Trainable params: 10,135,169
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Binary cross-entropy for the single sigmoid output; accuracy is reported alongside the loss.
model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])

In [None]:
from keras import callbacks
# Stop as soon as the validation loss fails to improve for one epoch and roll the
# model back to the best weights seen. EarlyStopping is the class already imported
# from keras.callbacks at the top of the notebook.
earlystopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=1,
    restore_best_weights=True,
)

In [None]:
%%time
# Train with the last 30% of the training data held out for validation. The batch size
# comes from BATCH_SIZE (the constant model.evaluate also uses) so the two cannot drift.
# NOTE(review): epochs stays at the literal 25; the EPOCHS constant holds a different
# value (8), so switching to it would change the run -- confirm which is intended.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=25,
                    validation_split=0.3,
                    verbose=1,
                    callbacks=[earlystopping])

Epoch 1/25
Epoch 2/25
CPU times: user 26min 10s, sys: 1min 43s, total: 27min 53s
Wall time: 14min 47s


In [None]:
%%time
# Evaluate on the held-out split; `score` is [loss, accuracy] in compile order.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
test_loss, test_accuracy = score[0], score[1]
print()
print("ACCURACY:", test_accuracy)
print("LOSS:", test_loss)


ACCURACY: 0.8311799168586731
LOSS: 0.3815140426158905
CPU times: user 1min 10s, sys: 3.8 s, total: 1min 13s
Wall time: 41 s


In [None]:
#def predict(text):
#    start_at = time.time()
#    # Tokenize text
#    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=128)
#    # Predict
#    score = model.predict([x_test])


#    return (score) 

In [None]:
# Persist the trained network and the Word2Vec model.
model.save("project_model.h5")
w2v_model.save("project.w2v")

# Also persist the fitted Keras tokenizer and label encoder: without the tokenizer's
# word index the saved network cannot be applied to new text. The file names come
# from the EXPORT constants declared in the configuration cell.
with open(TOKENIZER_MODEL, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open(ENCODER_MODEL, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

2021-09-15 10:19:25,728 : INFO : saving Word2Vec object under project.w2v, separately None
2021-09-15 10:19:25,732 : INFO : not storing attribute vectors_norm
2021-09-15 10:19:25,734 : INFO : not storing attribute cum_table
2021-09-15 10:19:25,782 : INFO : saved project.w2v
