# Sentiment Analysis model development v0.2
### For TrendSentimentAnalysis (tsa.)
Joyster Rodrigues | 2018487

#Importing Dependencies & Datasets, etc.

In [None]:
#imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec, Phrases
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
#import data
# Load the IMDB reviews CSV; the columns used below are 'review' and 'sentiment'.
dataset = pd.read_csv('/content/drive/MyDrive/AI/ML/datasets/imdb_dataset/IMDB Dataset.csv')
# head has to be *called*: printing the bare attribute shows the bound-method
# repr instead of the first rows.
print(dataset.head())

In [None]:
# Sanity check: the two columns used below should report the same length.
print(*(len(dataset[column]) for column in ('review', 'sentiment')))

##Pre-processing dataset

In [None]:
def sentiment_id(sentiment):
  """Map a sentiment label to a binary class id.

  Returns 1 when the label is exactly 'positive' and 0 for anything else.
  """
  return 1 if sentiment == 'positive' else 0

In [None]:
# Work on a copy so the raw `dataset` frame stays untouched, then derive the
# numeric label column from the text label.
data_df = dataset.copy()
data_df['sentiment_id'] = data_df['sentiment'].apply(sentiment_id)

In [None]:
#shuffle data
# `DataFrame.sample` has to be called and its result kept: the bare attribute
# access was a no-op, so the positional train/test split further down ran on
# unshuffled rows. frac=1 returns every row in random order, the fixed seed
# keeps the split reproducible, and reset_index restores a clean 0..n-1 index.
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Display the frame (notebook cell output) to check the added sentiment_id column.
data_df

In [None]:
# Summary statistics of data_df (notebook cell output).
data_df.describe()

In [None]:
# Total number of reviews; the 80/20 split below is computed from this count.
len(data_df.review) 

In [None]:
#assign dataset & split
# Positional 80/20 split: the first 80% of rows train the model, the rest test it.
split_data = round(0.80 * len(data_df.review))

x_train, x_test = data_df.review[:split_data], data_df.review[split_data:]
y_train, y_test = data_df.sentiment_id[:split_data], data_df.sentiment_id[split_data:]

In [None]:
# Sanity check: features and labels must have equal length within each split.
print(len(x_train),len(y_train), ' | ' ,len(x_test),len(y_test))

In [None]:
#preprocessing data
# Fetch the NLTK resources needed by the tokeniser, stop-word list and lemmatiser.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
  nltk.download(nltk_resource)

# Matches every character that is neither an ASCII letter nor whitespace.
REPLACE_WITH_SPACE = re.compile(r'[^A-Za-z\s]')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [None]:
from nltk.tokenize import word_tokenize
def clean_text(raw_input):
  """Strip markup and non-letters from a raw review and lowercase it.

  Args:
    raw_input: Review text, possibly containing HTML.

  Returns:
    Lowercase string in which every character other than an ASCII letter
    or whitespace has been replaced by a space.
  """
  # Tags are replaced by a space: with the default empty separator,
  # "great<br />movie" collapses to "greatmovie" and becomes one bogus token.
  text_only = BeautifulSoup(raw_input, 'lxml').get_text(separator=' ')

  letters_only = REPLACE_WITH_SPACE.sub(' ', text_only)

  return letters_only.lower()


def lemmatize(tokens):
  """Lemmatise tokens and drop English stop words.

  Each token is lemmatised with the lemmatizer's default part of speech and
  then again as a verb ('v'); lemmas found in `stop_words` are discarded.

  Args:
    tokens: Iterable of word tokens.

  Returns:
    List of the remaining lemmas, in input order.
  """
  kept = []
  for token in tokens:
    lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(token), 'v')
    if lemma not in stop_words:
      kept.append(lemma)
  return kept


def pre_processing(review):
  """Turn one raw review into a list of lemmatised, stop-word-free tokens.

  Args:
    review: Raw review text.

  Returns:
    List of tokens produced by clean_text -> word_tokenize -> lemmatize.
  """
  cleaned = clean_text(review)

  tokens = word_tokenize(cleaned)

  return lemmatize(tokens)


In [None]:
# Kept as plain lists of token lists: reviews differ in length, and np.array
# over ragged nested lists raises ValueError on NumPy >= 1.24 (older versions
# only warned). The consumers below just need an iterable of token lists.
x_train_clean = [pre_processing(review) for review in x_train]
x_test_clean = [pre_processing(review) for review in x_test]

In [None]:
#ngrams transformation
# First pass: learn a phrase model from the cleaned training tokens.
bigrams = Phrases(sentences= x_train_clean)
# Second pass: learn a phrase model on the training tokens after the first
# model has been applied, so it can extend the phrases found in the first pass.
trigrams = Phrases(sentences=bigrams[x_train_clean])

In [None]:
# Word2Vec embeddings trained on the training reviews after the first phrase pass.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before changing it.
bigram_model = Word2Vec(
    sentences = bigrams[x_train_clean],
    size = 256,
    min_count=3, window=5, workers=4
)

In [None]:
# Word2Vec embeddings trained on the training reviews after both phrase passes.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before changing it.
trigram_model = Word2Vec(
    sentences = trigrams[bigrams[x_train_clean]],
    size = 256,
    min_count=3, window=5, workers=4
)

In [None]:
# Training reviews with both phrase passes applied; the tokenizer is fitted on this.
x_train_ngram = trigrams[bigrams[x_train_clean]]

In [None]:
#lemmatize & stemming
#alternate method

In [None]:
# #text to vector / tokenization (Method1)
# tokenizer = Tokenizer(num_words=10000, oov_token='<00V>')
# tokenizer.fit_on_texts(x_train_clean)

# word_index = tokenizer.word_index

# x_train_seq = tokenizer.texts_to_sequences(x_train_clean)
# x_train_pad = pad_sequences(x_train_seq, maxlen=150, padding='post', truncating='post')

In [None]:
#text to vector / tokenization (main)
# NOTE(review): oov_token is spelled with digit zeros ('<00V>'); confirm
# whether '<OOV>' was intended.
tokenizer1 = Tokenizer(num_words=10000, oov_token='<00V>')
# Build the vocabulary from the phrase-transformed training reviews only.
tokenizer1.fit_on_texts(x_train_ngram)

word_index = tokenizer1.word_index

# Convert tokens to integer ids, then pad/truncate at the end to 150 ids,
# the same length the Embedding layer is given as input_length.
x_train_seq = tokenizer1.texts_to_sequences(x_train_ngram)
x_train_pad = pad_sequences(x_train_seq, maxlen=150, padding='post', truncating='post')

In [None]:
# Run the test reviews through the same phrase models as the training data:
# tokenizer1 was fitted on phrase-joined tokens, so feeding it plain tokens
# gives the test set a different representation from the one trained on.
x_test_ngram = trigrams[bigrams[x_test_clean]]
x_test_seq = tokenizer1.texts_to_sequences(x_test_ngram)
x_test_pad = pad_sequences(x_test_seq, maxlen=150, padding='post', truncating='post')

In [None]:
#np arrays for tf2.0+


# Model Architecture v0.2
Uses a bidirectional LSTM model for sentiment analysis.

In [None]:
#model architecture (LSTM)
from tensorflow.keras.layers import LSTM

# Build the embedding matrix in *tokenizer* index order. The padded sequences
# hold tokenizer1 indices, which are unrelated to Word2Vec's own row order, so
# passing wv.vectors straight in attaches each word to some other word's
# vector (and overflows input_dim whenever the Word2Vec vocabulary is smaller
# than the tokenizer's). trigram_model is used because tokenizer1 was fitted
# on the trigram-transformed corpus.
vocab_size = tokenizer1.num_words  # texts_to_sequences only emits ids below this
embedding_dim = trigram_model.wv.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
for word, index in tokenizer1.word_index.items():
  # Row 0 (padding), the OOV token and words Word2Vec dropped via min_count
  # keep their all-zero vector.
  if index < vocab_size and word in trigram_model.wv:
    embedding_matrix[index] = trigram_model.wv[word]

model1 = tf.keras.Sequential([
    # Frozen pretrained embeddings; 150 matches the maxlen used when padding.
    tf.keras.layers.Embedding(
        input_dim = vocab_size,
        output_dim = embedding_dim,
        input_length = 150,
        embeddings_initializer = tf.keras.initializers.Constant(embedding_matrix),
        trainable = False
    ),
    tf.keras.layers.Bidirectional(LSTM(128)),
    tf.keras.layers.Dropout(rate = 0.25),
    # NOTE(review): this Dense layer has no activation, so it is purely
    # linear — confirm whether e.g. 'relu' was intended.
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dropout(rate = 0.25),
    # Single sigmoid unit: output is the predicted probability of 'positive'.
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

In [None]:
#compile model
# Binary cross-entropy pairs with the single sigmoid output unit of model1.
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# Print the layer-by-layer summary of model1.
model1.summary()

In [None]:
#fit on data
# The held-out split is passed as validation_data, so the reported val_*
# metrics are measured on the test set itself.
ep0chs = 2
hist = model1.fit(x_train_pad, y_train, epochs=ep0chs, validation_data=(x_test_pad, y_test))

In [None]:
#model results

#Evaluating model with quantitative + qualitative analysis

##Model architecture diag.

In [None]:
# Render the layer graph of model1 to model1.png (top-to-bottom layout,
# layer names shown, shapes/dtypes/activations hidden).
tf.keras.utils.plot_model(
    model1,
    to_file='model1.png',
    show_shapes=False,
    show_dtype=False,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=False,
    dpi=96,
    layer_range=None,
    show_layer_activations=False
)

## Graphs of accuracy and validation loss, etc.

##confusion matrix

##sensitivity analysis

#Testing model

In [None]:
#model eval
review1 = 'i am in love with this iPhone 12. It is such a beautiful piece of hardware and a sophisticated blend of software, haramonising together to give the best possible smartphone experience.'

review2 = 'i cannot believe i spend so much money on this chair. the quality is poor, it could do much better. '

# Two hand-written samples used to spot-check the trained model.
test_sub = [review1, review2]

In [None]:
# Same pipeline as the training data: clean -> phrase models -> tokenizer -> pad.
# A plain list is used because np.array over token lists of different lengths
# raises ValueError on NumPy >= 1.24.
test_sub_clean = [pre_processing(review) for review in test_sub]
test_sub_seq = tokenizer1.texts_to_sequences(trigrams[bigrams[test_sub_clean]])
test_sub_pad = pad_sequences(test_sub_seq, maxlen=150, padding='post', truncating='post')

In [None]:
#def clean_tok_pad(test_sentences): 
#function like above

In [None]:
# Raw sigmoid outputs for the sample reviews (notebook cell output).
model1.predict(test_sub_pad)

In [None]:
def translate_emo(numeric_rep):
  """Translate model scores into sentiment labels.

  Args:
    numeric_rep: Iterable of scores, one per input text.

  Returns:
    List with one label per score: 'positive' when the score is above 0.5,
    otherwise 'negative' (the same rule `emo` applies). Empty input gives
    an empty list.
  """
  # The result list is created once, before the loop, and returned only
  # after every score has been visited; previously it was created inside the
  # 'positive' branch and returned during the first iteration.
  return ['positive' if item > 0.5 else 'negative' for item in numeric_rep]

In [None]:
# Keep the raw scores for the sample reviews so they can be labelled below.
model_output = model1.predict(test_sub_pad)
print(model_output)

In [None]:
def emo(emo_in):
  """Label each score as 'positive' (strictly above 0.5) or 'negative'.

  Args:
    emo_in: Iterable of scores.

  Returns:
    List of labels in the same order as the scores.
  """
  return ['positive' if score > 0.5 else 'negative' for score in emo_in]

In [None]:
# Human-readable labels for the sample reviews.
print(emo(model_output))

In [None]:
#just more tests: quick

In [None]:
example_text = 'this is the worst day of my life. iOS 16 is killing my battery.'
demo_only = [example_text]
# texts_to_sequences expects a list of texts: handing it the bare string made
# it iterate character by character. The sample now goes through the same
# clean -> phrase models -> tokenizer -> pad(150) pipeline as the training data.
demo_only_clean = [pre_processing(text) for text in demo_only]
out = tokenizer1.texts_to_sequences(trigrams[bigrams[demo_only_clean]])
out = pad_sequences(out, maxlen=150, padding='post', truncating='post')
model_out2 = model1.predict(out)
print(emo(model_out2))

# (Testing) tsa performing sentiment analysis on the 20 most recent tweets

In [None]:
# Load the tweets to analyse; the 'Tweet' column is read in the next cell.
df_demo = pd.read_csv('/content/results.csv')
print(df_demo)

In [None]:
# Drop URLs (everything from 'http' up to the next whitespace) from each tweet.
demo_test = [re.sub(r'http\S+', '', tweet) for tweet in df_demo['Tweet']]

In [None]:
# Display the URL-stripped tweets (notebook cell output).
demo_test

In [None]:
# Same pipeline as the training data: clean -> phrase models -> tokenizer -> pad.
# A plain list is used because np.array over token lists of different lengths
# raises ValueError on NumPy >= 1.24.
demo_out_clean = [pre_processing(tweet) for tweet in demo_test]
demo_out_seq = tokenizer1.texts_to_sequences(trigrams[bigrams[demo_out_clean]])
demo_out_pad = pad_sequences(demo_out_seq, maxlen=150, padding='post', truncating='post')

In [None]:
# Score every tweet and print the corresponding labels.
demo_results = model1.predict(demo_out_pad)
print(emo(demo_results))

In [None]:
# Tally the labels through emo() so the counts use exactly the same threshold
# as the printed labels (the previous `>= 0.5` counted a score of exactly 0.5
# as positive while emo() labels it negative).
demo_labels = emo(demo_results)
pos = demo_labels.count('positive')
neg = len(demo_labels) - pos


#tsa. development / integration

#Generating Graphs for tsa.

##bar chart

In [None]:
# Bar chart of the positive vs. negative tweet counts.
sents = ['positive', 'negative']
scores = [pos,neg]
colors = ['green', 'red']

fig = plt.figure()
bar = fig.add_axes([0,0,1,1])
bar.bar(sents, scores, color=colors)
bar.set_title('iOS 16.3.1 tsa results')
bar.set_xlabel('sentiments')
bar.set_ylabel('score')
plt.show()

##pie chart

In [None]:
# Pie chart of the same counts; `scores` is [pos, neg], matching the label order.
pie_labels = ['positive', 'negative']
pie = plt.pie(scores, labels=pie_labels)
plt.title('iOS 16.3.1 tsa results')
plt.show()

##generate wordcloud

In [None]:
# import / 
import pandas as pd
import matplotlib.pyplot as plt

# TODO: the CSV path is empty, so read_csv cannot succeed as written; point it
# at the file that provides the CONTENT column read in the next cell.
df_demo = pd.read_csv('')
# head has to be called to preview the first rows; the bare attribute only
# shows the bound-method repr.
df_demo.head()

In [None]:
# prep data:
# Lowercase every whitespace-separated token of each CONTENT entry and glue
# all entries into one string, each entry followed by a single space.
combi_string = ''.join(
    ' '.join(word.lower() for word in str(entry).split()) + ' '
    for entry in df_demo.CONTENT
)

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Use a distinct name: rebinding `stopwords` would shadow the nltk.corpus
# `stopwords` imported at the top of the notebook, so re-running the
# preprocessing cell afterwards would fail on `stopwords.words('english')`.
wordcloud_stopwords = set(STOPWORDS)

# Square white-background cloud built from the combined lowercase text.
wordcloud = WordCloud(
    width = 1000,
    height = 1000,
    background_color = 'white',
    stopwords = wordcloud_stopwords,
    min_font_size = 10
).generate(combi_string)

In [None]:
# Show the generated word cloud as an image, without axes or padding.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad = 0)

plt.show()