# Twitter Sentiment Analysis
This pipeline is based on the Kaggle notebook https://www.kaggle.com/code/paoloripamonti/twitter-sentiment-analysis <br>


In [48]:
#!sudo pip3 uninstall gensim

In [49]:
#!pip install gensim
#!pip3 install keras==2.7.0rc2
#!pip install -U scikit-learn scipy statsmodels
#!pip3 install torchtext
#!pip3 install imblearn

In [50]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Pre-trained embeddings
import torchtext.vocab as vocab
import torch

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils

from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import tensorflow as tf

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import pickle

# Drive things
#from google.colab import drive

# Oversampling 
from imblearn.over_sampling import RandomOverSampler


In [51]:
# Make sure the NLTK stop-word corpus is available locally; the English list
# is loaded from it further down and used when preprocessing the texts.
nltk.download('stopwords')
#drive.mount('/content/myDrive')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christianrasmussen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
#cd myDrive/MyDrive/Colab Notebooks/project/notebooks

In [53]:
#use_tpu = True #@param {type:"boolean"}
#
#if use_tpu:
#    assert 'COLAB_TPU_ADDR' in os.environ, 'Missing TPU; did you request a TPU in Notebook Settings?'
#
#if 'COLAB_TPU_ADDR' in os.environ:
#  TF_MASTER = 'grpc://{}'.format(os.environ['COLAB_TPU_ADDR'])
#else:
#  TF_MASTER=''
## Model specific parameters
#
## TPU address
#tpu_address = TF_MASTER
#

### Settings

In [54]:
# Set to True to load a previously saved model from `model_path` instead of
# training a new one from scratch.
# Preferred when running on local hardware.
pretrained_model = True

# Choose which domain to use as the source domain; this must be one of the
# keys of the `source_domain` dictionary defined in the "Read Dataset" section.
source = 'NB'

In [55]:
# Reproducibility: seed TensorFlow's global random generator. The same seed is
# also passed to the oversampler when the training data is resampled.
seed = 1234
tf.random.set_seed(seed)

# TEXT CLEANING
# Matches @mentions, http(s) links and any run of non-alphanumeric characters;
# every match is replaced by a single space during preprocessing.
# Written as a raw string so that `\S` reaches the regex engine untouched
# instead of relying on Python's handling of an invalid string escape (which
# emits a warning on current interpreters). The pattern value is unchanged.
# NOTE(review): the third alternative (`http?:\S`) looks like a leftover typo;
# it is kept so the cleaning result stays exactly as before.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# EMBEDDINGS: dimensionality of the pre-trained word vectors
embedding_dim = 300

# model settings
SEQUENCE_LENGTH = 300  # every text is padded/truncated to this many tokens
epochs = 30
batch_size = 75

# EXPORT: file name used when a newly trained model is saved. It is derived
# from the settings above so the name cannot drift from the values actually
# used (with the defaults this is still "<source>-ros-e30-bs75.h5").
KERAS_MODEL = f"{source}-ros-e{epochs}-bs{batch_size}.h5"

### Read Dataset

In [56]:
# Relative locations of the datasets. `source_domain` maps each selectable
# source-domain key to its synthesized training CSV.
source_domain = {
    "LR": "../data_synthesising/synthesized_data/LR_data.csv",
    "NB": "../data_synthesising/synthesized_data/NB_data.csv",
    "JS": "../data_synthesising/synthesized_data/JS_data.csv",
    "EA": "../data_synthesising/synthesized_data/equal_amounts.csv",
}
validation_domain = "../data_synthesising/val_data/kindle_reviews.csv"
test_domain = "../data_synthesising/test_data/amazon_reviews.csv"

# Saved model that belongs to the selected source domain
model_path = f"../pretrained_models/{source}-ROS-E30-BS75.h5"

# Load the three datasets: the selected source domain for training, plus the
# validation and test review sets.
train = pd.read_csv(source_domain[source])  # Change `source` to swap out the source domain
test = pd.read_csv(test_domain)  # Amazon reviews (test domain)
val = pd.read_csv(validation_domain)  # Kindle reviews (validation domain)


# Randomly oversample the training frame so the label classes are balanced.
# Only the resampled frame is kept: it still contains the `label` column, so
# the separately returned label vector is not needed.
ros = RandomOverSampler(random_state=seed)
train = ros.fit_resample(train, train['label'])[0]

# Label each line so the three sizes can be told apart in the output.
print("Train size (after oversampling):", len(train))
print("Validation size:", len(val))
print("Test size:", len(test))


Dataset size: 8074
Dataset size: 1000
Dataset size: 1000


### Pre-Process dataset

In [57]:
# Stored as a set: preprocessing tests every token for membership, which is a
# constant-time lookup on a set but a linear scan on the original list.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

In [58]:
def preprocess(text, stem=False):
    """Clean one text and return its remaining tokens joined by single spaces.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a space. Tokens found in
    ``stop_words`` are dropped; when ``stem`` is true the surviving tokens are
    passed through ``stemmer.stem``.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = (stemmer.stem(word) for word in kept)
    return " ".join(kept)

In [59]:
# Clean the text column of every split with the same preprocessing function.
train["text"] = train["text"].apply(preprocess)
test["text"] = test["text"].apply(preprocess)
val["text"] = val["text"].apply(preprocess)

### Train, test and validation frames

In [60]:
# Expose the three frames under the df_* names used by the following cells.
df_train, df_test, df_val = train, test, val
print("TRAIN size:", len(df_train))

TRAIN size: 8074


### Pre-trained word embeddings (GloVe)

In [61]:
# Pretrained attempt: load the GloVe "6B" vectors at the configured dimension.
glove = vocab.GloVe(name='6B', dim=embedding_dim)
def get_word(word):
    """Return the pre-trained vector stored for ``word``.

    The word is first mapped to its row through ``glove.stoi``; a word that is
    missing from that mapping makes the lookup fail (callers handle this).
    """
    row = glove.stoi[word]
    return glove.vectors[row]
def closest(vec, n=10):
    """
    Return the ``n`` vocabulary words whose vectors lie nearest to ``vec``.

    The result is a list of ``(word, distance)`` pairs in ascending order of
    ``torch.dist``. Note that one distance is computed per entry of
    ``glove.itos``, so a call walks the whole vocabulary.
    """
    scored = []
    for candidate in glove.itos:
        scored.append((candidate, torch.dist(vec, get_word(candidate))))
    scored.sort(key=lambda pair: pair[1])
    return scored[:n]
def print_tuples(tuples):
    """Print every ``(word, distance)`` pair as ``(distance) word``, one per line.

    The distance is formatted with four decimal places.
    """
    for pair in tuples:
        word, distance = pair[0], pair[1]
        print('(%.4f) %s' % (distance, word))

2022-05-27 02:49:27,135 : INFO : Loading vectors from .vector_cache/glove.6B.300d.txt.pt


### Tokenize Text

In [62]:
print(type(df_train.text))
# Build one shared vocabulary from all three splits, fitted in the order
# train, test, validation.
# NOTE(review): a saved model's embedding rows are presumably indexed by this
# vocabulary, so changing what is fitted (or the order) would need a retrain.
tokenizer = Tokenizer()
for split_frame in (df_train, df_test, df_val):
    tokenizer.fit_on_texts(split_frame.text)
# One slot more than the number of known words.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

<class 'pandas.core.series.Series'>
Total words 29320


In [63]:
def _to_padded(texts):
    """Encode texts with the fitted tokenizer and pad them to SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)

# Model inputs (padded index sequences) and targets (label column) per split.
x_train, y_train = _to_padded(df_train.text), df_train['label']
x_test, y_test = _to_padded(df_test.text), df_test['label']
x_val, y_val = _to_padded(df_val.text), df_val['label']

### Embedding layer

In [64]:
# Row i holds the pre-trained vector of the word with tokenizer index i.
# Rows that are never assigned (words without a pre-trained vector, and any
# index not present in `word_index`) stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = get_word(word)
    except KeyError:
        # The word has no pre-trained vector: keep its zero row. Only the
        # failed lookup is caught, so unrelated problems (for example a
        # dimension mismatch) are no longer silently swallowed.
        continue
print(embedding_matrix.shape)

(29320, 300)


In [65]:
# Embedding layer initialised from the pre-trained matrix and kept frozen
# (trainable=False) during training.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [66]:
#resolver = tf.distribute.cluster_resolver.TPUClusterResolver(TF_MASTER)
#tf.config.experimental_connect_to_cluster(resolver)
#tf.tpu.experimental.initialize_tpu_system(resolver)
#strategy = tf.distribute.experimental.TPUStrategy(resolver)


### Training and building our model:

In [70]:
if not pretrained_model:
    # Fraction of the training data Keras holds out for validation. 0.0 is the
    # Keras default and trains on every sample, exactly as before; set e.g.
    # 0.1 to validate on the source domain.
    # NOTE: Keras takes the split from the END of the arrays before shuffling,
    # so shuffle x_train / y_train first if you enable it.
    validation_split = 0.0

    # `val_*` metrics only exist when validation data is available. Without
    # it, callbacks monitoring `val_loss` / `val_acc` never fire (and the
    # metric key is `val_accuracy`, not `val_acc`), so fall back to the
    # training metrics in that case.
    monitor_prefix = 'val_' if validation_split > 0 else ''

    # Frozen embeddings -> dropout -> single LSTM -> sigmoid output for
    # binary sentiment.
    model = Sequential()
    model.add(embedding_layer)
    model.add(Dropout(0.5))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself; wrapping it in print() only adds a
    # stray "None" line.
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])
    callbacks = [ReduceLROnPlateau(monitor=f'{monitor_prefix}loss', patience=5, cooldown=0),
                 EarlyStopping(monitor=f'{monitor_prefix}accuracy', min_delta=1e-4, patience=5)]
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=validation_split,
                        verbose=1,
                        callbacks=callbacks)
else:
    # Reuse the saved model for the selected source domain instead of training.
    model = keras.models.load_model(model_path)

In [95]:
# (Removed a leftover interactive `LSTM?` help lookup: it is IPython-only
# syntax used while developing and is not part of the pipeline.)

### Evaluate

In [87]:
# Evaluate on both held-out sets; each result is indexed as [loss, accuracy].
score1 = model.evaluate(x_val, y_val, batch_size=50)
score = model.evaluate(x_test, y_test, batch_size=50)
for split_name, split_result in (("VALIDATION", score1), ("TEST", score)):
    print()
    print(f"{split_name} ACCURACY:", split_result[1])
    print(f"{split_name} LOSS:", split_result[0])


VALIDATION ACCURACY: 0.7829999923706055
VALIDATION LOSS: 0.9311715364456177

TEST ACCURACY: 0.7229999899864197
TEST LOSS: 1.3022006750106812


In [72]:
if not pretrained_model:
    # List the metrics Keras recorded during training.
    for metric_name in history.history:
        print(metric_name)
    acc = history.history['accuracy']
    #val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    #val_loss = history.history['val_loss']

    # Use a dedicated name for the x-axis values. Assigning them to `epochs`
    # would replace the integer training setting with a range object, so
    # re-running the training cell afterwards would pass the wrong value to
    # model.fit().
    epoch_range = range(len(acc))

    plt.plot(epoch_range, acc, 'b', label='Training acc')
    plt.title('Training accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epoch_range, loss, 'b', label='Training loss')
    plt.title('Training loss')
    plt.legend()
    plt.show()

### Predict

In [73]:
def predict(text, include_neutral=False):
    """Predict the binary sentiment label (0 or 1) of a single text.

    Parameters
    ----------
    text : str
        Raw text to classify. It is cleaned with ``preprocess`` first so it
        goes through the same pipeline as the training data.
    include_neutral : bool, optional
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    int
        The model's sigmoid output rounded to the nearest integer.
    """
    # Same cleaning as the training texts (the original skipped this step and
    # also called time.time() without `time` being imported, which raised a
    # NameError on every call).
    cleaned = preprocess(text)
    # Tokenize text
    sequence = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=SEQUENCE_LENGTH)
    # Predict
    score = model.predict([sequence])[0]
    # Decode sentiment
    return int(round(score[0], 0))

In [74]:
# Test set: true labels and the model's scores rounded to hard 0/1 labels.
y_test_1d = list(df_test.label)
scores = model.predict(x_test, batch_size=8000, verbose=1)
y_pred_1d = [int(round(row[0], 0)) for row in scores]

# Validation set: same procedure.
y_val_1d = list(df_val.label)
scores = model.predict(x_val, batch_size=8000, verbose=1)
y_pred_1d_val = [int(round(row[0], 0)) for row in scores]



In [93]:
# Printing some examples of sentences our model got wrong from the validation data.
# The raw CSV is re-read under its own name so the original (un-preprocessed)
# text can be shown without overwriting the preprocessed `val` frame.
val_raw = pd.read_csv(validation_domain)
# Inspect rows 200-299, clipped to the number of available labels.
for i in range(200, min(300, len(y_val_1d))):
    # y_val_1d holds the ground truth and y_pred_1d_val the model output
    # (the original had these two assignments swapped).
    true = y_val_1d[i]
    pred = y_pred_1d_val[i]
    if pred != true:
        print(val_raw['text'][i],f'\nTrue: {true}, Predicted as: {pred}\n{i}\n')

First this is an excellent kindle edition as it is true to the original and contains no scanning errors also it contains copies of the original artwork which can be zoomed to view in addition there is a glossary which contains words or phrases that may not be familiar and I found myself using the links which were underlined. Also included in the book complete with photographs is a comprehensive biography of L Ron Hubbard.The actual story is typical of the type written at the time when authors had to bash out stories in minimal time to meet editor's time scales and often paid by the word.  I have now read 4 of these books and previously thought that the "Iron Duke" was the weakest however I do not think this one is as good. It is a very short story probably best described as fantasy; from the cover picture I thought it was going to be a "spaceship" type science fiction. The story centres around a professor who discovers a formula which when allows him to instantly teleport to any destin

### Classification Report

In [75]:
# Per-class precision / recall / F1 for the validation and test predictions.
val_report = classification_report(y_val_1d, y_pred_1d_val)
test_report = classification_report(y_test_1d, y_pred_1d)

print('Classification report for predicting on validation data:\n', val_report, '\n\n')

print('Classification report for predicting on test data:\n', test_report)

Classification report for predicting on validation data:
               precision    recall  f1-score   support

           0       0.76      0.64      0.69       382
           1       0.80      0.87      0.83       618

    accuracy                           0.78      1000
   macro avg       0.78      0.76      0.76      1000
weighted avg       0.78      0.78      0.78      1000
 


Classification report for predicting on test data:
               precision    recall  f1-score   support

           0       0.85      0.56      0.68       514
           1       0.66      0.90      0.76       486

    accuracy                           0.72      1000
   macro avg       0.75      0.73      0.72      1000
weighted avg       0.76      0.72      0.72      1000



### Save model

In [76]:
# Save the model only when it was trained in this run; a model that was
# loaded from disk is not written out again.
if not pretrained_model:
    model.save(KERAS_MODEL)