[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/junelsolis/AAU-Machine-Learning/HEAD)

# Initial setup and data import

In [None]:
!pip install tweet-preprocessor nltk keras-tuner textblob ipywidgets wordcloud swifter keras sklearn tensorflow tqdm pv autocorrect

In [None]:
# Make default library imports
import pandas as pd
import numpy as np
import preprocessor as p
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import swifter

tqdm.pandas()

plt.style.use('fivethirtyeight')

%matplotlib inline

In [None]:
# Read the tab-separated tweet sample into a DataFrame and preview its first rows
data = pd.read_csv('Sentiment140.tenPercent.sample.tweets.tsv', delimiter='\t')
data.head()


In [None]:
# Check for null values in the data
print('Null values present in labels: ' + str(data['sentiment_label'].isnull().values.any()))
print('Null values present in tweet text: ' + str(data['tweet_text'].isnull().values.any()))
print()

# Plot label histogram.
# Count rows per label by summing the boolean mask. Series.where() keeps the
# original length (non-matching rows become NaN), so taking len() of it would
# report the total row count for every label.
label_counts = [int((data['sentiment_label'] == label).sum()) for label in (0, 4)]

plt.title('Distribution of sentiment values')
plt.bar(['0', '4'], label_counts)


# Preprocess tweets

In [None]:
# Import NLTK dependencies
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))

import random

In [None]:
from textblob import TextBlob, Word
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from autocorrect import Speller
from os import path
import swifter

### Text processing functions

In [None]:
def process_tweet(tweet, max_word_length=40):
  """Normalise a tweet to lower-case alphabetic words separated by single spaces.

  Steps, in order:
    1. every character that is not an ASCII letter becomes a space;
    2. the text is lower-cased;
    3. stand-alone single letters are removed;
    4. words longer than `max_word_length` characters are dropped;
    5. whitespace is collapsed, with no leading or trailing space in the result.

  Args:
    tweet: raw tweet text.
    max_word_length: longest word (in characters) that is kept. Defaults to 40.

  Returns:
    The cleaned tweet; an empty string if nothing survives the filters.
  """
  # Remove punctuation, numbers and any other non-letter characters
  clean_tweet = re.sub('[^a-zA-Z]', ' ', tweet)

  # Convert to lower case
  clean_tweet = clean_tweet.lower()

  # Single character removal. Lookarounds are used instead of consuming the
  # surrounding whitespace so that letters at the very start/end of the tweet
  # and runs of adjacent single letters ("a b c") are removed as well.
  clean_tweet = re.sub(r'(?<![a-z])[a-z](?![a-z])', ' ', clean_tweet)

  # split() with no argument collapses whitespace runs and ignores leading and
  # trailing blanks, so joining the surviving words yields a tidy string.
  kept_words = [w for w in clean_tweet.split() if len(w) <= max_word_length]

  return ' '.join(kept_words)


# Maps the first letter of a TextBlob POS tag to the POS code handed to Word.lemmatize
tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

########################
########################
def lemmatize(tweet):
  """Lemmatize every word of `tweet` using its part-of-speech tag.

  Tags whose first letter is not in `tag_dict` are lemmatized as nouns ('n').
  Returns the lemmas joined by single spaces.
  """
  lemmas = []
  for token, pos_tag in TextBlob(tweet).tags:
    pos_code = tag_dict.get(pos_tag[0], 'n')
    lemmas.append(Word(token).lemmatize(pos_code))

  return " ".join(lemmas)


########################
########################
ps = PorterStemmer()

def stem(tweet):
  """Stem each whitespace-separated word of `tweet` with the shared Porter stemmer.

  Returns the stems joined by single spaces.
  """
  return " ".join(ps.stem(token) for token in tweet.split())

########################
########################
def remove_stop_words(tweet):
  """Tokenize `tweet` and drop every token that appears in `stop_words`.

  Returns the remaining tokens joined by single spaces.
  """
  kept_tokens = [token for token in word_tokenize(tweet) if token not in stop_words]
  return " ".join(kept_tokens)

########################
########################
def correct_spelling(tweet):
    """Return `tweet` after TextBlob's spelling correction, converted back to a plain str."""
    corrected = TextBlob(tweet).correct()
    return str(corrected)


########################
########################
def empty_single_word_tweets(tweet):
    """Return `tweet` unchanged if it has more than one word, otherwise ''.

    Words are counted with str.split() (no argument), so leading, trailing and
    repeated whitespace never count as extra words. A tweet such as ' hello '
    is therefore correctly treated as a single word.
    """
    return tweet if len(tweet.split()) > 1 else ''



## IMPORTANT ##
In this section, the tweet samples are preprocessed. As it takes a lot of time to do this, the pickled data has been saved to a file called __clean_data.pkl__

If this file exists in the project directory, it is automatically loaded and used for the rest of the notebook. If refreshing the data is needed, then delete the __clean_data.pkl__ file and run the cell below. __Be advised:__ it will take at least half an hour on regular PCs.

In [None]:
!pip install autocorrect
from autocorrect import Speller

In [None]:
# Clean the tweets. 
# Remove the following:
# - URLS
# - Hashtags
# - Mentions
# - Reserved words (RT, FAV)
# - Emojis
# - Smileys
# - Numbers



if path.exists('clean_data.pkl'):
    # NOTE: unpickling can execute arbitrary code; only load a cache file written by this notebook.
    clean_data = pd.read_pickle('clean_data.pkl')

else:

    # Work on a random subset and retain the original data.
    # - the size is capped at the number of available rows, because sample() raises
    #   when asked for more rows than exist;
    # - the seed is fixed so that deleting the cache and re-running yields the same subset;
    # - .copy() makes the subset fully independent of `data` before it is modified.
    clean_data = data.sample(n=min(70000, len(data)), random_state=42).copy()

    # def clean_data.swifter.allow_dask_on_strings(enable=True)

    # Convert labels to binary
    clean_data.loc[clean_data['sentiment_label'] == 4, 'sentiment_label'] = 1


    # Run initial clean with tweet-preprocessor
    print('Initial cleaning...\n')
    clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(p.clean)

    # print('Spelling check...\n')
    # speller = Speller()
    # n = 1000
    # for g, df in clean_data.groupby(np.arange(len(clean_data)) // n):
    #     df['tweet_text'] = df['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: speller(row))

    # Lemmatize
    # print('Lemmatizing...\n')
    # clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: lemmatize(row))

    # Stemming
    # print('Stemming...\n')
    # clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: stem(row))

    # Run more cleaning
    print('More cleaning...\n')
    clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(process_tweet)

    # Remove stop words
    # print('Remove stop words...\n')
    # clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: remove_stop_words(row))

    # Correct spelling
    # speller = Speller()
    # print('Spelling check...\n')
    # clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: speller(row))

    # Remove tweets with only a single word
    # print('Remove single-word tweets...\n')
    # clean_data['tweet_text'] = clean_data['tweet_text'].swifter.allow_dask_on_strings().apply(lambda row: empty_single_word_tweets(row))
    # clean_data.drop(clean_data[clean_data['tweet_text'] == ''].index, inplace=True)



    # Save cleaned data to pickle to save time later
    clean_data.to_pickle('clean_data.pkl')




In [None]:
# Print the per-column non-null counts, then display 15 randomly chosen cleaned tweets
print(clean_data.count())
clean_data.sample(15)

# Support Vector Machine

## Prepare the dataset

In [None]:
# # Split the dataset into training and test
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(clean_data['tweet_text'], clean_data['sentiment_label'], test_size=0.2)

## Vectorize using TF-IDF

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tf_idf_vect = TfidfVectorizer(max_features=2000)
# tf_idf_vect.fit(clean_data['tweet_text'])

# X_train_tf_idf = tf_idf_vect.transform(X_train)
# X_test_tf_idf = tf_idf_vect.transform(X_test)


## Fit data to SVM

In [None]:
# from sklearn import model_selection, naive_bayes, svm


# # Classifier - Algorithm - SVM
# # fit the training dataset on the classifier
# SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', verbose=True)
# SVM.fit(X_train_tf_idf, y_train)

## Measure accuracy

In [None]:
# from sklearn.metrics import accuracy_score
# # predict the labels on validation dataset
# predictions_SVM = SVM.predict(X_test_tf_idf)
# # Use accuracy_score function to get the accuracy
# print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

# Use GloVe

### Download and extract GloVe
 - Do not download if the zip file already exists
 - Do not attempt to extract if the __glove_data__ directory exists

In [None]:
import requests

if path.exists('glove.zip') == False:
    url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip" 
    response = requests.get(url, stream=True)
    total_size_in_bytes= int(response.headers.get('content-length', 0))
    block_size = 1024 #1 Kibibyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open('glove.zip', 'wb') as file:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            file.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("ERROR, something went wrong")

if path.exists('glove_data') == False:
    n_files = !unzip -l glove.zip | grep . | wc -l
    !unzip -o ./glove.zip -d ./glove_data/ | pv -l -s {n_files[0]} > /dev/null

In [None]:
NB_WORDS = 2500   # vocabulary size passed to the tokenizer and used for the embedding matrix
GLOVE_DIM = 100   # dimensionality of the GloVe vectors to load

# Tokenize the text corpus
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# The filter set matches Keras' default. The backslash is written as '\\' because
# '\]' is an invalid escape sequence in a normal string literal, and '|' replaces a
# stray '"' that had slipped in between the braces.
tokenizer = Tokenizer(num_words=NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=" ")
tokenizer.fit_on_texts(clean_data['tweet_text'].values)

# Convert each tweet to a list of word indices, then pad so all rows have the same length
sequences = tokenizer.texts_to_sequences(clean_data['tweet_text'].values)
X = pad_sequences(sequences)

In [None]:

glove_file = 'glove.twitter.27B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
# Read with an explicit UTF-8 encoding so the result does not depend on the platform
# default; the `with` block closes the file even if parsing a line fails.
with open('glove_data/' + glove_file, encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0]
            continue
        # First field is the token, the remaining fields are its vector components
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        emb_dict[word] = vector

In [None]:
# Row r of the matrix holds the GloVe vector of the word whose tokenizer index is r;
# rows for words without a GloVe entry stay all-zero.
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for token, rank in tokenizer.word_index.items():
    # Stop at the first index outside the NB_WORDS vocabulary
    if rank >= NB_WORDS:
        break
    embedding = emb_dict.get(token)
    if embedding is None:
        continue
    emb_matrix[rank] = embedding

# LSTM/RNN

In [None]:
# Split the dataset into training and test
from sklearn.model_selection import train_test_split
# Fixed seed so the train/test partition is identical on every run
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X, clean_data['sentiment_label'], test_size=0.2, random_state=42)

### Configure hyperparameter tuning

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam

def build_model(hp):
    """Build and compile the two-layer LSTM classifier for one hyperparameter trial.

    Tuned hyperparameters (names as registered with the tuner):
      - 'units_1', 'units_2': sizes of the first and second LSTM layers (32..512, step 32)
      - 'learning_rate': one of 1e-2, 1e-3, 1e-4
      - 'optimizer': 'adam' or 'sgd'

    Args:
        hp: the hyperparameter object supplied by the tuner.

    Returns:
        A compiled Sequential model with a frozen embedding layer initialised
        from `emb_matrix`, two LSTM layers and a 2-way softmax output.
    """
    # Local import: only Adam is imported at cell level
    from keras.optimizers import SGD

    # variables to adjust during tuning
    hp_units_1 = hp.Int('units_1', min_value = 32, max_value = 512, step = 32)
    hp_units_2 = hp.Int('units_2', min_value = 32, max_value = 512, step = 32)
    hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4])
    hp_optimizer = hp.Choice('optimizer', ['adam', 'sgd'])

    # Instantiate the optimizer explicitly so the sampled learning rate is applied.
    # Passing the bare string 'adam'/'sgd' to compile() would use the optimizer's
    # default learning rate and ignore the 'learning_rate' hyperparameter.
    optimizer_classes = {'adam': Adam, 'sgd': SGD}
    optimizer = optimizer_classes[hp_optimizer](learning_rate=hp_learning_rate)

    # define model
    model = Sequential()

    # Embedding layer is initialised from the GloVe matrix and frozen
    model.add(Embedding(NB_WORDS, GLOVE_DIM, input_length=X.shape[1]))
    model.layers[0].set_weights([emb_matrix])
    model.layers[0].trainable = False

    model.add(LSTM(hp_units_1, return_sequences = True, dropout = 0.3, recurrent_dropout = 0.2))
    model.add(LSTM(hp_units_2, dropout=0.3, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss=SparseCategoricalCrossentropy(), optimizer=optimizer, metrics=['accuracy'])

    return model

### Hyperparameter tuning
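This cell sets up hyperparameter tuning with Keras Tuner's Hyperband algorithm. The `search` call is currently commented out, so the best hyperparameters are taken from whatever tuning results the tuner finds in the `lstm_tuning` project directory.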

In [None]:
import kerastuner as kt
import tensorflow as tf
import IPython

# Hyperband tuner that builds candidate models with build_model and ranks them by
# validation accuracy; its files live under ./lstm_tuning (directory + project_name).
lstm_tuner = kt.Hyperband(build_model,
                     objective = 'val_accuracy', 
                     max_epochs = 5,
                     factor = 3,
                     directory = './',
                     project_name = 'lstm_tuning')


class ClearTrainingOutput(tf.keras.callbacks.Callback):
  """Keras callback that clears the notebook cell output when a training run ends."""
  def on_train_end(*args, **kwargs):
    # wait=True defers the clear until new output is available
    IPython.display.clear_output(wait = True)

# lstm_tuner.search(X_train_lstm, y_train_lstm, epochs = 1, validation_split=0.3, callbacks = [ClearTrainingOutput()])

# Get the optimal hyperparameters
# NOTE(review): the search call above is commented out, so this presumably relies on trial
# results already stored under ./lstm_tuning — confirm. With no stored trials the [0]
# lookup would fail on an empty list.
best_hps = lstm_tuner.get_best_hyperparameters(num_trials = 1)[0]
best_hps

# print(f"""
# The hyperparameter search is complete. The optimal number of units in the first densely-connected
# layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
# is {best_hps.get('learning_rate')}.
# """)

### Compile and train
Load and use the hyperparameter values gathered during tuning to build the actual model and save to disk as __lstm__

In [None]:
# Compile and train model
import pickle  # used below to persist the training history

# Train only when no saved model is present. The guard checks the same path that
# lstm_model.save() writes to, so a second run actually skips training.
# (os.path is a module, not a callable; the existence test is os.path.exists.)
if not os.path.exists('lstm'):
    lstm_model = lstm_tuner.hypermodel.build(best_hps)
    history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs = 6, validation_split=0.3)

    # Make sure the output directory for the history file exists before writing to it
    os.makedirs('models', exist_ok=True)
    with open('models/lstm_training_history', 'wb') as history_file:
        pickle.dump(history.history, history_file)

    lstm_model.save('lstm')