# Data Preprocessing

The following file contains the code for preprocessing the data.

Before running the code, create the directories "data/cleaned_data/" and "data/fully_preprocessed_data/" so that the intermediate and final data can be saved there. In addition, download the fastText English word vectors from https://fasttext.cc/docs/en/english-vectors.html (choose the wiki-news file with 1 million word vectors) and save the file as "data/fasttext_file/wiki-news-300d-1M.vec".

## Importing libraries

In [None]:
# imports
import os
import re
import glob
import nltk
import h5py
import collections
import numpy as np
import pandas as pd
import pickle as pkl
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from keras.preprocessing import sequence
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt


In [None]:
# part of the code below in this file originates from the following book:
# Gulli, Antonio, Amita Kapoor, and Sujit Pal. Deep Learning with TensorFlow 2 and Keras: Regression, ConvNets, GANs, RNNs, NLP, and More with TensorFlow 2 and the Keras API. Packt Publishing, Limited, 2019, pp. 365-373.

In [530]:
import collections
import numpy as np
import nltk
import matplotlib.pyplot as plt
import pickle as pkl

def load_data():
    """Read the pickled train, test and merged data plus the unknown token.

    Returns:
        A tuple ``(df_train, df_test, df_merged, unk)`` holding the objects
        unpickled from the three files under ``data/cleaned_data/`` and from
        ``unk_token.pkl`` in the current working directory, in that order.
    """
    def _unpickle(path):
        # Each file is opened, fully loaded and closed before the next one.
        with open(path, 'rb') as handle:
            return pkl.load(handle)

    train = _unpickle('data/cleaned_data/train_cleaned.pkl')
    test = _unpickle('data/cleaned_data/test_cleaned.pkl')
    merged = _unpickle('data/cleaned_data/merged_cleaned.pkl')
    unk_token = _unpickle('unk_token.pkl')
    return train, test, merged, unk_token

def preprocess_data(df_train, df_test, df_merged):
    """Enumerate instances per domain and build the domain-to-id dictionary.

    Adds an ``idx_domain`` column to each of the three dataframes (in place)
    holding the running count of the row within its ``domain`` group.

    Args:
        df_train: Training dataframe with a ``domain`` column.
        df_test: Test dataframe with a ``domain`` column.
        df_merged: Merged dataframe with a ``domain`` column.

    Returns:
        A dict mapping every distinct domain of ``df_train``, in sorted order,
        to a consecutive integer id starting at 0.
    """
    for frame in (df_train, df_test, df_merged):
        frame['idx_domain'] = frame.groupby('domain').cumcount()

    # Size the id range from the data instead of assuming exactly 16 domains;
    # with a fixed range, zip() silently dropped every domain past the 16th.
    domains = sorted(set(np.array(df_train['domain'])))
    dict_domain = dict(zip(domains, np.arange(len(domains))))

    return dict_domain

def divide_data(df, dict_domain):
    """Split a dataframe into reviews, labels, encoded domains and instance numbers.

    Args:
        df: Dataframe with ``text``, ``label``, ``domain`` and ``idx_domain``
            columns.
        dict_domain: Mapping from domain value to its encoded id.

    Returns:
        A tuple ``(X, y, domain, idx_domain)``. ``X``, ``y`` and
        ``idx_domain`` are numpy arrays built from the respective columns;
        ``domain`` is a list in which each value is replaced by its id from
        ``dict_domain`` (values without an entry are kept unchanged).
    """
    X = np.array(list(df['text']))
    y = np.array(df['label'])
    raw_domains = np.array(df['domain'])
    idx_domain = np.array(df['idx_domain'])

    encoded = []
    for name in raw_domains:
        # Unknown domains fall through as-is rather than raising.
        encoded.append(dict_domain.get(name, name))
    return X, y, encoded, idx_domain

def parse_reviews(reviews):
    """Tokenize every review and collect token statistics.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A tuple ``(parsed_reviews, nr_words, word_freqs)``: the reviews
        re-joined from their tokens with single spaces, the token count of
        each review, and a ``collections.Counter`` of all tokens seen.
    """
    token_counts = collections.Counter()
    lengths, joined = [], []
    for text in reviews:
        tokens = nltk.word_tokenize(text)
        token_counts.update(tokens)
        joined.append(" ".join(tokens))
        lengths.append(len(tokens))
    return joined, lengths, token_counts

def visualize_data(nr_words_merged):
    """Show a histogram of the number of words per review.

    Args:
        nr_words_merged: Sequence with the word count of each review.
    """
    # 35 bins over the 0-400 word range.
    histogram_options = {'bins': 35, 'range': (0, 400)}
    plt.hist(nr_words_merged, **histogram_options)

    plt.xlabel('Number of words in a review')
    plt.ylabel('Number of reviews')

    plt.show()

 def lookup_word2id(word, word2id):
    """Return word's id if it exists, else return UNK's id."""
    return word2id.get(word, word2id["UNK"])

def load_vectors(embedding_file, word2id, embed_size, unk=None):
    """Build the embedding matrix for ``word2id`` from a pre-trained text file.

    Each usable line of ``embedding_file`` is a token followed by
    ``embed_size`` whitespace-separated floats. Lines with any other number
    of columns (the ``<count> <dim>`` header of fastText ``.vec`` files,
    blank lines, malformed rows) are skipped.

    Args:
        embedding_file: Path to the UTF-8 encoded embedding text file.
        word2id: Mapping from token to row index; must contain ``"PAD"`` and
            ``"UNK"``.
        embed_size: Dimensionality of the embedding vectors.
        unk: Vector to use for the ``"UNK"`` row. When omitted, a
            module-level variable named ``unk`` is used, which is what this
            function relied on before the parameter existed.

    Returns:
        A ``(len(word2id), embed_size)`` float array. Rows of tokens missing
        from the file stay zero, the ``"PAD"`` row is zero, and the
        ``"UNK"`` row is ``unk``.

    Raises:
        NameError: If ``unk`` is not passed and no module-level ``unk`` exists.
    """
    if unk is None:
        try:
            unk = globals()["unk"]
        except KeyError:
            raise NameError(
                "name 'unk' is not defined; pass the UNK embedding via the "
                "'unk' argument"
            ) from None

    embedding = np.zeros((len(word2id), embed_size))
    with open(embedding_file, encoding="utf8") as file:
        for line in file:
            cols = line.strip().split()
            # Skip the header, blank lines and rows of the wrong width; a
            # short row would otherwise be broadcast over the whole row.
            if len(cols) != embed_size + 1:
                continue
            word = cols[0]
            if word in word2id:
                embedding[word2id[word]] = np.array([float(v) for v in cols[1:]])

    # The special tokens are assigned last so file entries cannot override them.
    embedding[word2id["PAD"]] = np.zeros(embed_size)
    embedding[word2id["UNK"]] = unk
    return embedding

def assign_wordvectors(parsed_ids, dict_emb):
    """Replace every token id in ``parsed_ids`` by its embedding vector.

    The sequence length and embedding size are taken from the inputs
    themselves, so the function does not depend on any module-level
    ``seq_len`` or on a fixed embedding dimension.

    Args:
        parsed_ids: Array of token ids (e.g. ``(n_reviews, seq_len)``).
        dict_emb: Mapping from integer token id to its embedding vector; all
            vectors must have the same length.

    Returns:
        An array of shape ``parsed_ids.shape + (embedding_dim,)``.

    Raises:
        KeyError: If an id in ``parsed_ids`` has no entry in ``dict_emb``.
    """
    ids = np.asarray(parsed_ids).astype(int)
    flat_ids = ids.ravel().tolist()

    if not flat_ids:
        # Nothing to look up: infer the vector size from the table if possible.
        emb_dim = len(next(iter(dict_emb.values()))) if dict_emb else 0
        return np.zeros(ids.shape + (emb_dim,))

    vectors = np.array([dict_emb[token_id] for token_id in flat_ids])
    return vectors.reshape(ids.shape + (vectors.shape[-1],))

def save_to_h5py(data, filepath):
    """Store ``data`` as the dataset named ``'data'`` in an HDF5 file.

    The file is opened in append mode, so other datasets already in it are
    kept. An existing ``'data'`` dataset is replaced; without this,
    re-running the preprocessing failed because ``create_dataset`` refuses to
    overwrite an existing name.

    Args:
        data: Array-like content to store.
        filepath: Path of the HDF5 file (created if it does not exist).
    """
    with h5py.File(filepath, 'a') as f:
        if 'data' in f:
            # Unlink the stale dataset so the name can be reused.
            del f['data']
        f.create_dataset('data', data=data)

def main():
    """Run the preprocessing pipeline end to end.

    Steps: load the cleaned dataframes, enumerate instances per domain,
    tokenize the test and merged reviews, print length and vocabulary
    statistics, map tokens to ids, pad to a fixed sequence length, look up
    the pre-trained embeddings and write the embedded reviews (HDF5) plus
    the label/domain/instance-number arrays (pickle) to disk.
    """
    # ``load_vectors`` and ``assign_wordvectors`` were written to read ``unk``
    # and ``seq_len`` as module-level names; publish them as globals so those
    # helpers can resolve them when the file runs as a script.
    global unk, seq_len

    # Pipeline settings, defined once and reused below.
    vocab_size = 65000
    seq_len = 50
    embed_size = 300
    long_review_threshold = 80
    # Location given in the setup notes at the top of this file.
    embedding_file = 'data/fasttext_file/wiki-news-300d-1M.vec'

    df_train, df_test, df_merged, unk = load_data()
    dict_domain = preprocess_data(df_train, df_test, df_merged)

    # Only the test and merged splits are used below, so the train split is
    # no longer divided and tokenized.
    X_test, y_test, domain_test, idx_domain_test = divide_data(df_test, dict_domain)
    X_merged, y_merged, domain_merged, idx_domain_merged = divide_data(df_merged, dict_domain)

    parsed_reviews_test, _, _ = parse_reviews(X_test)
    parsed_reviews_merged, nr_words_merged, word_freqs_merged = parse_reviews(X_merged)

    # count the reviews that are longer than the threshold
    nr_reviews_total = len(nr_words_merged)
    nr_reviews_merged_long = sum(n > long_review_threshold for n in nr_words_merged)
    visualize_data(nr_words_merged)

    # A percentage is the share times 100 (previously multiplied by 80).
    percentage_long = (
        nr_reviews_merged_long / nr_reviews_total * 100 if nr_reviews_total else 0.0
    )
    print(f"Reviews longer than {long_review_threshold} words : {nr_reviews_merged_long}")
    print("Reviews in total: " + str(nr_reviews_total))
    print(
        f"Percentage of reviews with more than {long_review_threshold} words: "
        + str(percentage_long) + ' %'
    )

    # determining the vocabulary size
    print("Full vocabulary size: " + str(len(word_freqs_merged)))

    # Two ids are reserved for PAD and UNK, so the vocabulary holds the
    # (vocab_size - 2) most frequent tokens. Coverage is measured on exactly
    # those tokens, not on the first entries in insertion order.
    vocab_entries = word_freqs_merged.most_common(vocab_size - 2)
    vocab_words_total_merged = sum(count for _, count in vocab_entries)
    words_total_merged = sum(word_freqs_merged.values())
    coverage = vocab_words_total_merged / words_total_merged if words_total_merged else 0.0

    # Printed as a fraction in [0, 1].
    print('Percentage of words covered by vocabulary: ' + str(coverage))

    word2id = {"PAD": 0, "UNK": 1}
    for rank, (token, _) in enumerate(vocab_entries):
        word2id[token] = rank + 2

    parsed_ids_merged = [[lookup_word2id(w, word2id) for w in s.split()] for s in parsed_reviews_merged]
    parsed_ids_merged = sequence.pad_sequences(parsed_ids_merged, maxlen=seq_len)
    parsed_ids_test = [[lookup_word2id(w, word2id) for w in s.split()] for s in parsed_reviews_test]
    parsed_ids_test = sequence.pad_sequences(parsed_ids_test, maxlen=seq_len)

    embeddings = load_vectors(embedding_file, word2id, embed_size)
    dict_emb = {j: embeddings[j] for j in range(embeddings.shape[0])}

    data_emb_merged = assign_wordvectors(parsed_ids_merged, dict_emb)
    data_emb_test = assign_wordvectors(parsed_ids_test, dict_emb)

    save_to_h5py(data_emb_merged, 'data/fully_preprocessed_data/X_merged_preprocessed_new.h5')
    save_to_h5py(data_emb_test, 'data/fully_preprocessed_data/X_test_preprocessed_new.h5')

    # Context managers make sure the pickle files are flushed and closed.
    with open("domain_and_label_merged.pkl", "wb") as f:
        pkl.dump(np.array([y_merged, domain_merged, idx_domain_merged]), f)
    with open("domain_and_label_test.pkl", "wb") as f:
        pkl.dump(np.array([y_test, domain_test, idx_domain_test]), f)


if __name__ == "__main__":
    main()
