In [2]:
#Name: Leung Ko Tsun
#SID: 20516287
import numpy as np
import string
import pandas as pd
import nltk
import keras

from sklearn import random_projection
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras import metrics
from scipy import sparse

stopwords = set(stopwords.words("english"))

def load_data(file_name):
    """
    :param file_name: a file name, type: str
    return a list of ids, a list of documents, a list of labels
    """
    df = pd.read_csv(file_name)

    return df['id'], df["text"], df['label']
def load_labels(file_name):
    """
    :param file_name: a file name, type: str
    return a list of labels
    """
    return pd.read_csv(file_name)['label']

def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)
def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopwords and not token.isnumeric()]
def get_bagofwords(data, vocab_dict):
    '''
    :param data: a list of tokenized documents, type: list
    :param vocab_dict: a mapping from words to indices, type: dict
    return a BOW matrix in Compressed Sparse Row matrix format, type: scipy.sparse.csr_matrix
    '''
    
    '''
    The BOW matrix is first constructed using Row-based list of lists sparse matrix (LIL) format.
    LIL is a convenient format for constructing sparse matrices, as it supports flexible slicing, 
    and it is efficient to change to the matrix sparsity structure.
    '''
    
    data_matrix = sparse.lil_matrix((len(data), len(vocab_dict)))

    for i, doc in enumerate(data):
        for word in doc:
            word_idx = vocab_dict.get(word, -1)
            if word_idx != -1:
                data_matrix[i, word_idx] += 1
                
    '''
    After constructing the BOW matrix on all input documents, we convert the matrix to Compressed Sparse 
    Row (CSR) format for fast arithmetic and matrix vector operations.
    '''
    data_matrix = data_matrix.tocsr()
    
    return data_matrix


if __name__ == '__main__':
    train_file = "data/train.csv"
    test_file = "data/test.csv"
    ans_file = "data/answer.csv"
    train_ids, train_texts, train_labels_raw = load_data(train_file)
    test_ids, test_texts, _ = load_data(test_file)
    test_labels = load_labels(ans_file)

    train_data_label = keras.utils.to_categorical(train_labels_raw-1,
                                                  num_classes=5)
    test_data_label = keras.utils.to_categorical(test_labels-1,
                                                 num_classes=5)
    # Tokenization
    train_tokens = [tokenize(text) for text in train_texts] 
    test_tokens = [tokenize(text) for text in test_texts]
    # Stop words removal
    train_tokens = [filter_stopwords(tokens) for tokens in train_tokens]
    test_tokens = [filter_stopwords(tokens) for tokens in test_tokens]
    # use a set data structure to hold all words appearing in the train set
    vocab = set()
    for i, doc in enumerate(train_tokens):# enumerate over each document in the train set
        # enumerate over each word in the document
        for word in doc:
            # if this word has been added into the set before, 
            # then it will be ignored, otherwise, it will be 
            # added into the set.
            vocab.add(word)
    # create a dictionary from the set of words, where the
    # keys are word strings and the values are numerical indices
    vocab_dict = dict(zip(vocab, range(len(vocab))))
    print('Size of vocab: ', len(vocab_dict))
    train_data_matrix = get_bagofwords(train_tokens, vocab_dict)
    test_data_matrix = get_bagofwords(test_tokens, vocab_dict)
    print('Type of train_data_matrix: ', type(train_data_matrix))
    print('Type of test_data_matrix: ', type(test_data_matrix))
    print('Shape of train_data_matrix:', train_data_matrix.shape)
    print('Shape of test_data_matrix:', test_data_matrix.shape)
    # YOUR CODE HERE
    # Data shape
    N, V = train_data_matrix.shape
    K = train_data_label.shape[1]

    # Hyperparameters
    input_size = V
    output_size = K
    batch_size = 100
    optimizer = SGD
    learning_rate = 0.1
    total_epoch = 20

    # New model
    model = Sequential()

    ##### YOUR CODE HERE #######
    hidden_size = 100
    model.add(Dense(hidden_size, input_shape=(V,)))
    model.add(Dense(hidden_size, input_shape=(hidden_size,), activation="relu"))
    model.add(Dense(K, input_shape=(hidden_size,), activation="softmax"))

    model.compile(loss="binary_crossentropy", 
              optimizer=SGD(lr=learning_rate), 
              metrics=["accuracy"])

    # training
    model.fit(train_data_matrix, train_data_label,
              epochs=total_epoch,
              batch_size=batch_size)
    # testing
    train_score = model.evaluate(train_data_matrix, train_data_label,
                                 batch_size=batch_size)
    test_score = model.evaluate(test_data_matrix, test_data_label,
                                batch_size=batch_size)

    print('Training Loss: {}\n Training Accuracy: {}\n'
          'Testng Loss: {}\n Testing accuracy: {}'.format(
              train_score[0], train_score[1],
              test_score[0], test_score[1]))


Size of vocab:  16021
Type of train_data_matrix:  <class 'scipy.sparse.csr.csr_matrix'>
Type of test_data_matrix:  <class 'scipy.sparse.csr.csr_matrix'>
Shape of train_data_matrix: (2000, 16021)
Shape of test_data_matrix: (400, 16021)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training Loss: 0.2352612167596817
 Training Accuracy: 0.7394999861717224
Testng Loss: 0.3545567989349365
 Testing accuracy: 0.5600000023841858
