# Assignment 1 solution

This notebook contains a solution to assignment 1

## Imports

In [53]:
import random

import numpy as np

import nltk
import nltk.stem

# Needed for the WordNetLemmatizer
import nltk.corpus
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

import sklearn
import sklearn.naive_bayes
import sklearn.ensemble
import sklearn.metrics

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Helgi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Pre-process data

In [103]:
data_folder = 'E:/Repos/comp550/assignment1/data'
positives_file = 'rt-polarity.pos'
negatives_file = 'rt-polarity.neg'

random_state = 111
np.random.seed(random_state)
random.seed(random_state)

def read_lines(file_path):
    with open(f'{data_folder}/{positives_file}', 'r') as file_handler:
        return [sentence.rstrip() for sentence in file_handler.readlines()]


In [104]:
positives_data = read_lines(f'{data_folder}/{positives_file}')
negatives_data = read_lines(f'{data_folder}/{positives_file}')
all_text = " ".join(positives_data) + " ".join(negatives_data)
all_text_freq_dist = nltk.FreqDist(nltk.word_tokenize(all_text))

In [4]:
print(all_text_freq_dist.most_common(20))

[('.', 13944), (',', 10536), ('the', 10120), ('a', 7690), ('and', 7106), ('of', 6622), ('to', 3940), ("'s", 3626), ('is', 3550), ('it', 3348), ('that', 2714), ('in', 2678), ('film', 1798), ('with', 1766), ('as', 1760), ('but', 1568), ('an', 1512), ('its', 1400), ('for', 1348), ('this', 1334)]


## Pre-processing functions

In [120]:
def get_wordnet_pos(word):
    """
    Source: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
    Map POS tag to first character lemmatize() accepts
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

def remove_rare_words(min_frequency):
    return corpus

def remove_n_common_stopwords(corpus, n):
    return corpus

def tokenize(sentence_list):
    result = []
    for sentence in sentence_list:
        result.append(nltk.word_tokenize(sentence))
    return result

def lemmatize(sentence_tokens):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    sentences = []
    
    for sentence in sentence_tokens:
        words = []
        for token in sentence:
            words.append(lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)))
        sentences.append(words)
    return sentences

def stem(sentence_tokens):
    stemmer = nltk.stem.PorterStemmer()
    sentences = []
    
    for sentence in sentence_tokens:
        words = []
        for token in sentence:
            words.append(stemmer.stem(token))
        sentences.append(words)
    return sentences

def flatten_2d_list(list_2d):
    return [item for sublist in list_2d for item in sublist]

def preprocess_dataset(normalization, n_common_stopwords_to_remove, minimum_word_frequency):
    assert normalization.lower() in ['lemmatization', 'stemming']
    
    samples = {
        'positive': tokenize(read_lines(f'{data_folder}/{positives_file}')),
        'negative': tokenize(read_lines(f'{data_folder}/{positives_file}'))
    }
    
    if normalization == 'lemmatization':
        samples['positive'] = lemmatize(samples['positive'])
        samples['negative'] = lemmatize(samples['negative'])
    else:
        samples['positive'] = stem(samples['positive'])
        samples['negative'] = stem(samples['negative'])
        
    freq_dist = nltk.FreqDist(flatten_2d_list(samples['positive']) + flatten_2d_list(samples['negative']))
    
    # Todo: remove punctuation
    # Todo: filter using n_common_stopwords_to_remove
    # Todo: filter using minimum_word_frequency
    # Todo: return unigram counts -> 0/1 mapping


## Hyper-parameter search across models

In [24]:
def choose_random_params(parameters):
    return {
        name: np.random.choice(values)
        for name, values in parameters.items()
    }

search_params = {
    'data': {
        'normalization': ['lemmatization', 'stemming'],
        'n_common_stopwords_to_remove': np.arange(1000, step=10),
        'minimum_word_frequency': np.arange(10)
    },
    'model': {
        'logistic_regression': {
            'learning_rate': ['constant'],
            'eta0': [1e-5, 1e-4, 1e-3, 1e-2], # learning rate
            'loss': ['log'],
            'alpha': [0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 2e-1, 4e-1], # regularization
            'penalty': ['l2', 'l1', 'elasticnet'],
            'max_iter': np.arange(20), # epochs
            'random_state': [random_state]
        },
        'linear_support_vector_machine': {
            'kernel': ['linear'],
            'max_iter': np.arange(20), # epochs
            'C': [0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 2e-1, 4e-1], # L2 regularization
            'random_state': [random_state]
        },
        'naive_bayes': {
            'alpha': np.arange(1.1, step=0.1),
            'fit_prior': [True],
            'class_prior': [None]
        },
        'random_forest': {
            'n_estimators': np.arange(10, 1000, step=10),
            'max_depth': np.append(np.array(None), np.arange(16, step=2)),
            'max_features': [None, 'auto', 'sqrt', 'log2'],
            'random_state': [random_state]
        }
    }
}

models = {
    'logistic_regression': sklearn.linear_model.SGDClassifier,
    'linear_support_vector_machine': sklearn.svm.SVC,
    'naive_bayes': sklearn.naive_bayes.MultinomialNB,
    'random_forest': sklearn.ensemble.RandomForestClassifier
}

data_variations = 1
model_variations = 1
split = [0.8, 0.9]

In [None]:
results = []

for i in range(data_variations):
    print(f'Data variation {i+1}/data_variations')
    
    data_params = choose_random_params(params['data'])
    
    X, y = preprocess_dataset(**data_params)
    X_train, y_train = X[: split[0]], y[: split[0]]
    X_valid, y_valid = X[split[0] : split[1]], y[split[0] : split[1]]

    for model_name, model_class in models.item():
        for j in range(model_variations):
            print(f'\t{model_name} {j+1}/model_variations')
            
            model_params = choose_random_params(params['model'][model_name])
            model = model_class(**model_params)

            model.train(X_train, y_train)
            predictions = model.predict(X_valid)
            accuracy = sklearn.metrics.accuracy_score(y_valid, predictions)
            
            results.append({
                'model_name': model_name,
                'data_params': data_params,
                'model_params': model_params
                'accuracy': accuracy
            })

# random_predictions = np.random.randint(low=0, high=2, size=X_test.shape[0])

## Results