# Assignment 1 solution

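This notebook contains a solution to assignment 1: classifying the sentences of the `rt-polarity` data set as positive or negative, using bag-of-words features and a random hyper-parameter search over several scikit-learn classifiers.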

## Imports

In [1]:
import random

import numpy as np

import nltk
import nltk.stem

# Needed for the WordNetLemmatizer
import nltk.corpus
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

import sklearn
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.linear_model
import sklearn.metrics
import sklearn.naive_bayes
import sklearn.svm
import sklearn.utils

random_state = 111
np.random.seed(random_state)
random.seed(random_state)

[nltk_data] Downloading package wordnet to /home/helgi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/helgi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/helgi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pre-process data

In [13]:
def read_lines(file_path):
    """Return the lines of a text file with trailing whitespace removed.

    The file is decoded as ISO-8859-1. Trailing whitespace (including the
    line terminator) is stripped from every line; leading whitespace is kept.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='ISO-8859-1') as source:
        for raw_line in source:
            stripped_lines.append(raw_line.rstrip())
    return stripped_lines

# data_folder = 'E:/Repos/comp550/assignment1/data'
data_folder = '/home/helgi/repos/comp550/assignment1/data/'
positives_file = 'rt-polarity.pos'
negatives_file = 'rt-polarity.neg'

# Cumulative split fractions: train is [0, 0.8), validation is [0.8, 0.9)
# and test is the remaining [0.9, 1.0) of the shuffled data.
splits = [0.8, 0.9]

# Pair every line of each file with the label of the file it came from.
negatives_path = f'{data_folder}/{negatives_file}'
negatives_data = [(document, 'negative') for document in read_lines(negatives_path)]

positives_path = f'{data_folder}/{positives_file}'
positives_data = [(document, 'positive') for document in read_lines(positives_path)]

# Column 0 holds the document text, column 1 its label.
labelled_data = np.array(negatives_data + positives_data)
all_data = sklearn.utils.shuffle(labelled_data, random_state=random_state)
n = all_data.shape[0]

# Row indices at which the train and validation partitions end.
train_end = int(splits[0] * n)
valid_end = int(splits[1] * n)

documents, labels = all_data[:, 0], all_data[:, 1]

X_train, y_train = documents[:train_end], labels[:train_end]
X_valid, y_valid = documents[train_end:valid_end], labels[train_end:valid_end]
X_test, y_test = documents[valid_end:], labels[valid_end:]

In [19]:
class Lemmatizer:
    """Callable tokenizer that lemmatizes every token of a document.

    Each token is POS-tagged with ``nltk.pos_tag`` and the tag is mapped to a
    WordNet part of speech before the token is passed to the
    ``WordNetLemmatizer``.
    """

    def __init__(self):
        wordnet = nltk.corpus.wordnet
        self.normalizer = nltk.stem.WordNetLemmatizer()
        # First letter of a POS tag -> WordNet part-of-speech constant.
        self.tag_prefix_dict = {
            'J': wordnet.ADJ,
            'N': wordnet.NOUN,
            'V': wordnet.VERB,
            'R': wordnet.ADV
        }

    def __call__(self, document):
        """Tokenize ``document`` and return the list of lemmatized tokens."""
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(document))
        lemmas = []
        for word, pos_tag in tagged_tokens:
            wordnet_pos = self.get_tag_class(pos_tag)
            lemmas.append(self.normalizer.lemmatize(word, pos=wordnet_pos))
        return lemmas

    def get_tag_class(self, tag):
        """Map a POS tag to a WordNet part of speech, defaulting to noun."""
        first_letter = tag[0].upper()
        if first_letter in self.tag_prefix_dict:
            return self.tag_prefix_dict[first_letter]
        # Tags with an unlisted first letter are treated as nouns.
        return nltk.corpus.wordnet.NOUN

class Stemmer:
    """Callable tokenizer that reduces every token of a document to its Porter stem."""

    def __init__(self):
        self.normalizer = nltk.stem.PorterStemmer()

    def __call__(self, document):
        """Tokenize ``document`` and return the list of stemmed tokens."""
        stem = self.normalizer.stem
        return list(map(stem, nltk.word_tokenize(document)))

def fit_vectorizer(X_data, tokenizer, stop_words, min_df):
    """Create a ``CountVectorizer`` and fit it on ``X_data``.

    Args:
        X_data: Iterable of raw text documents to fit on.
        tokenizer: Callable mapping a document string to a list of tokens.
        stop_words: Forwarded to ``CountVectorizer(stop_words=...)``.
        min_df: Forwarded to ``CountVectorizer(min_df=...)``.

    Returns:
        The fitted ``CountVectorizer``; callers use its ``transform`` method
        to vectorize the individual data splits.
    """
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        tokenizer=tokenizer,
        stop_words=stop_words,
        min_df=min_df
    )
    # Only the fitted vectorizer is returned, so call fit() instead of
    # fit_transform(), whose document-term matrix was being discarded.
    vectorizer.fit(X_data)
    return vectorizer

# vectorizer.get_feature_names()

## Hyper-parameter search across models

In [17]:
def choose_random_params(parameters):
    """Pick one random candidate value for every parameter.

    A random index is drawn from NumPy's global random state and used to
    index the candidate sequence directly. This returns the candidates
    themselves, untouched: ``np.random.choice`` would first coerce the
    sequence into an ndarray, which turns mixed lists such as ``[1, 'x']``
    into strings, rejects nested/ragged candidates and wraps plain Python
    values in NumPy scalar types.

    Args:
        parameters: Mapping from parameter name to a non-empty, indexable
            sequence (list or 1-D array) of candidate values.

    Returns:
        A dict with the same keys, each mapped to one of its candidates.

    Raises:
        ValueError: If a candidate sequence is empty.
    """
    return {
        name: values[np.random.randint(len(values))]
        for name, values in parameters.items()
    }

# Candidate values for the random search. Each leaf is a sequence from which
# choose_random_params() draws a single value.
search_params = {
    'data': {
        'tokenizer': [Lemmatizer(), Stemmer()],
        'stop_words': ['english', None], # Try custom stop words list?
        'min_df': [1, 2, 3] # Minimum document frequency of a token
    },
    'model': {
        'logistic_regression': {
            'learning_rate': ['constant'],
            'eta0': [1e-5, 1e-4, 1e-3, 1e-2], # learning rate
            # NOTE(review): newer scikit-learn releases appear to spell this
            # loss 'log_loss' - confirm against the installed version.
            'loss': ['log'],
            'alpha': [0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 2e-1, 4e-1], # regularization
            'penalty': ['l2', 'l1', 'elasticnet'],
            # Start at 1: SGDClassifier rejects max_iter=0.
            'max_iter': np.arange(1, 20), # epochs
            'random_state': [random_state]
        },
        'linear_support_vector_machine': {
            'kernel': ['linear'],
            # Start at 1: zero iterations cannot train a model.
            'max_iter': np.arange(1, 20), # epochs
            # C is the *inverse* regularization strength and must be
            # strictly positive, so 0 is not a valid candidate.
            'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 2e-1, 4e-1],
            'random_state': [random_state]
        },
        'naive_bayes': {
            'alpha': np.arange(1.1, step=0.1), # additive smoothing
            'fit_prior': [True],
            'class_prior': [None]
        },
        'random_forest': {
            'n_estimators': np.arange(10, 1000, step=10),
            # None means unlimited depth; a depth of 0 is rejected by
            # RandomForestClassifier, so the numeric candidates start at 2.
            'max_depth': [None] + list(range(2, 16, 2)),
            # NOTE(review): 'auto' appears to be deprecated/removed in newer
            # scikit-learn releases - confirm against the installed version.
            'max_features': [None, 'auto', 'sqrt', 'log2'],
            'random_state': [random_state]
        }
    }
}

# Estimator class for every key of search_params['model'].
models = {
    'logistic_regression': sklearn.linear_model.SGDClassifier,
    'linear_support_vector_machine': sklearn.svm.SVC,
    'naive_bayes': sklearn.naive_bayes.MultinomialNB,
    'random_forest': sklearn.ensemble.RandomForestClassifier
}

# Number of random pre-processing configurations to try, and number of random
# hyper-parameter configurations to try per model for each of them.
data_variations = 1
model_variations = 1

In [18]:
# data_variation_mapping: data variation index -> pre-processing parameters.
# results: data variation index -> one record per trained model.
data_variation_mapping = {}
results = {}

for i in range(data_variations):
    print(f'Data variation {i+1}/{data_variations}')

    # Draw the pre-processing configuration for this variation.
    data_params = choose_random_params(search_params['data'])
    data_variation_mapping[i] = data_params
    results[i] = []

    print('Fitting vectorizer...')
    vectorizer = fit_vectorizer(X_train, **data_params)

    # The vectorizer is fitted on the training documents only and then
    # applied to all three splits.
    X_train_model = vectorizer.transform(X_train)
    X_valid_model = vectorizer.transform(X_valid)
    X_test_model = vectorizer.transform(X_test)

    for model_name, model_class in models.items():
        for j in range(model_variations):
            print(f'\t{model_name} {j+1}/{model_variations}')

            # Draw hyper-parameters, train on the training split and score
            # the model on the validation split.
            model_params = choose_random_params(search_params['model'][model_name])
            model = model_class(**model_params)
            model.fit(X_train_model, y_train)

            predictions = model.predict(X_valid_model)
            accuracy = sklearn.metrics.accuracy_score(y_valid, predictions)

            record = dict(
                model_name=model_name,
                data_params=data_params,
                model_params=model_params,
                accuracy=accuracy
            )
            results[i].append(record)

# random_predictions = np.random.randint(low=0, high=2, size=X_test.shape[0])

Data variation 1/1
	logistic_regression 1/1
	linear_support_vector_machine 1/1




	naive_bayes 1/1
	random_forest 1/1


## Test set results

In [None]:
# random_predictions = np.random.randint(low=0, high=2, size=X_test.shape[0])