### Coursework 1

This coursework consists of two classification tasks: one on image classification and the other on text classification.

#### Task 1

In this task, you are provided with three classes of noisy images (cars, bikes and people) in real-world settings. You need to implement a boosting-based classifier that can be used to classify the images.

#### Importing modules

In [None]:
# For image classification...
import numpy as np
from scipy import ndimage
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import os
import glob
import csv
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

# For sentiment analysis...
import string
import random
import nltk
nltk.download('popular')
from collections import Counter
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer
lanc = LancasterStemmer()
from sklearn.feature_extraction.text import TfidfVectorizer

#### Feature extraction code

In [1]:
def rotate_image(image, angle):
    '''Rotate an image by `angle` degrees, keeping the input shape.

    reshape=False keeps the output the same shape as the input, and
    mode='nearest' fills pixels outside the input by repeating the
    nearest edge value.
    '''
    rotated = ndimage.rotate(image, angle, reshape=False, mode='nearest')
    return rotated

In [2]:
def shift_image(image, shift):
    '''Translate an image, repeating the nearest edge value in exposed areas.

    shift: a single float (applied to every axis) or a sequence with one
    float per axis.
    '''
    shifted = ndimage.shift(image, shift, mode='nearest')
    return shifted

In [3]:
def change_brightness(image, value):
    '''Brighten (value > 0) or darken (value <= 0) an image, saturating at 0/255.

    image: numpy array of pixel intensities (the saturation limits assume
           a 0-255 range).
    value: brightness offset; truncated towards zero to an integer.

    Returns a new array; the input is not modified.
    '''
    # np.int was removed in NumPy 1.24; the builtin performs the same truncation.
    value = int(value)
    if value > 0:
        # Pixels that would exceed 255 are pinned to 255 so that unsigned
        # images cannot wrap around.
        return np.where((255 - image) < value, 255, image + value)

    # Builtin abs keeps the offset a plain Python int, so the subtraction
    # is not promoted to a wider dtype by a NumPy scalar.
    magnitude = abs(value)
    # Pixels that would drop below 0 are pinned to 0.
    return np.where(image < magnitude, 0, image - magnitude)

In [4]:
def hflip_image(image):
    '''Return the image mirrored left-to-right (column order reversed).'''
    mirrored = np.fliplr(image)
    return mirrored

In [5]:
def augment_image(image, n_out=10, shift_range=(-15, 15), rot_range=(-15, 15), hflip=True, brightness_range=(-25, 25)):
    '''Create `n_out` randomly augmented copies of `image`.

    Each copy is rotated, shifted, brightness-adjusted and (optionally)
    flipped horizontally, in that order, with parameters drawn uniformly
    from the given ranges using numpy's global random state.

    image:            image array to augment (not modified).
    n_out:            number of output images (excluding the original).
    shift_range:      (low, high) bounds of the shift; one value is drawn
                      per image and passed to shift_image.
    rot_range:        (low, high) bounds of the rotation angle.
    hflip:            if truthy, each copy is flipped with probability 0.5.
    brightness_range: (low, high) bounds of the brightness offset.

    Returns a list of `n_out` augmented images.
    '''
    # The defaults are tuples rather than lists so that they cannot be
    # mutated between calls; any 2-element sequence is still accepted.
    def _uniform(bounds):
        '''Draw n_out samples uniformly from [bounds[0], bounds[1]).'''
        low, high = bounds[0], bounds[1]
        return np.random.rand(n_out) * (high - low) + low

    # Randomising augmentation parameters (draw order: angles, shifts,
    # brightness, flips)...
    rand_angles = _uniform(rot_range)
    rand_shifts = _uniform(shift_range)
    rand_brightness = _uniform(brightness_range)
    if hflip:
        rand_flips = np.random.choice([True, False], size=n_out)
    else:
        rand_flips = [False] * n_out

    # Creating n_out augmented images from the input image...
    augmented_images = []
    for angle, shift, brightness, flip in zip(rand_angles, rand_shifts, rand_brightness, rand_flips):
        augmented_image = rotate_image(image, angle)
        augmented_image = shift_image(augmented_image, shift)
        augmented_image = change_brightness(augmented_image, brightness)
        if flip:
            augmented_image = hflip_image(augmented_image)
        augmented_images.append(augmented_image)

    return augmented_images

In [None]:
def max_pool(image, KL=(2,2)):
    '''Max-pool a 2D image over non-overlapping K x L windows.

    image: 2D numpy array.
    KL:    (K, L) window height and width.

    Rows/columns that do not fill a complete window are discarded.
    Returns an array of shape (M // K, N // L).
    '''
    n_rows, n_cols = image.shape
    win_rows, win_cols = KL

    out_rows, out_cols = n_rows // win_rows, n_cols // win_cols
    # Trim to a whole number of windows, then expose each window on its
    # own pair of axes so that a single max() reduces over it.
    trimmed = image[:out_rows * win_rows, :out_cols * win_cols]
    windows = trimmed.reshape(out_rows, win_rows, out_cols, win_cols)
    return windows.max(axis=(1, 3))

In [1]:
def obtain_dataset_train_test(folder_name_train, folder_name_test, n_aug=1, hflip=False, pool=True, combined=True):
    '''Load, augment and standardise the image train/test datasets.

    Every file matching `<folder>/*/*` is read as a greyscale image; the
    middle path component is used as the class name ('cars' -> 1,
    'bikes' -> 2, anything else -> 3).

    folder_name_train: folder containing one sub-folder per class (training).
    folder_name_test:  folder containing one sub-folder per class (testing).
    n_aug:    number of randomly augmented copies created per training image.
    hflip:    NOTE(review): never read in the body -- a flipped copy of every
              training image is always added and augment_image is always
              called with hflip=True.
    pool:     if True, images are max-pooled with a (1, 2) window.
    combined: if True, the feature vector is the flattened pixels followed
              by the HOG descriptor; otherwise flattened pixels only.

    Returns (X_train, y_train, X_test, y_test). The training rows are
    shuffled; both feature matrices are standardised with a scaler fitted
    on the training data only.

    NOTE(review): each path is unpacked into exactly three components, so
    the folder names are presumably expected to be single path components
    (e.g. 'train'); other layouts would fail at the unpacking step.
    '''
    ## Processing train images first, NOT including test images in this stage
    ## Processing training images...
    X_train = []
    y_train = []
    for fullpath in glob.iglob(f'{folder_name_train}/*/*', recursive = True):
        # The sub-folder name is the class label.
        _, target, file = fullpath.split(os.path.sep)

        # Getting greyscaled image data...
        if pool == True:
            X_train.append(max_pool(cv2.imread(fullpath, 0), (1,2)))
        else:
            X_train.append(cv2.imread(fullpath, 0))
        y_train.append(target)
    X_train, y_train = np.array(X_train), np.array(y_train)

    # Initialising hog converter...
    # NOTE(review): hog_feature_len is assigned but never used below.
    hog_feature_len=34020
    hog = cv2.HOGDescriptor()

    # Converting y to 1, 2, 3 format...
    # Xtrain/ytrain collect the final rows: for each source image the
    # original, a flipped copy and n_aug augmented copies.
    Xtrain = []
    ytrain = []
    for i, observation in enumerate(y_train):
        # y_train holds the class-name strings read above; the labels
        # written here are converted to integers after the loop.
        if observation == 'cars':
            y_train[i] = 1
        elif observation == 'bikes':
            y_train[i] = 2
        else:
            y_train[i] = 3

        # Adding the unmodified training image...
        image = X_train[i]
        if combined == True:
            Xtrain.append(np.append(image.reshape(-1), hog.compute(image).reshape(-1)))
        else:
            Xtrain.append(image.reshape(-1))
        ytrain.append(y_train[i])

        # Flipping training images...
        image_flipped = np.fliplr(image)
        if combined == True:
            Xtrain.append(np.append(image_flipped.reshape(-1), hog.compute(image_flipped).reshape(-1)))
        else:
            Xtrain.append(image_flipped.reshape(-1))
        ytrain.append(y_train[i])

        # Augmenting training images, creating n_aug new images for each...
        image = X_train[i]
        augmented_images = augment_image(image, n_out=n_aug, hflip=True)
        for augmented_img in augmented_images:
            if combined == True:
                Xtrain.append(np.append(augmented_img.reshape(-1), hog.compute(augmented_img).reshape(-1)))
            else:
                Xtrain.append(augmented_img.reshape(-1))
            ytrain.append(y_train[i])

    X_train = np.array(Xtrain)
    y_train = np.array(ytrain).astype(int)

    # Shuffling training images...
    shuffle_inds = np.arange(y_train.size)
    np.random.shuffle(shuffle_inds)
    X_train = X_train[shuffle_inds]
    y_train = y_train[shuffle_inds]

    # Standardising (fitting on training data only)...
    standariser = StandardScaler()
    X_train = standariser.fit_transform(X_train)

    ## Processing test images over here, while ensuring that there is no data leakage from train to test
    ## Processing test images...
    X_test = []
    y_test = []
    for fullpath in glob.iglob(f'{folder_name_test}/*/*', recursive = True):
        _, target, file = fullpath.split(os.path.sep)

        # Getting greyscaled image data...
        if pool == True:
            test_image = max_pool(cv2.imread(fullpath, 0), (1,2))
        else:
            test_image = cv2.imread(fullpath, 0)
        # Test images get the same feature layout as training images but
        # are neither flipped nor augmented.
        if combined == True:
            X_test.append(np.append(test_image.reshape(-1), hog.compute(test_image).reshape(-1)))
        else:
            X_test.append(test_image.reshape(-1))
        y_test.append(target)

    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # Converting y to 1, 2, 3 format (same mapping as the training labels)...
    for i, observation in enumerate(y_test):
        if observation == 'cars':
            y_test[i] = 1
        elif observation == 'bikes':
            y_test[i] = 2
        else:
            y_test[i] = 3

    y_test = y_test.astype(int)

    # Standardising (fit on training data only)...
    X_test = standariser.transform(X_test)

    return (X_train, y_train, X_test, y_test)

#### Boosting classifier

In [7]:
class BoostingClassifier:

    '''Multi-class AdaBoost classifier built on shallow decision trees.

    Each boosting round fits a DecisionTreeClassifier on the weighted
    training set, gives it a vote of log((1 - err) / err) + log(K - 1)
    (K = number of classes) and multiplies the weight of every
    misclassified observation by exp(vote).

    n_its:          number of boosting rounds (weak learners).
    max_depth:      maximum depth of each tree.
    print_progress: if truthy, print the round number while fitting.
    '''

    def __init__(self, n_its=135, max_depth=2, print_progress=False):
        self.n_its = n_its
        self.model_weights = np.zeros(n_its)
        self.models = np.zeros(shape=n_its, dtype=object)
        self.print_progress = print_progress
        self.max_depth = max_depth

    def fit(self, X, y):
        '''Fit `n_its` weighted trees to (X, y) and return self.

        X: feature matrix accepted by DecisionTreeClassifier.
        y: 1D array of class labels (any sortable label values).
        '''
        y = np.asarray(y)

        # Remembering the label values lets boost_predict map votes back
        # to labels without assuming they are exactly 1..K.
        self.classes_ = np.unique(y)
        self.n_classes = self.classes_.size

        # Initialising the weights...
        n_obs = y.size
        weights = np.ones(n_obs) * (1 / n_obs)

        # A single-class problem would otherwise evaluate log(0).
        class_term = np.log(max(self.n_classes - 1, 1))
        eps = np.finfo(float).eps

        # Applying boosting iterations...
        for itr in range(self.n_its):
            if self.print_progress:
                print('Iteration', itr+1)

            # Fitting the weak learner on the weighted data...
            tree = DecisionTreeClassifier(max_depth=self.max_depth)
            tree = tree.fit(X, y, sample_weight=weights)
            pred_y = tree.predict(X)

            # Calculating weighted error (loss is 1 where misclassified)...
            loss = (pred_y != y).astype(float)
            error = np.dot(weights, loss) / np.sum(np.abs(weights))
            # A perfect (or perfectly wrong) learner would make the log
            # below infinite and turn the sample weights into NaN, so the
            # error is kept strictly inside (0, 1).
            error = min(max(error, eps), 1 - eps)

            # Model weight...
            model_weight = np.log((1 - error) / error) + class_term
            self.model_weights[itr] = model_weight

            # Updated weights...
            weights *= np.exp(model_weight * loss)
            weights /= np.sum(np.abs(weights))

            # Add tree to list...
            self.models[itr] = tree

        if self.print_progress:
            print('Model fit!')
        return self

    def boost_predict(self, X):
        '''Predict labels for X by weighted vote of the fitted trees.

        Returns a 1D array of labels drawn from the training labels. Ties
        are resolved in favour of the smallest label.
        '''
        n_obs = X.shape[0]
        rows = np.arange(n_obs)

        # weighted_counts[i, k] = total vote for class k on observation i.
        weighted_counts = np.zeros((n_obs, self.n_classes))
        for model, model_weight in zip(self.models, self.model_weights):
            # classes_ is sorted, so searchsorted turns labels into columns.
            cols = np.searchsorted(self.classes_, np.asarray(model.predict(X)))
            weighted_counts[rows, cols] += model_weight

        return self.classes_[np.argmax(weighted_counts, axis=1)]

### Cross-validation function

In [None]:
def cross_validate(model, X, y, k_splits=5, boost=True, seed=None, text=False):
    '''Run k-fold cross-validation and return per-fold accuracies.

    model:    BoostingClassifier-like object (fit / boost_predict) when
              boost is truthy, otherwise an SVMClassifier-like object
              (fit_image / predict_image, or fit_text / predict_text when
              text is truthy).
    X:        numpy array of observations; with text=True, an array of
              cleaned review strings that is vectorised inside each fold.
    y:        numpy array of labels.
    k_splits: number of folds; the last fold takes any remainder.
    boost:    selects the boosting interface of `model`.
    seed:     if not None, seeds numpy's global RNG before shuffling.
    text:     if truthy, tf-idf features are fitted on the training fold
              only and applied to the held-out fold.

    Returns (test_accs, train_accs), two lists with one accuracy per fold.
    '''
    N = y.size
    split_size = round(N / k_splits)

    # Shuffling the observations once, up front...
    if seed is not None:
        np.random.seed(seed)
    shuffle = np.arange(N)
    np.random.shuffle(shuffle)
    X = X[shuffle]
    y = y[shuffle]

    test_accs = []
    train_accs = []

    for split in range(k_splits):
        print('Split', split+1, 'in progress')

        # Deciding on test/train splits (the last fold runs to the end)...
        start = split * split_size
        stop = N if split + 1 == k_splits else start + split_size
        X_test, y_test = X[start:stop], y[start:stop]
        X_train = np.concatenate((X[:start], X[stop:]))
        y_train = np.concatenate((y[:start], y[stop:]))

        if text:
            # Vectorising and converting to tf-idf format, fitting on the
            # training fold only to avoid leaking test vocabulary...
            pt = process_text()
            X_train = pt.fit_transform(X_train)
            X_test = pt.transform(X_test)

        # Fitting, and picking the matching prediction routine. The same
        # routine is used for both test and train predictions, so a text
        # SVM is no longer scored through its unfitted image estimator.
        if boost:
            model.fit(X_train, y_train)
            predict = model.boost_predict
        elif text:
            model.fit_text(X_train, y_train)
            predict = model.predict_text
        else:
            model.fit_image(X_train, y_train)
            predict = model.predict_image

        # Calculating prediction accuracy on the held-out fold...
        y_test_pred = predict(X_test)
        test_acc = accuracy_score(y_test, y_test_pred)

        # Calculating prediction accuracy on the training folds...
        y_train_pred = predict(X_train)
        train_acc = accuracy_score(y_train, y_train_pred)

        # Adding accuracies to list...
        test_accs.append(test_acc)
        train_accs.append(train_acc)

    return test_accs, train_accs

In [None]:
# Grid search over the number of augmented images per training image and
# the number of boosting learners, scored by 5-fold cross-validation.
import time

# NOTE(review): `plt` (presumably matplotlib.pyplot) and `process_images`
# are not defined anywhere in the visible file; both must exist in the
# session before this cell is run.

all_n_augs = np.arange(6)
all_its = [5,7,9]

# One row per n_aug value, one column per n_its value.
test_cv_means = []
train_cv_means = []

t1 = time.time()

for i, n_aug in enumerate(all_n_augs):
    train_it_mean = []
    test_it_mean = []
    for j, n_its in enumerate(all_its):
        print('\n\nIteration number {}:   ({} aug images, {} learners)'.format(i*len(all_its) + j, n_aug, n_its))
        Xtrain, ytrain, Xtest, ytest = process_images(n_aug=n_aug)
        bc = BoostingClassifier(n_its, print_progress=False)
        test_score, train_score = cross_validate(bc, Xtrain, ytrain, seed=2)
        # Test means go to the test list and train means to the train
        # list (they were previously swapped, mislabelling the plot).
        test_it_mean.append(np.mean(test_score))
        train_it_mean.append(np.mean(train_score))
        print('Test Scores:\n', test_score, np.mean(test_score))
        print('\nTrain Scores:\n', train_score, np.mean(train_score))
    test_cv_means.append(test_it_mean)
    train_cv_means.append(train_it_mean)

# Printing time taken...
t2 = time.time()
delta_t = t2-t1
mins = delta_t//60
secs = delta_t - (mins*60)
print('\n{} mins and {} seconds'.format(mins, secs))

# Plotting the graph (one line per n_its value, against n_aug)...
fig = plt.figure(figsize=(16,10))
plt.title('Mean accuracy of 5-fold cross validation vs. number of augmented images', fontsize=20, pad=10)
plt.ylabel('Mean accuracy', fontsize=20, labelpad=15)
plt.xlabel('Number of augmented images per training image', fontsize=20, labelpad=15)
plt.plot(all_n_augs, test_cv_means, label='Test', color='r')
plt.plot(all_n_augs, train_cv_means, label='Train', color='b')
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid()
# Saving before show(): once the figure has been shown and closed,
# savefig would write an empty canvas.
plt.savefig('n_augmented_boost.png')
plt.show()

#### Task 2

Classifying the above dataset using a Support Vector Machine (SVM) with tailored kernels.

In [9]:
class SVMClassifier:
    '''Wrapper holding two SVMs that share one kernel.

    `svc` is a one-vs-rest SVC used for the image task and `svc_text` is
    a plain SVC used for the text task.

    kernel: kernel name or callable, passed straight to sklearn's SVC.
    '''

    def __init__(self, kernel='rbf'):
        self.kernel = kernel
        # Placeholder attribute, not read by any method of this class.
        self.some_paramter=1
        self.svc = OneVsRestClassifier(SVC(kernel=kernel))
        self.svc_text = SVC(kernel=kernel)

    def fit_image(self, X,y):
        '''Train the image (one-vs-rest) SVM on X, y. Returns None.'''
        self.svc.fit(X, y)

    def fit_text(self, X,y):
        '''Train the text SVM on X, y. Returns None.'''
        self.svc_text.fit(X, y)

    def predict_image(self, X):
        '''Return the image SVM's predictions for X.'''
        return self.svc.predict(X)

    def predict_text(self, X):
        '''Return the text SVM's predictions for X.'''
        return self.svc_text.predict(X)

### Kernels

In [None]:
class gaussian_kernel():
    '''
    Gaussian (RBF) kernel for a given sigma:
        K(x, y) = exp(-||x - y||^2 / (2 * sigma^2))
    (VECTORISED)
    ========================================
    Input
    ========================================
    X1: 2D numpy array (n1 observations x features)
    X2: 2D numpy array (n2 observations x features)
    sigma: Length-scale hyper parameter

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array, n1 x n2)
    '''
    def __init__(self, sigma=140):
        self.sigma = sigma

    def fit(self, X1, X2):
        X1_norm = np.square(np.linalg.norm(X1, axis=1))
        X2_norm = np.square(np.linalg.norm(X2, axis=1))

        # ||x||^2 + ||y||^2 - 2 x.y
        distances = X1_norm.reshape(-1, 1) + X2_norm.reshape(1, -1) - 2*np.dot(X1, X2.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical rows; a squared distance cannot be negative.
        distances = np.maximum(distances, 0)

        arg = distances / (2*(self.sigma**2))
        return np.exp(-arg)

In [None]:
class log_kernel():
    '''
    Log kernel for a given d:
        K(x, y) = -log((||x - y||^2)^d + 1)
    (VECTORISED)
    ========================================
    Input
    ========================================
    X1: 2D numpy array (n1 observations x features)
    X2: 2D numpy array (n2 observations x features)
    d: Power hyper parameter

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array, n1 x n2)
    '''
    def __init__(self, d=1):
        self.d = d

    def fit(self, X1, X2):
        X1_norm = np.square(np.linalg.norm(X1, axis=1))
        X2_norm = np.square(np.linalg.norm(X2, axis=1))

        # ||x||^2 + ||y||^2 - 2 x.y
        distances = X1_norm.reshape(-1, 1) + X2_norm.reshape(1, -1) - 2*np.dot(X1, X2.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical rows; raising those to a fractional power d
        # would produce NaN, so they are clamped to zero.
        distances = np.maximum(distances, 0)

        arg = distances**self.d + 1
        return - np.log(arg)

In [None]:
class students_t_kernel():
    '''
    Student's t kernel for a given d:
        K(x, y) = 1 / ((||x - y||^2)^d + 1)
    (VECTORISED)
    ========================================
    Input
    ========================================
    X1: 2D numpy array (n1 observations x features)
    X2: 2D numpy array (n2 observations x features)
    d: Power hyper parameter

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array, n1 x n2)
    '''
    def __init__(self, d=1):
        self.d = d

    def fit(self, X1, X2):
        X1_norm = np.square(np.linalg.norm(X1, axis=1))
        X2_norm = np.square(np.linalg.norm(X2, axis=1))

        # ||x||^2 + ||y||^2 - 2 x.y
        distances = X1_norm.reshape(-1, 1) + X2_norm.reshape(1, -1) - 2*np.dot(X1, X2.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical rows; raising those to a fractional power d
        # would produce NaN, so they are clamped to zero.
        distances = np.maximum(distances, 0)

        arg = distances**self.d + 1
        return 1 / arg

In [None]:
class cauchy_kernel():
    '''
    Cauchy kernel for a given sigma:
        K(x, y) = 1 / (||x - y||^2 / sigma^2 + 1)
    (VECTORISED)
    ========================================
    Input
    ========================================
    X1: 2D numpy array (n1 observations x features)
    X2: 2D numpy array (n2 observations x features)
    sigma: Length-scale hyper parameter

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array, n1 x n2)
    '''
    def __init__(self, sigma=1):
        self.sigma = sigma

    def fit(self, X1, X2):
        X1_norm = np.square(np.linalg.norm(X1, axis=1))
        X2_norm = np.square(np.linalg.norm(X2, axis=1))

        # ||x||^2 + ||y||^2 - 2 x.y
        distances = X1_norm.reshape(-1, 1) + X2_norm.reshape(1, -1) - 2*np.dot(X1, X2.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical rows; a squared distance cannot be negative.
        distances = np.maximum(distances, 0)

        arg = (distances / (self.sigma**2)) + 1
        return 1 / arg

In [None]:
class tanh_kernel():
    '''
    Hyperbolic tangent (sigmoid) kernel for a given k and c:
        K(x, y) = tanh(k * x.y + c)
    (VECTORISED)
    ========================================
    Input
    ========================================
    X1: 2D numpy array
    X2: 2D numpy array
    k: Scale hyper parameter
    c: Offset hyper parameter

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array)
    '''
    def __init__(self, k=1, c=0):
        self.k, self.c = k, c

    def fit(self, X1, X2):
        # The scale is applied to X1 before the matrix product.
        scaled_inner = (self.k * X1) @ X2.T
        return np.tanh(scaled_inner + self.c)

In [10]:
class histogram_intersection_kernel():
    '''
    Histogram intersection kernel:
        K(x, y) = sum_i min(x_i, y_i)
    (Computed pair by pair; only the sum over features is vectorised.)
    ========================================
    Input
    ========================================
    X1: 2D numpy array
    X2: 2D numpy array

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array)
    '''
    def fit(self, X1, X2):
        cov = np.zeros((X1.shape[0], X2.shape[0]))
        # One entry per (row of X1, row of X2) pair.
        for row, left in enumerate(X1):
            for col, right in enumerate(X2):
                cov[row, col] = np.minimum(left, right).sum()

        return cov

### Testing kernels...

In [11]:
# Heuristic length-scale derived from the training data: the number of
# features times the overall variance of Xtrain, halved, then square-rooted.
M = Xtrain.shape[1]
var = np.var(Xtrain)
sigma = np.sqrt(0.5 * M * var)

# Candidate kernels; each object's .fit method is passed to SVC as a
# callable kernel. The trailing notes are the author's observations.
#'linear'                            # It's ok
gaus = gaussian_kernel(sigma=sigma) # ls = 140   # Pretty good
tanh = tanh_kernel(k=sigma, c=0)     # This is bad, sk's sigmoid is better though
log = log_kernel(d=2)                # Great, no optimal hyperparameter though!
hist = histogram_intersection_kernel()  # Great and no hyper parameters!
stud = students_t_kernel(d=1)        # Bad, massively overfits
cauchy = cauchy_kernel(sigma=sigma) # Great, doesn't overfit as much

In [12]:
# Cross-validating the image SVM with the Cauchy kernel on the training set.
svc = SVMClassifier(kernel=cauchy.fit)
test_accs, train_accs = cross_validate(svc, Xtrain, ytrain, boost=False, seed=None)

# Per-fold accuracies followed by their mean (test first, then train).
print('')
print(test_accs, np.mean(test_accs), '\n')
print(train_accs, np.mean(train_accs))

"\n#svc = OneVsRestClassifier(SVC(kernel=cauchy.fit))\nsvc = SVMClassifier(kernel=cauchy.fit)\ntest_accs, train_accs = cross_validate(svc, Xtrain, ytrain, boost=False, seed=None)\n\nprint('')\nprint(test_accs, np.mean(test_accs), '\n')\nprint(train_accs, np.mean(train_accs))\n"

In [None]:
# Fitting the image SVM with the histogram intersection kernel on the full
# training set and scoring it on the held-out test set.
mySVC = SVMClassifier(kernel=hist.fit)
mySVC.fit_image(Xtrain, ytrain)
y_pred = mySVC.predict_image(Xtest)
# First 'accuracy' line printed is the test-set accuracy...
print('accuracy', accuracy_score(ytest, y_pred))
trainy_pred = mySVC.predict_image(Xtrain)
# ...and the second is the training-set accuracy.
print('accuracy', accuracy_score(ytrain, trainy_pred))

In [13]:
# Sweep over 250 length-scales, keeping the one with the best accuracy.
# NOTE(review): g_kernel is not defined anywhere in the visible file, so
# this cell only runs if it exists in the session -- confirm.
# NOTE(review): the best length-scale is chosen using accuracy on
# Xtest/ytest, i.e. the hyper-parameter is tuned on the test set.
lss = np.linspace(0.01, 7, 250)
sf = 1
best_acc = 0
for i, ls in enumerate(lss):
    print('\nIter:', i+1)
    print('===========')
    print('Length-scale = {}, sigma_f = {}\n'.format(ls,sf))
    g = g_kernel(length_scale=ls, sigma_f=sf)
    sc = OneVsRestClassifier(SVC(kernel=g.out))
    sc.fit(Xtrain, ytrain)
    y_pred = sc.predict(Xtest)
    acc = accuracy_score(ytest, y_pred)
    print('accuracy', acc)
    # best_ls / best_sf are only assigned once some accuracy exceeds 0.
    if acc > best_acc:
        best_acc = acc
        best_ls = ls
        best_sf = sf
        print('New Best!')
print('\n\nOverall best acc =', best_acc)
print(best_ls, best_sf)



#### Task 3

Sentiment analysis of the movie review dataset. The dataset consists of movie reviews, each labelled with a sentiment that is either positive or negative. I will train a boosting-based classifier and cross-validate it on the dataset provided. The method will then be evaluated against an external test set.

#### Process the text and obtain a bag of words-based features 

In [16]:
def extract_reviews(filename):
    '''Read a review CSV file into text and label arrays.

    The first row is treated as a header and skipped. Column 0 holds the
    review text; column 1 holds the sentiment, where 'positive' maps to 1
    and anything else maps to 0. The file is decoded as Latin-1.

    Returns (X, y) as numpy arrays of review strings and integer labels.
    '''
    texts, labels = [], []
    with open(filename, 'r', newline='', encoding='Latin1') as handle:
        rows = csv.reader(handle)
        # Skipping the header row (no error if the file is empty)...
        next(rows, None)
        for row in rows:
            texts.append(row[0])
            labels.append(1 if row[1] == 'positive' else 0)

    # Converting to numpy arrays...
    return np.array(texts), np.array(labels)

In [17]:
# Lower-cases, tokenises, filters and stems every review...
def clean_text(reviews):
    '''Clean each review in place and return the same container.

    A review is lower-cased and tokenised; English stop-words,
    punctuation and any token that is not purely alphabetic are dropped,
    the remaining tokens are Lancaster-stemmed and re-joined with single
    spaces.

    reviews: indexable, mutable sequence of strings (it is overwritten).
    '''
    excluded = set(stopwords.words('english')) | set(string.punctuation)

    for idx, raw in enumerate(reviews):
        kept = []
        for tok in word_tokenize(raw.lower()):
            # Skipping stop-words, punctuation and non-alphabetic tokens...
            if tok in excluded or not tok.isalpha():
                continue
            kept.append(lanc.stem(tok))
        reviews[idx] = ' '.join(kept)
    return reviews

In [None]:
def extract_bag_of_words(train_file):
    '''Load, shuffle, split (80/20) and clean the reviews in `train_file`.

    Returns (Xtrain, ytrain, Xtest, ytest), where the X arrays hold the
    cleaned review strings (not yet vectorised) and the y arrays hold the
    0/1 sentiment labels.

    Note: seeds numpy's global random state with 5.
    '''
    # train_test_split is not imported at the top of the file, so it is
    # brought into scope here.
    from sklearn.model_selection import train_test_split

    # Extracting training reviews from CSV file...
    X, y = extract_reviews(train_file)

    # Randomising reviews...
    np.random.seed(5)
    shuffle = np.random.permutation(X.shape[0])
    X = X[shuffle]
    y = y[shuffle]

    # Train/Test split...
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.8, random_state=123)

    # Removing stop-words, punctuation and stem words...
    Xtrain = clean_text(Xtrain)
    Xtest = clean_text(Xtest)

    return (Xtrain, ytrain, Xtest, ytest)

In [1]:
# Tf-idf vectoriser over uni-grams and bi-grams...
class process_text():
    '''Thin wrapper around TfidfVectorizer(ngram_range=(1,2)).

    sparse: when equal to True the tf-idf matrix is returned as produced
            by the vectoriser; otherwise it is converted with .toarray().
    '''
    def __init__(self, sparse=True):
        self.sparse = sparse
        self.tfvec = TfidfVectorizer(ngram_range=(1,2))#, max_features=10000)

    def _as_requested(self, matrix):
        '''Return `matrix` unchanged or densified, depending on self.sparse.'''
        return matrix if self.sparse == True else matrix.toarray()

    def fit_transform(self, X):
        '''Learn the vocabulary/idf weights from X and return its tf-idf matrix.'''
        return self._as_requested(self.tfvec.fit_transform(X))

    def transform(self, X):
        '''Return the tf-idf matrix of X using the already fitted vectoriser.'''
        return self._as_requested(self.tfvec.transform(X))

In [5]:
def extract_bag_of_words_train_test(train_file, test_file, sparse=True):
    '''Build tf-idf features for separate train and test review files.

    The vectoriser is fitted on the cleaned training reviews only and is
    then applied to the cleaned test reviews, so no test data influences
    the vocabulary or idf weights.

    sparse: forwarded to process_text to select the output format.

    Returns (Xtrain, ytrain, Xtest, ytest).
    '''
    vectoriser = process_text(sparse=sparse)

    # Training data: extract, clean, then fit + transform...
    raw_train, ytrain = extract_reviews(train_file)
    Xtrain = vectoriser.fit_transform(clean_text(raw_train))

    # Test data: extract, clean, then transform with the fitted vectoriser...
    raw_test, ytest = extract_reviews(test_file)
    Xtest = vectoriser.transform(clean_text(raw_test))

    return (Xtrain, ytrain, Xtest, ytest)

### Cross-validation on training set...

In [20]:
# Extracting and cleaning review data...
# Xtrain/Xtest hold cleaned review strings at this point (not yet vectorised).
Xtrain, ytrain, Xtest, ytest = extract_bag_of_words('movie_review_train.csv')

In [21]:
# Cross-validating the boosting classifier on the cleaned training reviews;
# text=True makes cross_validate fit the tf-idf vectoriser inside each fold.
bc = BoostingClassifier(n_its=100, max_depth=2, print_progress=False)
test_accs, train_accs = cross_validate(bc, Xtrain, ytrain, boost=True, seed=None, text=True)

# Per-fold accuracies followed by their mean (test first, then train).
print('')
print(test_accs, np.mean(test_accs), '\n')
print(train_accs, np.mean(train_accs))

"\nbc = BoostingClassifier(n_its=100, max_depth=2, print_progress=False)\ntest_accs, train_accs = cross_validate(bc, Xtrain, ytrain, boost=True, seed=None, text=True)\n\nprint('')\nprint(test_accs, np.mean(test_accs), '\n')\nprint(train_accs, np.mean(train_accs))\n"

In [22]:
# Vectorising and converting to tf-idf format (fitting to train data only)...
# From here on Xtrain/Xtest are tf-idf matrices rather than strings.
pt = process_text()
Xtrain = pt.fit_transform(Xtrain)
Xtest = pt.transform(Xtest)

## Testing predictions...
# NOTE(review): `svd` is never assigned in the visible file (its only
# assignment is commented out), so it must already exist in the session.
svd.fit(Xtrain, ytrain)
svm_pred_test_y = svd.predict(Xtest)

'\n# Vectorising and converting to tf-idf format (fitting to train data only)...\npt = process_text()\nXtrain = pt.fit_transform(Xtrain)\nXtest = pt.transform(Xtest)\n\n## Testing predictions...\nsvd.fit(Xtrain, ytrain)\nsvm_pred_test_y = svd.predict(Xtest)\n'

In [23]:
# Reporting accuracy of the previously fitted `svd` model.
# NOTE(review): the lines that would create and fit `svd` are commented
# out, so this cell relies on `svd` and `svm_pred_test_y` already being
# defined in the session.
from sklearn.linear_model import SGDClassifier
#svd = SGDClassifier(tol=None)
# Training...
#svd.fit(Xtrain, ytrain)
# Predicting...
#y_test_pred = svd.predict(Xtest)
print('Test set accuracy =', accuracy_score(ytest, svm_pred_test_y)*100,'%')
svm_pred_train_y = svd.predict(Xtrain)
print('Train set accuracy =', accuracy_score(ytrain, svm_pred_train_y)*100,'%')

"\nfrom sklearn.linear_model import SGDClassifier\n#svd = SGDClassifier(tol=None)\n# Training...\n#svd.fit(Xtrain, ytrain)\n# Predicitng...                \n#y_test_pred = svd.predict(Xtest)\nprint('Test set accuracy =', accuracy_score(ytest, svm_pred_test_y)*100,'%')\nsvm_pred_train_y = svd.predict(Xtrain)\nprint('Train set accuracy =', accuracy_score(ytrain, svm_pred_train_y)*100,'%')\n"

In [24]:
# Baseline: a single depth-2 decision tree (the same kind of weak learner
# that BoostingClassifier fits in each round), scored on the test set.
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(Xtrain, ytrain)
pred_y = tree.predict(Xtest)
print('Test set accuracy =', accuracy_score(ytest, pred_y))
print(pred_y)

"\ntree = DecisionTreeClassifier(max_depth=2)\ntree.fit(Xtrain, ytrain)\npred_y = tree.predict(Xtest)\nprint('Test set accuracy =', accuracy_score(ytest, pred_y))\nprint(pred_y)\n"

In [25]:
# Boosting classifier (100 rounds of depth-4 trees) on the tf-idf features...
bc = BoostingClassifier(n_its=100, max_depth=4, print_progress=True)
# Training...
bc.fit(Xtrain, ytrain)
# Predicting on the test set...
bc_pred_test_y = bc.boost_predict(Xtest)
print('Test set accuracy =', accuracy_score(ytest, bc_pred_test_y))
# Predicting on the training set...
bc_pred_train_y = bc.boost_predict(Xtrain)
print('Train set accuracy =', accuracy_score(ytrain, bc_pred_train_y))

"\n# Boosting classifier...\nbc = BoostingClassifier(n_its=100, max_depth=4, print_progress=True)\n# Training...\nbc.fit(Xtrain, ytrain)\n# Predicitng...                \nbc_pred_test_y = bc.boost_predict(Xtest)\nprint('Test set accuracy =', accuracy_score(ytest, bc_pred_test_y))\nbc_pred_train_y = bc.boost_predict(Xtrain)\nprint('Train set accuracy =', accuracy_score(ytrain, bc_pred_train_y))\n"

#### Task 4

Classifying the movie review dataset using a Support Vector Machine (SVM) with tailored kernels.

In [27]:
# Initialising SVM...
svc = SVMClassifier(kernel='sigmoid')
# Extracting and cleaning review data...
# extract_bag_of_words returns (Xtrain, ytrain, Xtest, ytest); only the
# cleaned training split is used for cross-validation, and the other two
# values are discarded so that the vectorised Xtrain/Xtest used below are
# not overwritten.
X, y, _, _ = extract_bag_of_words('movie_review_train.csv')
# Performing cross-validation...
test_accs, train_accs = cross_validate(svc, X, y, boost=False, seed=None, text=True)

# Fitting (Xtrain/Xtest here are the tf-idf matrices built earlier)...
svc.fit_text(Xtrain, ytrain)
# Predicting...
text_svc_test_pred = svc.predict_text(Xtest)

# Printing accuracy...
print('Test set accuracy =', accuracy_score(ytest, text_svc_test_pred))
text_svc_train_pred = svc.predict_text(Xtrain)
print('Train set accuracy =', accuracy_score(ytrain, text_svc_train_pred))

"\n# Intialising SVM...\nsvc = SVMClassifier(kernel='sigmoid') # 'rbf'\n# Extracting and cleaning review data...\n#X,y = extract_bag_of_words('movie_review_train.csv')\n# Performing cross-validation...\n#test_accs, train_accs = cross_validate(svc, X, y, boost=False, seed=None, text=True)\n\n# Fitting...\nsvc.fit_text(Xtrain, ytrain)\n# Predicitng...  \ntext_svc_test_pred = svc.predict_text(Xtest)\n\n# Printing accuracy...\nprint('Test set accuracy =', accuracy_score(ytest, text_svc_test_pred))\ntext_svc_train_pred = svc.predict_text(Xtrain)\nprint('Train set accuracy =', accuracy_score(ytrain, text_svc_train_pred))\n"

In [None]:
class spline_kernel():
    '''
    Spline kernel: the product over features of
        1 + x*y + x*y*min(x, y) - (x + y)/2 * min(x, y)^2 + min(x, y)^3 / 3
    (Computed pair by pair; only the per-feature terms are vectorised.)
    ========================================
    Input
    ========================================
    X1: 2D numpy array
    X2: 2D numpy array

    ========================================
    Output
    ========================================
    Covariance matrix (2D numpy array)
    '''
    def fit(self, X1, X2):
        cov = np.zeros((X1.shape[0], X2.shape[0]))
        # One entry per (row of X1, row of X2) pair.
        for row, left in enumerate(X1):
            for col, right in enumerate(X2):
                smaller = np.minimum(left, right)
                product = left * right
                total = left + right

                terms = 1 + product + (product*smaller) - (0.5*total*(smaller**2)) + ((smaller**3)/3)
                cov[row, col] = np.prod(terms)

        return cov