In [None]:
import os
import sys

In [None]:
# Run this cell to mount your own Google Drive at /content/drive
# (only works inside Google Colab, where the google.colab package exists).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Please append your own directory after '/content/drive/My Drive/'.
# The directory is added to sys.path, Python's module search path.
### ========== TODO : START ========== ###
sys.path += ['/content/drive/My Drive/CS M146/HW3-code'] 
### ========== TODO : END ========== ###

In [None]:
"""
Author      : Yi-Chieh Wu, Sriram Sankararman
Description : Twitter
"""

from string import punctuation

import numpy as np
import matplotlib.pyplot as plt
# !!! MAKE SURE TO USE SVC.decision_function(X), NOT SVC.predict(X) !!!
# (this makes ``continuous-valued'' predictions)
from sklearn.svm import SVC
#from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Problem: Twitter Analysis Using SVM
In this project, you will be working with Twitter data. Specifically, we have supplied you with a number of tweets that are reviews/reactions to movies, 

e.g., <i> "@nickjfrost just saw The Boat That Rocked/Pirate Radio and I thought it was brilliant! You and the rest of the cast were fantastic! < 3". </i>

You will learn to automatically classify such tweets as either positive or negative reviews. To do this, you will employ Support Vector Machines (SVMs), a popular choice for a large number of classification problems.

Datasets:
* **tweets.txt** contains 630 tweets about movies. Each line in the file contains exactly one tweet, so there are 630 lines in total.

* **labels.txt** contains the corresponding labels. If a tweet praises or recommends a movie, it is classified as a positive review and labeled +1; otherwise it is classified as a negative review and labeled -1. These labels are ordered, i.e. the label for the ith tweet in tweets.txt corresponds to the ith number in labels.txt.

In [None]:
######################################################################
# functions -- input/output
######################################################################

def read_vector_file(fname):
    """
    Loads a numeric vector stored one value per line in a text file.

    Parameters
    --------------------
        fname  -- string, filename

    Returns
    --------------------
        labels -- numpy array of shape (n,)
                    n is the number of non-blank lines in the text file
    """
    # genfromtxt parses the file into floats
    labels = np.genfromtxt(fname)
    return labels


def write_label_answer(vec, outfile):
    """
    Saves the predicted label vector to a text file.

    Nothing is written (and an error message is printed) unless the vector
    has exactly 70 rows.

    Parameters
    --------------------
        vec     -- numpy array of shape (n,) or (n,1), predicted scores
        outfile -- string, output filename
    """

    # for this project, you should predict 70 labels
    has_expected_rows = (vec.shape[0] == 70)

    if has_expected_rows:
        np.savetxt(outfile, vec)
    else:
        print("Error - output vector should have 70 rows.")
        print("Aborting write.")
    

## 1. Feature Extraction

Here we use a bag-of-words model to convert each tweet into a feature vector. A bag-of-words model treats a text file as a collection of words, disregarding word order. The first step in building a bag-of-words model involves building a "dictionary". A dictionary contains all of the unique words in the text file. 

For this project, punctuation marks are included in the dictionary as well, and all words are lowercased. For example, a text file containing <i>"John likes movies. Mary likes movies2!!"</i> will have a dictionary $\textbf{\{'john':0, 'likes':1, 'movies':2, '.':3, 'mary':4, 'movies2':5, '!':6\}}$, where each word is indexed in order of first appearance.

The $\textbf{(key,value)}$ pairs are $\textbf{(word, index)}$, where the index keeps track of the number of unique words (size of the dictionary). Given a dictionary containing $d$ unique words, we can transform the $n$ variable-length tweets into $n$ feature vectors of length $d$ by setting the $i^{th}$ element of the $j^{th}$ feature vector to 1 if the $i^{th}$ dictionary word is in the $j^{th}$ tweet, and 0 otherwise.

In [None]:
######################################################################
# functions -- feature extraction
######################################################################

def extract_words(input_string):
    """
    Tokenizes input_string into lowercase "words": text is split on
    whitespace, and every punctuation character becomes its own word.

    Parameters
    --------------------
        input_string -- string of characters

    Returns
    --------------------
        words        -- list of lowercase "words"
    """

    # pad each punctuation mark with spaces so that split() isolates it
    padding = str.maketrans({mark: ' ' + mark + ' ' for mark in punctuation})
    padded = input_string.translate(padding)
    words = padded.lower().split()
    return words


def extract_dictionary(infile):
    """
    Builds the vocabulary of a text file: every distinct word/punctuation
    mark produced by extract_words is mapped to a unique integer index,
    assigned in order of first appearance.

    Parameters
    --------------------
        infile    -- string, filename

    Returns
    --------------------
        word_list -- dictionary, (key, value) pairs are (word, index)
    """

    word_list = {}
    with open(infile, 'r') as fid :
        ### ========== TODO : START ========== ###
        # part 1a: process each line to populate word_list

        for tweet in fid: # one tweet per line
            for token in extract_words(tweet):
                # a new word receives the next free index, which is the
                # current dictionary size; known words are left untouched
                word_list.setdefault(token, len(word_list))

        ### ========== TODO : END ========== ###

    return word_list


def extract_feature_vectors(infile, word_list):
    """
    Produces a bag-of-words representation of a text file specified by the
    filename infile based on the dictionary word_list.

    Row j of the result describes line j of the file; column i is 1 when the
    dictionary word with index i occurs in that line and 0 otherwise. Words
    that are not in word_list are ignored.

    Parameters
    --------------------
        infile         -- string, filename
        word_list      -- dictionary, (key, value) pairs are (word, index)

    Returns
    --------------------
        feature_matrix -- numpy array of shape (n,d)
                          boolean (0,1) array indicating word presence in a string
                            n is the number of lines in the text file
                            d is the number of entries in word_list
    """

    # Read the file once inside a context manager. The previous line count
    # used open(infile, 'rU'), which leaked the handle and fails on
    # Python 3.11+, where the 'U' mode flag was removed.
    with open(infile, 'r') as fid :
        tweets = [extract_words(line) for line in fid]

    feature_matrix = np.zeros((len(tweets), len(word_list)))

    ### ========== TODO : START ========== ###
    # part 1b: process each line to populate feature_matrix
    for line_num, tweet in enumerate(tweets):
        # look up only the distinct words of this tweet instead of scanning
        # the whole dictionary for every line
        columns = [word_list[word] for word in set(tweet) if word in word_list]
        if columns:
            feature_matrix[line_num, columns] = 1
    ### ========== TODO : END ========== ###

    return feature_matrix

## 2. Hyperparameter Selection for a Linear-Kernel SVM and an RBF-kernel SVM

Next, we will learn a classifier to separate the training data into positive and negative tweets. For the classifier, we will use SVMs with two different kernels: linear and radial basis function (RBF). 

The **sklearn.svm.SVC** class was used and only three of the initialization parameters: **kernel**, **gamma**, and **C** were explicitly set. **SVC.fit(X,y)** was used to train the SVM, but in lieu of using **SVC.predict(X)** to make predictions, **SVC.decision_function(X)**, which returns the (signed) distance of the samples to the separating hyperplane, was used instead.

SVMs have hyperparameters that must be set by the user. For both linear and RBF-kernel SVMs, hyperparameters were selected using 5-fold cross-validation (CV). Using 5-fold CV, we will select the hyperparameters that lead to the ‘best’ mean performance across all 5 folds.

In [None]:
######################################################################
# functions -- evaluation
######################################################################

def performance(y_true, y_pred, metric="accuracy"):
    """
    Calculates the performance metric based on the agreement between the
    true labels and the predicted labels.

    All metrics except 'auroc' are computed on binary labels obtained by
    taking the sign of y_pred (a score of exactly 0 counts as +1). 'auroc'
    is computed from the continuous scores themselves, because thresholding
    first collapses the ROC curve to a single operating point.

    Parameters
    --------------------
        y_true -- numpy array of shape (n,), known labels
        y_pred -- numpy array of shape (n,), (continuous-valued) predictions
        metric -- string, option used to select the performance measure
                  options: 'accuracy', 'f1_score', 'auroc', 'precision',
                           'sensitivity', 'specificity'

    Returns
    --------------------
        score  -- float, performance score

    Raises
    --------------------
        ValueError -- if metric is not one of the supported options
    """
    # map continuous-valued predictions to binary labels
    y_label = np.sign(y_pred)
    y_label[y_label==0] = 1

    ### ========== TODO : START ========== ###
    # part 2a: compute classifier performance
    if metric == "accuracy":
        score = metrics.accuracy_score(y_true, y_label)
    elif metric == "f1_score":
        score = metrics.f1_score(y_true, y_label)
    elif metric == "auroc":
        # rank-based metric: needs the raw decision values, not the labels
        score = metrics.roc_auc_score(y_true, y_pred)
    elif metric == "precision":
        score = metrics.precision_score(y_true, y_label)
    elif metric in ("sensitivity", "specificity"):
        # fixing the label order guarantees a 2x2 matrix laid out as
        # [[tn, fp], [fn, tp]] even when a class is missing from the data
        mcm = metrics.confusion_matrix(y_true, y_label, labels=[-1, 1])
        tn, fp, fn, tp = mcm.ravel()
        if metric == "sensitivity": # true positive rate
            score = tp / (tp + fn)
        else: # specificity, true negative rate
            score = tn / (tn + fp)
    else:
        raise ValueError("Unknown metric: " + str(metric))
    ### ========== TODO : END ========== ###
    return score

def cv_performance(clf, X, y, kf, metric="accuracy"):
    """
    Runs k-fold cross-validation for a classifier and reports its mean score.

    For every (train, test) split produced by kf, the classifier is refit on
    the training part, its decision_function output on the held-out part is
    scored with performance(), and the per-fold scores are averaged.

    Parameters
    --------------------
        clf    -- classifier (instance of SVC)
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- model_selection.KFold or model_selection.StratifiedKFold
        metric -- string, option used to select performance measure

    Returns
    --------------------
        score   -- float, average cross-validation performance across k folds
    """

    ### ========== TODO : START ========== ###
    # part 2b: compute average cross-validation performance
    fold_scores = []
    for train_rows, held_out_rows in kf.split(X, y): # one pass per fold
        # fit on the k-1 training folds
        clf.fit(X[train_rows], y[train_rows])
        # continuous-valued predictions for the held-out fold
        held_out_pred = clf.decision_function(X[held_out_rows])
        fold_scores.append(performance(y[held_out_rows], held_out_pred, metric))

    return np.average(fold_scores)
    ### ========== TODO : END ========== ###


def select_param_linear(X, y, kf, metric="accuracy"):
    """
    Chooses the regularization constant C of a linear-kernel SVM by
    cross-validation: each candidate in 10^-3 ... 10^2 is scored with
    cv_performance, and the candidate with the highest mean score wins
    (the first one on ties).

    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- model_selection.KFold or model_selection.StratifiedKFold
        metric -- string, option used to select performance measure

    Returns
    --------------------
        C -- float, optimal parameter value for linear-kernel SVM
    """

    print('Linear SVM Hyperparameter Selection based on ' + str(metric) + ':')
    C_range = 10.0 ** np.arange(-3, 3)

    ### ========== TODO : START ========== ###
    # part 2c: select optimal hyperparameter using cross-validation
    c_score = np.array(
        [cv_performance(SVC(kernel = 'linear', C=candidate), X, y, kf, metric)
         for candidate in C_range],
        dtype=float)

    best_position = np.argmax(c_score)
    print(f"For {metric}, cv scores across different parameters are {c_score}")

    return C_range[best_position]
    ### ========== TODO : END ========== ###


def select_param_rbf(X, y, kf, metric="accuracy"):
    """
    Sweeps different settings for the hyperparameters of an RBF-kernel SVM,
    calculating the k-fold CV performance for each setting, then selecting the
    hyperparameters that 'maximize' the average k-fold CV performance.

    The grid is C in 10^-3 ... 10^3 crossed with gamma in 10^-5 ... 10^1.
    On ties, the first maximum in row-major (C outer, gamma inner) order wins.

    Parameters
    --------------------
        X       -- numpy array of shape (n,d), feature vectors
                     n = number of examples
                     d = number of features
        y       -- numpy array of shape (n,), binary labels {1,-1}
        kf      -- model_selection.KFold or model_selection.StratifiedKFold
        metric  -- string, option used to select performance measure

    Returns
    --------------------
        gamma, C -- tuple of floats, optimal parameter values for an RBF-kernel SVM
    """

    print('RBF SVM Hyperparameter Selection based on ' + str(metric) + ':')

    ### ========== TODO : START ========== ###
    # part 3b: create grid, then select optimal hyperparameters using cross-validation
    C_range = 10.0 ** np.arange(-3, 4)
    gamma_range = 10.0 ** np.arange(-5, 2)

    # rows index C, columns index gamma; keeping the scores 2-D means the
    # best pair can be recovered without hard-coding the grid width
    grid_score = np.zeros((len(C_range), len(gamma_range)))

    for c_idx, c in enumerate(C_range):
        for gamma_idx, gamma in enumerate(gamma_range):
            clf = SVC(kernel = 'rbf', C=c, gamma=gamma) # define SVM instance
            grid_score[c_idx, gamma_idx] = cv_performance(clf, X, y, kf, metric)

    best_c_idx, best_gamma_idx = np.unravel_index(np.argmax(grid_score), grid_score.shape)
    best_tuple = (gamma_range[best_gamma_idx], C_range[best_c_idx])
    print(f"For {metric}, the best cv scores across different parameters is {grid_score[best_c_idx, best_gamma_idx]}")

    return best_tuple

    ### ========== TODO : END ========== ###


def performance_test(clf, X, y, metric="accuracy"):
    """
    Scores an already-fitted classifier on a test set under one metric.

    Only the point estimate is returned; no confidence interval is computed.

    Parameters
    --------------------
        clf          -- classifier (instance of SVC)
                          [already fit to data]
        X            -- numpy array of shape (n,d), feature vectors of test set
                          n = number of examples
                          d = number of features
        y            -- numpy array of shape (n,), binary labels {1,-1} of test set
        metric       -- string, option used to select performance measure

    Returns
    --------------------
        score        -- float, classifier performance
    """


    ### ========== TODO : START ========== ###
    # part 4b: return the values of test results under a metric.
    # continuous-valued decision values are passed straight to performance()
    return performance(y, clf.decision_function(X), metric)
    ### ========== TODO : END ========== ###

## 3. Test Set Performance

Apply the two classifiers learned in the previous sections to the test data and measure performance.

Performance was measured using the following metrics: 
* accuracy
* F1-Score
* AUROC
* precision
* sensitivity
* specificity

In [None]:
######################################################################
# main
######################################################################

def main() :
    """
    Runs the Twitter sentiment experiment end to end: builds bag-of-words
    features, selects SVM hyperparameters by 5-fold cross-validation for
    every metric, then trains linear- and RBF-kernel SVMs and prints their
    test-set scores.
    """
    np.random.seed(1234)

    # read the tweets and its labels, change the following two lines to your own path.
    file_path = '/content/drive/My Drive/CS M146/HW3-code/data/tweets.txt'
    label_path = '/content/drive/My Drive/CS M146/HW3-code/data/labels.txt'
    dictionary = extract_dictionary(file_path)
    print(len(dictionary))
    X = extract_feature_vectors(file_path, dictionary)
    y = read_vector_file(label_path)


    metric_list = ["accuracy", "f1_score", "auroc", "precision", "sensitivity", "specificity"]

    ### ========== TODO : START ========== ###
    # part 1c: split data into training (training + cross-validation) and testing set
    # The first 560 tweets are used for training and all remaining tweets for
    # testing. Slice ends are exclusive, so the earlier X[0:559] / X[560:629]
    # silently dropped samples 559 and 629.
    n_train = 560
    X_training, y_training = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    # part 2b: create stratified folds (5-fold CV)
    kf = StratifiedKFold(n_splits=5)
    # part 2d: for each metric, select optimal hyperparameter for linear-kernel SVM using CV
    optimalC_each_metric = np.zeros(len(metric_list))
    for i in range(len(metric_list)):
        C_optimal = select_param_linear(X_training, y_training, kf, metric=metric_list[i])
        optimalC_each_metric[i] = C_optimal
    print(f"Optimal C for each metric is {optimalC_each_metric}")
    # part 3c: for each metric, select optimal hyperparameter for RBF-SVM using CV
    optimalTuple_each_metric = np.zeros((len(metric_list), 2))
    for i in range(len(metric_list)):
        tuple_optimal = select_param_rbf(X_training, y_training, kf, metric=metric_list[i])
        optimalTuple_each_metric[i] = tuple_optimal
    print(f"Optimal gamma and C for each metric is {optimalTuple_each_metric}")
    # part 4a: train linear- and RBF-kernel SVMs with selected hyperparameters
    # NOTE(review): these values are hard-coded rather than taken from the
    # sweeps above -- confirm they still match the printed optima.
    linear_clf = SVC(kernel = 'linear', C=1)
    rbf_clf = SVC(kernel = 'rbf', C=1000, gamma=0.001)

    linear_clf = linear_clf.fit(X_training, y_training)
    rbf_clf = rbf_clf.fit(X_training, y_training)
    # part 4c: test the performance of your two classifiers.
    linear_metric_score = np.zeros(len(metric_list))
    rbf_metric_score = np.zeros(len(metric_list))

    for i in range(len(metric_list)):
        linear_metric_score[i] = performance_test(linear_clf, X_test, y_test, metric_list[i])
        rbf_metric_score[i] = performance_test(rbf_clf, X_test, y_test, metric_list[i])

    print(linear_metric_score)
    print(rbf_metric_score)
    ### ========== TODO : END ========== ###


# Run the experiment when this cell/script is executed directly
# (not when the code is imported as a module).
if __name__ == "__main__" :
    main()

1811




Linear SVM Hyperparameter Selection based on accuracy:
For accuracy, cv scores across different parameters are [0.71019949 0.7119852  0.80701416 0.81586229 0.81406049 0.81406049]
Linear SVM Hyperparameter Selection based on f1_score:
For f1_score, cv scores across different parameters are [0.83053918 0.83140994 0.87553669 0.87589031 0.87427006 0.87427006]
Linear SVM Hyperparameter Selection based on auroc:
For auroc, cv scores across different parameters are [0.5        0.5030303  0.71801376 0.75219913 0.7509333  0.7509333 ]
Linear SVM Hyperparameter Selection based on precision:
For precision, cv scores across different parameters are [0.71019949 0.7114704  0.83460665 0.85568989 0.85530219 0.85530219]
Linear SVM Hyperparameter Selection based on sensitivity:
For sensitivity, cv scores across different parameters are [1.         1.         0.92939873 0.90420886 0.90167722 0.90167722]
Linear SVM Hyperparameter Selection based on specificity:
For specificity, cv scores across different p

# Problem: Boosting vs. Decision Tree

In this exercise, we will compare a Decision Tree (DT) to a Random Forest, i.e., an ensemble of DTs, each trained on a bootstrap sample of the data and restricted to a random subset of the features.

We will explore the effect of two hyperparameters on ensemble performance: 
1. the number of samples in bootstrap sampling; 
2. the maximum number of features to use for each DT. 

Dataset: 
* **titanic_train.csv** contains demographic and ticket information for passengers and their survival status.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
class Data :

    def __init__(self) :
        """
        Container for a tabular dataset.

        Attributes
        --------------------
            X      -- numpy array of shape (n,d), features
            y      -- numpy array of shape (n,), targets
            Xnames -- feature names read from the csv header, or None
            yname  -- target name read from the csv header, or None
        """

        # n = number of examples, d = dimensionality
        self.X = None
        self.y = None

        self.Xnames = None
        self.yname = None

    def load(self, filename, header=0, predict_col=-1) :
        """
        Load csv file into X array of features and y array of labels.

        Parameters
        --------------------
            filename    -- string, path of the comma-separated file
            header      -- integer, number of leading rows to skip; when
                           non-zero, the first line is also read as the
                           column names
            predict_col -- integer column index of the target, or None when
                           the file holds features only
        """

        # numeric payload (header rows skipped)
        with open(filename, 'r') as fid :
            table = np.loadtxt(fid, delimiter=",", skiprows=header)

        # separate features and labels
        if predict_col is None :
            self.X, self.y = table[:,:], None
        elif table.ndim > 1 :
            self.X = np.delete(table, predict_col, axis=1)
            self.y = table[:,predict_col]
        else :
            # a 1-D table holds the target only
            self.X, self.y = None, table[:]

        # without a header there are no names to load
        if header == 0 :
            self.Xnames = None
            self.yname = None
            return

        # load feature and label names from the first line
        with open(filename, 'r') as fid :
            names = fid.readline().rstrip().split(",")

        if predict_col is None :
            self.Xnames, self.yname = names[:], None
        elif len(names) > 1 :
            self.Xnames = np.delete(names, predict_col)
            self.yname = names[predict_col]
        else :
            self.Xnames, self.yname = None, names[0]


# helper functions
def load_data(filename, header=0, predict_col=-1) :
    """Create a Data object and fill it from the given csv file."""
    dataset = Data()
    dataset.load(filename, header=header, predict_col=predict_col)
    return dataset

In [None]:
# Change the path to your own data directory.
# header=1 skips the first line (and reads it as column names);
# predict_col=0 makes the first column the target.
titanic = load_data("/content/drive/My Drive/CS M146/HW3-code/data/titanic_train.csv", header=1, predict_col=0)
# Expose features/targets and their names as notebook-level variables for the cells below.
X = titanic.X; Xnames = titanic.Xnames
y = titanic.y; yname = titanic.yname
n,d = X.shape  # n = number of examples, d =  number of features

In [None]:
def error(clf, X, y, ntrials=100, test_size=0.2) :
    """
    Estimates training and test error of a classifier by repeated random
    train/test splitting. Trial i uses random_state=i for the split, refits
    the classifier, and records 1 - accuracy on both parts; the errors are
    then averaged over all trials.

    Parameters
    --------------------
        clf         -- classifier
        X           -- numpy array of shape (n,d), features values
        y           -- numpy array of shape (n,), target classes
        ntrials     -- integer, number of trials
        test_size   -- passed to train_test_split as the size of the test part

    Returns
    --------------------
        train_error -- float, training error
        test_error  -- float, test error
    """

    per_trial = []  # (train error, test error) for each split
    for seed in range(ntrials):
        X_fit, X_held, y_fit, y_held = train_test_split(
            X, y, test_size=test_size, random_state=seed)
        clf.fit(X_fit, y_fit)

        fit_err = 1 - metrics.accuracy_score(y_fit, clf.predict(X_fit), normalize=True)
        held_err = 1 - metrics.accuracy_score(y_held, clf.predict(X_held), normalize=True)
        per_trial.append((fit_err, held_err))

    train_error = np.mean([fit_err for fit_err, _ in per_trial])
    test_error = np.mean([held_err for _, held_err in per_trial])
    return train_error, test_error


In [None]:
### ========== TODO : START ========== ###
# Part 5(a): Implement the decision tree classifier and report the training error.
print('Classifying using Decision Tree...')
# entropy is used as the split criterion
model = DecisionTreeClassifier(criterion='entropy')
# fit on the full dataset and predict on the same data (training error only)
clf = model.fit(X, y)
y_pred = clf.predict(X)
# training error = fraction of misclassified training examples
train_error = 1 - metrics.accuracy_score(y, y_pred, normalize=True)
print('\t-- training error: %.3f' % train_error)
### ========== TODO : END ========== ###

Classifying using Decision Tree...
	-- training error: 0.014


In [None]:
# Average train/test error of an entropy-based decision tree over the
# repeated random splits performed by error() (default settings).
train_error, test_error = error (DecisionTreeClassifier (criterion = 'entropy'), X, y)
print('\tDecision Tree\t-- avg train error : %.3f\tavg test error : %.3f' %(train_error, test_error))

	Decision Tree	-- avg train error : 0.012	avg test error : 0.239


In [None]:
### ========== TODO : START ========== ###
# Part 5(b): Implement the random forest classifier and adjust the number of samples used in bootstrap sampling.
# candidate bootstrap fractions passed as max_samples
max_sample = np.arange(0.1, 0.9, 0.1)
train_error_vec = np.zeros(len(max_sample))
test_error_vec = np.zeros(len(max_sample))

for i, fraction in enumerate(max_sample):
    model = RandomForestClassifier(criterion="entropy", max_samples=fraction)
    train_error, test_error = error(model, X, y)
    train_error_vec[i], test_error_vec[i] = train_error, test_error

# keep the setting with the lowest average test error
index = np.argmin(test_error_vec)
best_train_error, best_test_error, best_max_sample = (
    train_error_vec[index], test_error_vec[index], max_sample[index])

for value in (best_train_error, best_test_error, best_max_sample):
    print(value)
### ========== TODO : END ========== ###

0.09295254833040423
0.1885314685314685
0.30000000000000004


In [None]:
### ========== TODO : START ========== ###
# Part 5(c): Implement the random forest classifier and adjust the number of features for each decision tree.
# candidate values passed as max_features; max_samples is fixed at 0.3
max_feature = np.arange(1, 8, 1)
train_error_vec = np.zeros(len(max_feature))
test_error_vec = np.zeros(len(max_feature))

for i, feature_count in enumerate(max_feature):
    model = RandomForestClassifier(criterion="entropy", max_features=feature_count ,max_samples=0.3)
    train_error, test_error = error(model, X, y)
    train_error_vec[i], test_error_vec[i] = train_error, test_error

# keep the setting with the lowest average test error
index = np.argmin(test_error_vec)
best_train_error, best_test_error, best_max_feature = (
    train_error_vec[index], test_error_vec[index], max_feature[index])

for value in (best_train_error, best_test_error, best_max_feature):
    print(value)
### ========== TODO : END ========== ###

0.09298769771528997
0.18587412587412586
2
