Optimize the Logisit Regression Model using

1. Initialize weight using uniform random method

2. Implement adagrad so that weight decay as epoh increase

3. Implement L2 regulization


Ensemble Generative Model with Logistic Model

Result: 

Private Score: 0.89095

Public Score: 0.88914
![替代文字](https://drive.google.com/uc?id=1h3NnRiUkG7Nve7klqFEnGt8qepvuTsM9)

Ranked: 104/285

![替代文字](https://drive.google.com/uc?id=15KsiibD4Wb4xewFNtRx-REG1O1FFP4od)

Dataset

In [1]:
!gdown --id '1KSFIRh0-_Vr7SdiSCZP1ItV7bXPxMD92' --output data.tar.gz
!tar -zxvf data.tar.gz
!ls

Downloading...
From: https://drive.google.com/uc?id=1KSFIRh0-_Vr7SdiSCZP1ItV7bXPxMD92
To: /content/data.tar.gz
0.00B [00:00, ?B/s]6.11MB [00:00, 95.7MB/s]
data/
data/sample_submission.csv
data/test_no_label.csv
data/train.csv
data/X_test
data/X_train
data/Y_train
data  data.tar.gz  sample_data


Preparing Data

In [120]:
import numpy as np

np.random.seed(0)
X_train_fpath = './data/X_train'
Y_train_fpath = './data/Y_train'
X_test_fpath = './data/X_test'
output_fpath = './output_{}.csv'

# Parse csv files to numpy array
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    # This function normalizes specific columns of X.
    # The mean and standard variance of training data will be reused when processing testing data.
    #
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing training data, 'False' for testing data
    #     specific_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     X_mean: mean value of training data, used when train = 'False'
    #     X_std: standard deviation of training data, used when train = 'False'
    # Outputs:
    #     X: normalized data
    #     X_mean: computed mean value of training data
    #     X_std: computed standard deviation of training data

    if specified_column == None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column] ,0).reshape(1, -1)
        X_std  = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:,specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
     
    return X, X_mean, X_std

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function spilts data into training set and development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

def _train_dev_split_inverse(X, Y, dev_ratio = 0.25):
    # This function spilts data into training set and development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[train_size:], Y[train_size:], X[:train_size], Y[:train_size]

# Normalize training and testing data
X_train, X_mean, X_std = _normalize(X_train, train = True)
X_test, _, _= _normalize(X_test, train = False, specified_column = None, X_mean = X_mean, X_std = X_std)

with open(X_test_fpath) as f:
  content = f.readline().strip('\n').split(',')
features = np.array(content)

train_size = X_train.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of testing set: {}'.format(test_size))
print('Dimension of data: {}'.format(data_dim))

Size of training set: 54256
Size of testing set: 27622
Dimension of data: 510


Useful Function

In [6]:
def _shuffle(X, Y):
    # This function shuffles two equal-length list/array, X and Y, together.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _sigmoid(z):
    # Sigmoid function can be used to calculate probability.
    # To avoid overflow, minimum/maximum output value is set.
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))

def _f(X, w, b):
    # This is the logistic regression function, parameterized by w and b
    #
    # Arguements:
    #     X: input data, shape = [batch_size, data_dimension]
    #     w: weight vector, shape = [data_dimension, ]
    #     b: bias, scalar
    # Output:
    #     predicted probability of each row of X being positively labeled, shape = [batch_size, ]
    return _sigmoid(np.matmul(X, w) + b)

def _predict(X, w, b):
    # This function returns a truth value prediction for each row of X 
    # by rounding the result of logistic regression function.
    return np.round(_f(X, w, b)).astype(np.int)

def _predict_prob(X, w, b):
    # This function returns a truth value prediction for each row of X 
    # by rounding the result of logistic regression function.
    return _f(X, w, b)
    
def _accuracy(Y_pred, Y_label):
    # This function calculates prediction accuracy
    acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    return acc

def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross entropy.
    #
    # Arguements:
    #     y_pred: probabilistic predictions, float vector
    #     Y_label: ground truth labels, bool vector
    # Output:
    #     cross entropy, scalar
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

def _gradient(X, Y_label, w, b):
    # This function computes the gradient of cross entropy loss with respect to weight w and bias b.
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad


Generative Model Defination

In [101]:
def Generative(X_train, Y_train):

  # Compute in-class mean
  X_train_0 = np.array([x for x, y in zip(X_train, Y_train) if y == 0])
  X_train_1 = np.array([x for x, y in zip(X_train, Y_train) if y == 1])

  mean_0 = np.mean(X_train_0, axis = 0)
  mean_1 = np.mean(X_train_1, axis = 0)  

  # Compute in-class covariance
  cov_0 = np.zeros((data_dim, data_dim))
  cov_1 = np.zeros((data_dim, data_dim))

  for x in X_train_0:
      cov_0 += np.dot(np.transpose([x - mean_0]), [x - mean_0]) / X_train_0.shape[0]
  for x in X_train_1:
      cov_1 += np.dot(np.transpose([x - mean_1]), [x - mean_1]) / X_train_1.shape[0]

  # Shared covariance is taken as a weighted average of individual in-class covariance.
  cov = (cov_0 * X_train_0.shape[0] + cov_1 * X_train_1.shape[0]) / (X_train_0.shape[0] + X_train_1.shape[0])

  # Compute inverse of covariance matrix.
  # Since covariance matrix may be nearly singular, np.linalg.inv() may give a large numerical error.
  # Via SVD decomposition, one can get matrix inverse efficiently and accurately.
  u, s, v = np.linalg.svd(cov, full_matrices=False)
  inv = np.matmul(v.T * 1 / s, u.T)

  # Directly compute weights and bias
  w = np.dot(inv, mean_0 - mean_1)
  b =  (-0.5) * np.dot(mean_0, np.dot(inv, mean_0)) + 0.5 * np.dot(mean_1, np.dot(inv, mean_1))\
      + np.log(float(X_train_0.shape[0]) / X_train_1.shape[0]) 

  # Compute accuracy on training set
  Y_train_pred = 1 - _predict(X_train, w, b)
  print('Training accuracy: {}'.format(_accuracy(Y_train_pred, Y_train)))


  return w, b

Logisit Regression Model Defination

In [102]:
from tqdm.notebook import trange

def LogisticRegression(X_train, Y_train):

  data_dim = X_train.shape[1]
  #use uniform random to initialize weights
  w = np.random.uniform(0.01, -0.01, data_dim)
  b = np.zeros((1,))

  # Some parameters for training    
  max_iter = 200
  batch_size = 32
  learning_rate = 0.02

  # Keep the loss and accuracy at every iteration for plotting
  train_loss = []
  dev_loss = []
  train_acc = []
  dev_acc = []

  eps = 0.0000001
  w_adagrad = np.zeros(data_dim)
  b_adagrad = 0

  l2_regulization = 0.00001

  epoh = 0
  early_stop = 0
  for epoh in trange(max_iter):
    # Random shuffle at the begging of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)

    #mini-batch
    for idx in range(train_size//batch_size):
      X = X_train[idx*batch_size:(idx+1)*batch_size]
      Y = Y_train[idx*batch_size:(idx+1)*batch_size]

      w_grad, b_grad = _gradient(X, Y, w, b)
      w_adagrad += w_grad ** 2
      b_adagrad += b_grad ** 2

      w = w*(1-learning_rate*l2_regulization) - learning_rate/np.sqrt(w_adagrad+eps) * w_grad
      b = b*(1-learning_rate*l2_regulization) - learning_rate/np.sqrt(b_adagrad+eps) * b_grad
    # Compute loss and accuracy of training set and development set
    y_train_pred = _f(X_train, w, b)
    Y_train_pred = np.round(y_train_pred)
    train_acc.append(_accuracy(Y_train_pred, Y_train))
    train_loss.append(_cross_entropy_loss(y_train_pred, Y_train) / train_size)

  print('Training loss: {}'.format(train_loss[-1]))
  print('Training accuracy: {}'.format(train_acc[-1]))
  logi_pred_prob = _predict_prob(X_test, w, b)

  return train_acc[-1], w, b
  #return w for feature imortance calculation, remove last w that passed from generative model

Append Generative Model Result to Training Set for Ensemble with Logisit Regression Model

Then drop the unimportant features recursively and to find best number of features used

In [116]:
n_remove = 100

best_acc = 0
best_w = None
best_b = None
best_X_test = None

iteration = 2

for t in trange(iteration):

  w, b = Generative(X_train, Y_train)
  gm_Y_train_pred_prob = 1 - _predict_prob(X_train, w, b)
  gm_Y_train_pred_prob = gm_Y_train_pred_prob.reshape(-1, 1)
  X_train = np.append(X_train, gm_Y_train_pred_prob, axis=1)
  
  gm_pred_prob = 1 - _predict_prob(X_test, w, b)
  gm_pred_prob = gm_pred_prob.reshape(-1, 1)
  X_test = np.append(X_test, gm_pred_prob, axis=1)
  
  #remove last n_remove
  
  acc, w, b = LogisticRegression(X_train, Y_train)

  if acc > best_acc:
    best_acc = acc
    best_w = w
    best_X_test = X_test
    best_b = b
  
  if t == (iteration-1):
    break;

  ind = np.argsort(np.abs(w))[::-1]
  drop = []
  for i in ind[-1:-n_remove:-1]:
    drop.append(i)

  X_train = np.delete(X_train, drop, axis=1)
  X_test = np.delete(X_test, drop, axis=1)
  features = np.delete(features, drop, axis=0)
  #gm_Y_train_pred_prob = np.delete(gm_Y_train_pred_prob, drop, axis=1)

  train_size = X_train.shape[0]
  test_size = X_test.shape[0]
  data_dim = X_train.shape[1]
  print('Size of training set: {}'.format(train_size))
  print('Size of testing set: {}'.format(test_size))
  print('Dimension of data: {}'.format(data_dim))

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Training accuracy: 0.8693232084930699


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Training loss: 0.26670946004648644
Training accuracy: 0.8858375110586848
Size of training set: 54256
Size of testing set: 27622
Dimension of data: 462
Training accuracy: 0.8752580359775878


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Training loss: 0.26703420904325836
Training accuracy: 0.8856900619286346
Size of training set: 54256
Size of testing set: 27622
Dimension of data: 414
Training accuracy: 0.8772485992332645


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


Training loss: 0.26667576387250375
Training accuracy: 0.8856900619286346


In [119]:
best_acc

0.8858375110586848

In [117]:
# Predict testing labels
#X_test, w, b = best_X_test, best_w, best_b
predictions = _predict(X_test, w, b)
with open(output_fpath.format('ensemble'), 'w') as f:
    f.write('id,label\n')
    for i, label in  enumerate(predictions):
        f.write('{},{}\n'.format(i, label))

In [44]:
!gdown --id 1VoF-D1FH0PhIne0pdDMKNmSg-SXfBoKS
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/kaggle.json

Downloading...
From: https://drive.google.com/uc?id=1VoF-D1FH0PhIne0pdDMKNmSg-SXfBoKS
To: /content/kaggle.json
  0% 0.00/65.0 [00:00<?, ?B/s]100% 65.0/65.0 [00:00<00:00, 125kB/s]


In [118]:
!kaggle competitions submit -c ml2020spring-hw2 -f output_ensemble.csv -m "Message"

100% 205k/205k [00:04<00:00, 43.5kB/s]
Successfully submitted to ML2020spring - hw2