In [101]:

def data_preprocessing(mode='train'):
	"""
	:param mode: str, indicating if it's training mode or testing mode
	:return: Tuple(numpy_array, numpy_array), the first one is X, the other one is Y
	"""
	data_lst = []
	label_lst = []
	first_data = True
	if mode == 'train':
		with open(TRAIN, 'r') as f:
			for line in f:
				data = line.split(',')
				# ['0PassengerId', '1Survived', '2Pclass', '3Last Name', '4First Name', '5Sex', '6Age', '7SibSp', '8Parch', '9Ticket', '10Fare', '11Cabin', '12Embarked']
				if first_data:
					first_data = False
					continue
				if not data[6]:
					continue
				label = [int(data[1])]
				if data[5] == 'male':
					sex = 1
				else:
					sex = 0
				# ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
				passenger_lst = [int(data[2]), sex, float(data[6]), int(data[7]), int(data[8]), float(data[10])]
				data_lst.append(passenger_lst)
				label_lst.append(label)
	else:
		pass
	return np.array(data_lst).T, np.array(label_lst).T

def normalize(X):
    """
    :param X: numpy_array, the dimension is (n, m)
    :return: numpy_array, the values are normalized, where the dimension is still (n, m)
    """
    min_arr = np.amin(X, axis = 1, keepdims = True)
    max_arr = np.amax(X, axis = 1, keepdims = True)
    return (X - min_arr) / (max_arr - min_arr)



In [104]:

def batch_gradient_descent(X, Y):
	"""
	:param X: numpy_array, the array holding all the training data
	:param Y: numpy_array, the array holding all the ture labels in X
	:return: numpy_array, the trained weights with dimension (n, m)
	"""
	np.random.seed(0)
	n,m = X.shape

	# Initialize w and b
	####################
	#                  #
	#       TODO:      #
	#                  #
	####################


	W = np.random.rand(n,1) - 0.5 # -0.5 : zero centre
	b = np.random.rand(1,1) - 0.5

	# Start Training
	####################
	#                  #
	#       TODO:      #
	#                  #
	####################

	for epoch in range(NUM_EPOCHS):
    		#Forward prop

			K = W.T.dot(X) + b
			H = 1/(1+np.exp(-K))
			L = -(Y*np.log(H)+(1-Y)*np.log(1-H))
			J = (1/m)*np.sum(L)

			if epoch % 10000 == 0:
    				print('Cost:', J)
			
			#W = W - ALPHA * ((1/m)) * np.sum(X.dot((H-Y).T), axis=1, keepdims=True)
			W = W - ALPHA * ((1/m)) * X.dot((H-Y).T)
			#b = b - ALPHA * ((1/m) * np.sum(H-Y))
	return W, b


In [105]:
import time


TRAIN = 'titanic_data/train.csv'
NUM_EPOCHS = 60000
ALPHA = 0.05

start = time.time()
X_train, Y = data_preprocessing()
print('Y.shape', Y.shape)
print('X.shape', X_train.shape)
# ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
n, m = X_train.shape
X = normalize(X_train)
W, b = batch_gradient_descent(X, Y)
#print(W)

scores = W.T.dot(X) + b
predictions = np.where(scores > 0, 1,0)
acc = np.equal(predictions, Y)
num_acc = np.sum(acc)
print('Acc:', num_acc/m)
###############################
#                             #
#            TODO:            #
#                             #
###############################
end = time.time()
print('Total run time (Batch-GD):', end-start)

Y.shape (1, 714)
X.shape (6, 714)
Cost: 0.7420080950021505
Cost: 0.5217881448785981
Cost: 0.5170357279862925
Cost: 0.5147853895420947
Cost: 0.513581704643919
Cost: 0.5128975609257845
Acc: 0.7773109243697479
Total run time (Batch-GD): 2.5509510040283203


In [106]:
np.random.seed(0)
n,m = X.shape

# Initialize w and b
####################
#                  #
#       TODO:      #
#                  #
####################


W = np.random.rand(n,1) - 0.5 # -0.5 : zero centre
b = np.random.rand(1,1) - 0.5

# Start Training
####################
#                  #
#       TODO:      #
#                  #
####################

K = W.T.dot(X) + b
H = 1/(1+np.exp(-K))
L = -(Y*np.log(H)+(1-Y)*np.log(1-H))
J = (1/m)*np.sum(L)


#W = W - ALPHA * ((1/m)) * np.sum(X.dot((H-Y).T), axis=1, keepdims=True)
W = W - ALPHA * ((1/m)) * X.dot((H-Y).T)
#b = b - ALPHA * ((1/m) * np.sum(H-Y))

In [107]:
X.dot((H-Y).T)

array([[112.45540762],
       [159.73898287],
       [ 40.17331425],
       [ 10.45838889],
       [  1.25952428],
       [ -3.62167248]])

In [112]:
X.dot((H-Y).T)

array([[112.45540762],
       [159.73898287],
       [ 40.17331425],
       [ 10.45838889],
       [  1.25952428],
       [ -3.62167248]])

In [117]:
arr = np.array([[4,9,5],[3,2,9],[9,9,8],[1,2,3]])

In [118]:
arr[0]

array([4, 9, 5])

In [121]:
arr[1]

array([3, 2, 9])