In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.preprocessing import PolynomialFeatures

In [36]:
df = pd.read_csv('data/train.csv')
df = df.drop(columns = ['Name', 'Ticket', 'Cabin'])
df['gender'] = np.where(df['Sex'] == 'female', 1, 0)
port_dic = {'S': 0, 'C':1, 'Q':2, np.NaN:0}

#Most passangers embarked from Southampton
df['port'] = pd.Series(df['Embarked']).map(port_dic)

df['Fare'] = df['Fare'].astype('int64')
df['Age'] = df['Age'].replace('nan', np.nan)
rand_age = pd.Series(np.random.randint(df['Age'].mean() - df['Age'].std(), df['Age'].mean() + df['Age'].std(),
                            size = 1000))
#Fill missing ages with age that falls within one standard deviation of mean
df['Age'] = df['Age'].fillna(rand_age)

In [41]:
#Set up sigmoid function
def sigmoid(x):
    '''Returns sigmoid function applied to each element'''
    
    return (1/(1 + np.exp(-x)))

In [42]:
#Split data set into train and cross validation
train = df[:int(len(df) *.8)]
cv = df[int(len(df)*.8):]

In [68]:
y_train = np.array(train['Survived'])
y_train = np.reshape(y_train, (len(y_train),1))

y_cv = np.array(cv['Survived'])
y_cv = np.reshape(y_cv, (len(y_cv), 1))

In [101]:
pclass_test = np.array(train['Pclass'])
gender_test = np.array(train['gender'])
fare_test = np.array(train['Fare'])
port_test = np.array(train['port'])
age_test = np.array(train['Age'])
age_pclass_test = np.array(age_test * pclass_test)
x0_test = np.ones((1,len(train)))

X_train = np.vstack((x0_test, pclass_test, gender_test, fare_test, port_test, age_test, age_pclass_test))

(m, n) = X_train.shape
theta = np.zeros((m,1))
lambd = 1

pclass_cv = np.array(cv['Pclass'])
gender_cv = np.array(cv['gender'])
fare_cv = np.array(cv['Fare'])
port_cv = np.array(cv['port'])
x0_cv = np.ones((1,len(cv)))

X_cv = np.vstack((x0_cv, pclass_cv, gender_cv, fare_cv, port_cv))

In [104]:
#Without polynomial features X is 5, 712 and theta is 5, 1)
print(X_train.shape)
print(theta.shape)
X_train

(7, 712)
(7, 1)


array([[ 1.,  1.,  1., ...,  1.,  1.,  1.],
       [ 3.,  1.,  3., ...,  3.,  1.,  1.],
       [ 0.,  1.,  1., ...,  0.,  1.,  0.],
       ...,
       [ 0.,  1.,  0., ...,  1.,  1.,  0.],
       [22., 38., 26., ..., 27., 24., 25.],
       [66., 38., 78., ..., 81., 24., 25.]])

In [113]:
(theta.T @ X_train).shape

(712, 1)

In [118]:
def costFunction(theta, X, y, lambd):
    '''Returns the regularized cost function'''
    m = len(y)
    h = sigmoid(theta.T @ X)
    theta[0] = 0
    x = y.T @ np.log(h)
    J = (-1/m) * np.sum((y.T @ np.log(h)) + ((1-y.T) @ np.log(1-h))) + ((1/(2 * lambd)) * sum(np.square(theta)))
    
    return J
    


In [119]:
costFunction(theta, X_train, y_train, lambd)

ValueError: shapes (1,712) and (1,712) not aligned: 712 (dim 1) != 1 (dim 0)

In [74]:
def gradientDescent(theta, X, y, lambd):
    '''Returns the gradient'''
    m = len(y)
    h = sigmoid(X.T @ theta)
    theta[0] = 0
    grad = (1/m) * np.sum(X @ (h - y)) + ((lambd/m) * theta)
    return grad

In [75]:
gradientDescent(theta, X_train, y_train, lambd)

array([[13.67497893],
       [13.67497893],
       [13.67497893],
       [13.67497893],
       [13.67497893],
       [13.67497893],
       [13.67497893]])

In [76]:
output = opt.fmin_tnc(func = costFunction, x0 = theta.flatten(), fprime = gradientDescent, 
                      args = (X_train, y_train.flatten(), lambd)) 
theta_opt = output[0]
print(theta_opt)

[ 0.        -0.0028296 -0.0028296 -0.0028296 -0.0028296 -0.0028296
 -0.0028296]


In [79]:
#pred = [sigmoid(np.dot(X_cv.T, theta_opt)) >= .05]
#np.mean(pred == y_cv.flatten())

In [78]:
pred = [sigmoid(np.dot(X_train.T, theta_opt)) >= .05]
np.mean(pred == y_train.flatten())

0.3904494382022472

In [None]:
#0.3904494382022472