In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.preprocessing import PolynomialFeatures

In [2]:
df = pd.read_csv('data/train.csv')

df['gender'] = np.where(df['Sex'] == 'female', 1, 0)
port_dic = {'S': 0, 'C':1, 'Q':2, np.NaN:0}

#Most passangers embarked from Southampton
df['port'] = pd.Series(df['Embarked']).map(port_dic)

df['Fare'] = df['Fare'].astype('int64')
df['Age'] = df['Age'].replace('nan', np.nan)
rand_age = pd.Series(np.random.randint(df['Age'].mean() - df['Age'].std(), df['Age'].mean() + df['Age'].std(),
                            size = 1000))
#Fill missing ages with age that falls within one standard deviation of mean
df['Age'] = df['Age'].fillna(rand_age)
df = df.drop(columns = ['Name', 'Ticket', 'Cabin', 'Sex', 'PassengerId', 'Embarked'])

In [3]:
df_test = pd.read_csv('data/test.csv')
df_test['gender'] = np.where(df_test['Sex'] == 'female', 1, 0)
port_dic = {'S': 0, 'C':1, 'Q':2, np.NaN:0}

#Most passangers embarked from Southampton
df_test['port'] = pd.Series(df_test['Embarked']).map(port_dic)

#The mean fare for pclass = 3 is around 13
df_test['Fare'] = df_test['Fare'].fillna(13)
df_test['Fare'] = df_test['Fare'].astype('int64')

df_test['Age'] = df_test['Age'].replace('nan', np.nan)
#Fill missing ages with age that falls within one standard deviation of mean
df_test['Age'] = df_test['Age'].fillna(rand_age)
test_pss_id = df_test['PassengerId']
df_test = df_test.drop(columns = ['Name', 'Ticket', 'Cabin', 'Sex', 'PassengerId', 'Embarked'])

In [4]:
#Set up sigmoid function
def sigmoid(x):
    '''Returns sigmoid function applied to each element'''
    
    return (1/(1 + np.exp(-x)))

In [5]:
#Split data set into train and cross validation
train = df[:int(len(df) *.8)]
cv = df[int(len(df)*.8):]

In [6]:
#Create y values with survival data, drop from dataframe
y_train = np.array(train['Survived'])
y_train = np.reshape(y_train, (len(y_train),1))

y_cv = np.array(cv['Survived'])
y_cv = np.reshape(y_cv, (len(y_cv), 1))

train = train.drop(columns = 'Survived')
cv = cv.drop(columns = 'Survived')

In [61]:
#Create features for training, cross-validation, and testing
#poly = PolynomialFeatures(8, include_bias = False)

X_train = train.values
intercept = np.zeros((len(X_train),1))
X_train = np.hstack((intercept, X_train))
#X_train = poly.fit_transform(X_train)
(m_t,n_t) = X_train.shape
theta_train = np.zeros((n_t,1))

X_cv = cv.values
intercept = np.zeros((len(X_cv),1))
X_cv = np.hstack((intercept, X_cv))
(m_cv,n_cv) = X_cv.shape
theta = np.zeros((n_cv,1))

X_test = df_test.values
intercept = np.zeros((len(X_test),1))
X_test = np.hstack((intercept, X_test))
(m_test,n_test) = X_test.shape
theta = np.zeros((n_test,1))

lambd = 1

In [62]:
def costFunction(theta, X, y, lambd):
    '''Returns the regularized cost function for logisitc regression'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    J = (-1/m) * np.sum((y.T @ np.log(h)) + ((1-y).T @ np.log(1-h))) + ((1/(2 * lambd)) * sum(np.square(theta)))
    
    return J

In [63]:
costFunction(theta_train, X_train, y_train, lambd)

array([0.69314718])

In [64]:
def gradientDescent(theta, X, y, lambd):
    '''Returns the gradient of the logistic regression cost function'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    grad = (1/m) * np.sum(X.T @ (h - y)) + ((lambd/m) * theta)
    return grad

In [65]:
gradientDescent(theta_train, X_train, y_train, lambd)

array([[1.99982444],
       [1.99982444],
       [1.99982444],
       [1.99982444],
       [1.99982444],
       [1.99982444],
       [1.99982444],
       [1.99982444]])

In [66]:
'''Minimizes our cost function with respect to theta'''

output = opt.fmin_tnc(func = costFunction, x0 = theta_train.flatten(), fprime = gradientDescent, 
                      args = (X_train, y_train.flatten(), lambd)) 
theta_opt = output[0]
print(theta_opt)

[ 0.         -0.00115473 -0.00115473 -0.00115473 -0.00115473 -0.00115473
 -0.00115473 -0.00115473]


In [67]:
'''Accuracy on training model'''

pred_train = [sigmoid(X_train @ theta_opt) >= .5]
np.mean(pred_train == y_train)

0.6095505617977528

In [68]:
'''Accuracy on cross-validation'''

pred_cv = [sigmoid(X_cv @theta_opt) >= .5]
np.mean(pred_cv == y_cv)

0.6424581005586593

In [38]:
'''Make csv file with predictions for test data'''

pred_test = [sigmoid(X_test @ theta_opt) >= .5]
pred_test = pd.Series(pred_test)
pred_test = pd.Series(np.where(pred_test.all() == False, 0, 1))
predictions = pd.DataFrame({'PassengerId': test_pss_id, 'Survived': pred_test})
predictions.to_csv('data/predictions_model_1', index = False)