In [152]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

In [159]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [160]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [161]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [162]:
'''Create series with ages within one std of training age data'''
rand_age = pd.Series(np.random.randint(df['Age'].mean() - df['Age'].std(), df['Age'].mean() + df['Age'].std(),
                            size = 1000))

In [163]:
def preprocessing(df, rand_age):
    
    '''Processes the data to create new features and drop unneeded features'''
    
    #Create binary gender columns
    df['Gender'] = np.where(df['Sex'] == 'female', 1, 0)

    #Most passangers embark from Southampton, so we fill missing values with that
    port_dic = {'S': 0, 'C':1, 'Q':2, np.NaN:0}
    df['Port'] = pd.Series(df['Embarked']).map(port_dic)


    #Fill missing ages with age that falls within one standard deviation of mean
    df['Age'] = df['Age'].replace('nan', np.nan)
    df['Age'] = df['Age'].fillna(rand_age)

    #Create two new columns
    df['Family'] = df['Parch'] + df['SibSp']
    df['SocialStatus'] = df['Age'] * df['Pclass']

    #Drop columns that aren't useful
    try:
        df = df.drop(columns = ['Name', 'Ticket', 'Cabin', 'Sex', 'PassengerId', 'Embarked'])
    except:
        df = df
        
    return df

In [164]:
df = preprocessing(df, rand_age)

In [165]:
df_test = preprocessing(df_test, rand_age)

In [166]:
#Set up sigmoid function
def sigmoid(x):
    '''Returns sigmoid function applied to each element'''
    
    return (1/(1 + np.exp(-x)))

In [167]:
#Split data set into train and cross validation
train = df[:int(len(df) *.8)]
cv = df[int(len(df)*.8):]

In [168]:
#Create y values with survival data, drop from dataframe
y_train = np.array(train['Survived'])
y_train = np.reshape(y_train, (len(y_train),1))

y_cv = np.array(cv['Survived'])
y_cv = np.reshape(y_cv, (len(y_cv), 1))

train = train.drop(columns = 'Survived')
cv = cv.drop(columns = 'Survived')

In [169]:
display(train.head())
print(train.info())

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Gender,Port,Family,SocialStatus
0,3,22.0,1,0,7.25,0,0,1,66.0
1,1,38.0,1,0,71.2833,1,1,1,38.0
2,3,26.0,0,0,7.925,1,0,0,78.0
3,1,35.0,1,0,53.1,1,0,1,35.0
4,3,35.0,0,0,8.05,0,0,0,105.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 9 columns):
Pclass          712 non-null int64
Age             712 non-null float64
SibSp           712 non-null int64
Parch           712 non-null int64
Fare            712 non-null float64
Gender          712 non-null int64
Port            712 non-null int64
Family          712 non-null int64
SocialStatus    712 non-null float64
dtypes: float64(3), int64(6)
memory usage: 50.1 KB
None


In [170]:
#Create features for training, cross-validation, and testing
#poly = PolynomialFeatures(10, include_bias = False)

X_train = train.values
intercept = np.zeros((len(X_train),1))
X_train = np.hstack((intercept, X_train))
#X_train = preprocessing.scale(X_train)
#X_train = poly.fit_transform(X_train)
(m_t,n_t) = X_train.shape
theta_train = np.zeros((n_t,1))

X_cv = cv.values
intercept = np.zeros((len(X_cv),1))
X_cv = np.hstack((intercept, X_cv))

X_test = df_test.values
intercept = np.zeros((len(X_test),1))
X_test = np.hstack((intercept, X_test))

lambd = 1

In [171]:
X_train.shape

(712, 10)

In [172]:
np.array(X_train)

array([[ 0.,  3., 22., ...,  0.,  1., 66.],
       [ 0.,  1., 38., ...,  1.,  1., 38.],
       [ 0.,  3., 26., ...,  0.,  0., 78.],
       ...,
       [ 0.,  3., 31., ...,  1.,  2., 93.],
       [ 0.,  1., 24., ...,  1.,  0., 24.],
       [ 0.,  1., 24., ...,  0.,  0., 24.]])

In [173]:
def costFunction(theta, X, y, lambd):
    '''Returns the regularized cost function for logisitc regression'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    J = (-1/m) * np.sum((y.T @ np.log(h)) + ((1-y).T @ np.log(1-h))) + ((1/(2 * lambd)) * sum(np.square(theta)))
    
    return J

In [174]:
costFunction(theta_train, X_train, y_train, lambd)

array([0.69314718])

In [175]:
def gradientDescent(theta, X, y, lambd):
    '''Returns the gradient of the logistic regression cost function'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    grad = (1/m) * (X.T @ (h-y)) + ((lambd/m) * theta)
    return grad

In [176]:
gradientDescent(theta_train, X_train, y_train, lambd)

array([[ 0.00000000e+00],
       [ 3.81320225e-01],
       [ 3.77159410e+00],
       [ 7.58426966e-02],
       [ 1.82584270e-02],
       [-1.96971615e+00],
       [-8.70786517e-02],
       [ 2.80898876e-03],
       [ 9.41011236e-02],
       [ 1.19862500e+01]])

In [177]:
'''Minimizes our cost function with respect to theta'''

output = opt.fmin_tnc(func = costFunction, x0 = theta_train.flatten(), fprime = gradientDescent, 
                      args = (X_train, y_train.flatten(), lambd)) 
theta_opt = output[0]
print(theta_opt)

[ 0.         -0.01920447 -0.00071469 -0.03439584  0.02423532  0.00660136
  0.23207239  0.0611494  -0.00944814 -0.01062241]


In [178]:
'''Accuracy on training model'''

pred_train = [sigmoid(X_train @ theta_opt) >= .5]
np.mean(pred_train == y_train)

0.5692384168665573

In [179]:
'''Accuracy on cross-validation'''

pred_cv = [sigmoid(X_cv @theta_opt) >= .5]
np.mean(pred_cv == y_cv)

0.5867482288318092

In [182]:
'''Make csv file with predictions for test data'''

pred_test = [sigmoid(X_test @ theta_opt) >= .5]
pred_test = pd.Series(pred_test)
pred_test = pd.Series(np.where(pred_test.all() == False, 0, 1))
predictions = pd.DataFrame({'PassengerId': test_pss_id, 'Survived': pred_test})
#predictions.to_csv('data/predictions_model_2', index = False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [181]:
predictions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0
