In [327]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

In [328]:
df = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [329]:
test_passengerId = df_test['PassengerId']

In [330]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [331]:
rand_fem_age = rand_age_female = pd.Series(np.random.randint(df.groupby('Sex')['Age'].mean()['female'] - df.groupby('Sex')['Age'].std()['female'], 
                                            df.groupby('Sex')['Age'].mean()['female'] + df.groupby('Sex')['Age'].std()['female'],
                            size = 1000))

rand_male_age = pd.Series(np.random.randint(df.groupby('Sex')['Age'].mean()['male'] - df.groupby('Sex')['Age'].std()['male'], 
                                            df.groupby('Sex')['Age'].mean()['male'] + df.groupby('Sex')['Age'].std()['male'],
                            size = 1000))

In [332]:
def preprocess(df):
    
    '''Processes the data to create new features and drop unneeded features'''
    
    #Create binary gender columns
    df['Gender'] = np.where(df['Sex'] == 'female', 1, 0)

    #Most passangers embark from Southampton, so we fill missing values with that
    port_dic = {'S': .3, 'C':.5, 'Q':.4, np.NaN:.3}
    df['Port'] = pd.Series(df['Embarked']).map(port_dic)


    #Fill missing ages with age that falls within one standard deviation of mean
    df.loc[(df['Age'].isnull()) & (df['Sex'] == 'male'), 'Age'] = rand_male_age
    df.loc[(df['Age'].isnull()) & (df['Sex'] == 'female'), 'Age'] = rand_fem_age
    
    class_map ={1:6, 2:4, 3:2}
    df['Pclass'] = df['Pclass'].map(class_map)

    #Create three new columns
    df['Family'] = df['Parch'] + df['SibSp']
    
    titles = df['Name'].str.partition(',')[2].str.split()
    df['Title'] = titles.map(lambda x: x[0])
    #titles_mapping = {'Mr.': 0, 'Miss.': 1, 'Mrs.': 1, 'Master.': 2, 'Dr':2, 'Rev.': 2, 'Major.':3, 'Col.':3, 'Mlle':1,
                 #'Capt.':3, 'Lady.':4, 'Sir.':2, 'Jonkheer.':2, 'Ms':1, 'Don.':2, 'Mme.':1, 'the.':2, '__missing__':2}
    
    
    titles_mapping = {'Capt.': 1000, 'Rev.': 1000, 'Don.': 1000, 'Jonkheer.': 1000, 'Mr.': 50, 'Dr.': 1, 'Major.': 1, 'Master.': 1,
                      'Col.': 1, 'Miss.': 100, 'Mrs.': 120, 'Sir': 1001, 'Mlle.': 1001, 'Mme.': 1001, 'Ms.': 1001, 'Lady.': 1001, 'the': 1001} 
    
    df['Title'] = df['Title'].map(titles_mapping)
    df['Title'] = df['Title'].fillna(0)
    #Anyone less than the age of five gets a new title 
    df.loc[df['Age'] <= 10, 'Title'] = 125
    
    #df['SocialStatus'] = df['Age'] * df['Pclass']
    df['SocialStatus'] = (df['Gender']+10) * df['Title'] * df['Pclass']
    df['SexAgeMult'] = (df['Gender'] + 10) * df['Age']

    #Drop columns that aren't useful
    try:
        df = df.drop(columns = ['Name', 'Ticket', 'Cabin', 'Sex', 'PassengerId', 'Embarked', 'Port'])
    except:
        df = df
        
    return df

In [333]:
df = preprocess(df)

In [334]:
df_test = preprocess(df_test)

In [335]:
#Set up sigmoid function
def sigmoid(x):
    '''Returns sigmoid function applied to each element'''
    
    return (1/(1 + np.exp(-x)))

In [336]:
#Split data set into train and cross validation
train = df[:int(len(df) *.8)]
cv = df[int(len(df)*.8):]

In [337]:
#Create y values with survival data, drop from dataframe
y_train = np.array(train['Survived'])
y_train = np.reshape(y_train, (len(y_train),1))

y_cv = np.array(cv['Survived'])
y_cv = np.reshape(y_cv, (len(y_cv), 1))

train = train.drop(columns = 'Survived')
cv = cv.drop(columns = 'Survived')

In [338]:
display(train.head(10))

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Gender,Family,Title,SocialStatus,SexAgeMult
0,2,22.0,1,0,7.25,0,1,50.0,1000.0,220.0
1,6,38.0,1,0,71.2833,1,1,120.0,7920.0,418.0
2,2,26.0,0,0,7.925,1,0,100.0,2200.0,286.0
3,6,35.0,1,0,53.1,1,1,120.0,7920.0,385.0
4,2,35.0,0,0,8.05,0,0,50.0,1000.0,350.0
5,2,26.0,0,0,8.4583,0,0,50.0,1000.0,260.0
6,6,54.0,0,0,51.8625,0,0,50.0,3000.0,540.0
7,2,2.0,3,1,21.075,0,4,125.0,2500.0,20.0
8,2,27.0,0,2,11.1333,1,2,120.0,2640.0,297.0
9,4,14.0,1,0,30.0708,1,1,120.0,5280.0,154.0


In [339]:
#Create features for training, cross-validation, and testing
#poly = PolynomialFeatures(10, include_bias = False)

X_train = train.values
intercept = np.zeros((len(X_train),1))
X_train = np.hstack((intercept, X_train))
#X_train = preprocessing.scale(X_train)
#X_train = poly.fit_transform(X_train)
(m_t,n_t) = X_train.shape
theta_train = np.zeros((n_t,1))

X_cv = cv.values
intercept = np.zeros((len(X_cv),1))
X_cv = np.hstack((intercept, X_cv))

X_test = df_test.values
intercept = np.zeros((len(X_test),1))
X_test = np.hstack((intercept, X_test))

lambd = .00000000001

In [340]:
def costFunction(theta, X, y, lambd):
    '''Returns the regularized cost function for logisitc regression'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    J = (-1/m) * np.sum((y.T @ np.log(h)) + ((1-y).T @ np.log(1-h))) + ((1/(2 * lambd)) * sum(np.square(theta)))
    
    return J

In [341]:
costFunction(theta_train, X_train, y_train, lambd)

array([0.69314718])

In [342]:
def gradientDescent(theta, X, y, lambd):
    '''Returns the gradient of the logistic regression cost function'''
    m = len(y)
    h = sigmoid(X @ theta)
    theta[0] = 0
    grad = (1/m) * (X.T @ (h-y)) + ((lambd/m) * theta)
    return grad

In [343]:
gradientDescent(theta_train, X_train, y_train, lambd)

array([[ 0.00000000e+00],
       [ 1.13764045e-01],
       [ 3.91766152e+00],
       [ 7.58426966e-02],
       [ 1.82584270e-02],
       [-1.96971615e+00],
       [-8.70786517e-02],
       [ 9.41011236e-02],
       [ 3.27247191e-01],
       [-3.11873596e+02],
       [ 3.66123596e+01]])

In [344]:
'''Minimizes our cost function with respect to theta'''

output = opt.fmin_tnc(func = costFunction, x0 = theta_train.flatten(), fprime = gradientDescent, 
                      args = (X_train, y_train.flatten(), lambd)) 
theta_opt = output[0]
print(theta_opt)

[ 0.00000000e+00  8.23724485e-12 -2.21365213e-11 -2.04736469e-12
  3.62306940e-13  2.07683845e-10  5.11739755e-12 -1.68505775e-12
 -1.59021041e-10  4.40369009e-12 -6.07845847e-11]


In [351]:
'''Accuracy on training model'''

pred_train = [sigmoid(X_train @ theta_opt) >= .5]
np.mean(pred_train == y_train) * 100

57.63161217018053

In [350]:
'''Accuracy on cross-validation'''

pred_cv = [sigmoid(X_cv @ theta_opt) >= .5]
np.mean(pred_cv == y_cv) * 100

60.266533503948075

In [21]:
'''Make csv file with predictions for test data'''

pred_test = [sigmoid(X_test @ theta_opt) >= .5]
pred_test = pd.Series(pred_test)
pred_test = pd.Series(np.where(pred_test.all() == False, 0, 1))
predictions = pd.DataFrame({'PassengerId': test_passengerId, 'Survived': pred_test})
#predictions.to_csv('data/predictions_model_1stQuarter', index = False)

  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
predictions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0
