In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the training split, discard the columns that are not used as features,
# encode Sex as 0/1 and fill missing ages with the mean age of the column.
df = (
    pd.read_csv('train.csv')
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'])
    .assign(
        Sex=lambda frame: frame['Sex'].map({"male": 0, "female": 1}),
        Age=lambda frame: frame['Age'].fillna(frame['Age'].mean()),
    )
)
print(df.head(10))





   Survived  Pclass  Sex        Age  SibSp  Parch Embarked
0         0       3    0  22.000000      1      0        S
1         1       1    1  38.000000      1      0        C
2         1       3    1  26.000000      0      0        S
3         1       1    1  35.000000      1      0        S
4         0       3    0  35.000000      0      0        S
5         0       3    0  29.699118      0      0        Q
6         0       1    0  54.000000      0      0        S
7         0       3    0   2.000000      3      1        S
8         1       3    1  27.000000      0      2        S
9         1       2    1  14.000000      1      0        C


In [4]:
# One-hot encode Embarked: build the indicator columns, then swap them in for
# the original categorical column (indicators end up at the right of the frame).
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df.drop(columns=['Embarked']), embarked_dummies], axis=1)

In [5]:
# Show the first ten rows of the frame after encoding.
print(df.iloc[:10])

   Survived  Pclass  Sex        Age  SibSp  Parch  Embarked_C  Embarked_Q  \
0         0       3    0  22.000000      1      0       False       False   
1         1       1    1  38.000000      1      0        True       False   
2         1       3    1  26.000000      0      0       False       False   
3         1       1    1  35.000000      1      0       False       False   
4         0       3    0  35.000000      0      0       False       False   
5         0       3    0  29.699118      0      0       False        True   
6         0       1    0  54.000000      0      0       False       False   
7         0       3    0   2.000000      3      1       False       False   
8         1       3    1  27.000000      0      2       False       False   
9         1       2    1  14.000000      1      0        True       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  
5       False  
6        True  
7        True  
8       

$$\sigma(z) = \frac{1}{1 + e^{-z}}$$

$$\text{Cost}(p, y) = -[y \cdot \log(p) + (1-y) \cdot \log(1-p)]$$

In [6]:
def sigmoid(z):
  """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

  The naive formula calls exp(-z), which overflows (and emits a
  RuntimeWarning) for large negative z. Here exp is only applied to
  -|z|, and the algebraically equivalent form exp(z) / (1 + exp(z)) is
  used for negative inputs.

  Args:
    z: scalar or array of real values.

  Returns:
    Value(s) in [0, 1], element-wise for array input.
  """
  # The argument of exp is never positive, so it cannot overflow.
  decay = np.exp(-np.abs(z))
  # z >= 0: 1 / (1 + e^-z)        z < 0: e^z / (1 + e^z)
  numerator = np.where(z >= 0, 1.0, decay)
  return numerator / (1.0 + decay)

# Not called during training: gradient descent below uses the derivative of
# this loss directly instead of evaluating the loss itself.
def loss_fx(y_true, y_predict, eps=1e-15):
  """Mean binary cross-entropy between labels and predicted probabilities.

  Args:
    y_true: array of 0/1 labels.
    y_predict: array of predicted probabilities, same shape as y_true.
    eps: predictions are clipped to [eps, 1 - eps] so that a probability of
      exactly 0 or 1 does not produce log(0) (which would yield inf/nan).

  Returns:
    The average of -[y*log(p) + (1-y)*log(1-p)] over all samples.
  """
  y_predict = np.clip(y_predict, eps, 1 - eps)
  return -np.mean(y_true * np.log(y_predict) + (1 - y_true) * np.log(1 - y_predict))


In [7]:
def gradient_decent(X, y_true, epochs, l):
  """Fit logistic-regression weights with batch gradient descent.

  No intercept term is learned; the model is sigmoid(X @ weights).

  Args:
    X: 2-D array-like of shape (samples, features).
    y_true: array-like of 0/1 labels, one per row of X (a column vector is
      flattened).
    epochs: number of full-batch update steps.
    l: learning rate.

  Returns:
    1-D numpy array with one weight per feature (all zeros if epochs is 0).

  Raises:
    ValueError: if X is not 2-D, has no rows, or y_true does not have one
      label per row of X.
  """
  X = np.asarray(X, dtype=float)
  # Flatten so an (m, 1) label column cannot broadcast into an (m, m) residual.
  y_true = np.asarray(y_true, dtype=float).ravel()

  if X.ndim != 2:
    raise ValueError("X must be a 2-D array of shape (samples, features)")
  m, num_features = X.shape
  if m == 0:
    raise ValueError("X must contain at least one sample")
  if y_true.shape[0] != m:
    raise ValueError("y_true must contain exactly one label per row of X")

  weights = np.zeros(num_features)
  inv_m = 1 / m  # loop-invariant, computed once

  for _ in range(epochs):
    y_predict = sigmoid(np.dot(X, weights))
    # Gradient of the mean cross-entropy loss with respect to the weights.
    gradient = inv_m * np.dot(X.T, y_predict - y_true)
    weights -= l * gradient

  return weights


In [None]:
# Split the frame into the target column and the predictor columns.
y_true = df['Survived']
X = df.drop(columns=['Survived'])

# Standardise the predictors; the fitted scaler is reused later on the test set.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Train on the scaled features: 10000 epochs with a learning rate of 0.1.
weights = gradient_decent(X_scaled, y_true, epochs=10000, l=0.1)
print(weights)

[-0.91937278  1.32125266 -0.49805296 -0.30633908 -0.08628599 -0.19085539
 -0.17924618 -0.43963549]


In [None]:
# Prepare the test set with the same transformations used for training.
test_df = pd.read_csv('test.csv')

# Missing ages are filled with the mean age taken from the training features.
age_mean_train = X['Age'].mean()
test_df = test_df.assign(
    Age=test_df['Age'].fillna(age_mean_train),
    Sex=test_df['Sex'].map({"male": 0, "female": 1}),
)

embarked_dummies = pd.get_dummies(test_df['Embarked'], prefix='Embarked')
test_df = pd.concat([test_df, embarked_dummies], axis=1)

# Keep exactly the training columns, in training order; any column absent from
# the test frame is created and filled with 0.
train_columns = X.columns
X_test = test_df.reindex(columns=train_columns, fill_value=0)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Linear score -> probability -> hard 0/1 label (cut-off at 0.5).
z_test = X_test_scaled @ weights
y_pred_test_probs = sigmoid(z_test)
y_pred_test_classes = np.greater(y_pred_test_probs, 0.5).astype(int)

In [None]:
# NOTE(review): gender_submission.csv looks like the competition's sample
# submission rather than real ground-truth labels — confirm. If so, the number
# printed below is agreement with that baseline, not true test accuracy.
y_test_true = pd.read_csv('gender_submission.csv')

# Predictions are compared to the reference row by row, so check that both
# files list passengers in the same order (only when both carry PassengerId).
if 'PassengerId' in y_test_true.columns and 'PassengerId' in test_df.columns:
  if not np.array_equal(y_test_true['PassengerId'].to_numpy(),
                        test_df['PassengerId'].to_numpy()):
    raise ValueError("gender_submission.csv rows are not aligned with test.csv")

# accuracy_score is imported with the other imports at the top of the notebook.
accuracy = accuracy_score(y_test_true['Survived'], y_pred_test_classes)
print(accuracy)

0.8851674641148325
