In [None]:
import pandas as pd
import numpy as np
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)
print(data.columns) #dataframe from pd so cant use keys
print(data.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0

In [None]:
print(data.isnull()) #shows False if not null, and True if null i.e. missing
print(data.isnull().sum()) #sums all the True across columns: Cabin most null, then age, embarked (but only 2)

     PassengerId  Survived  Pclass   Name    Sex    Age  SibSp  Parch  Ticket  \
0          False     False   False  False  False  False  False  False   False   
1          False     False   False  False  False  False  False  False   False   
2          False     False   False  False  False  False  False  False   False   
3          False     False   False  False  False  False  False  False   False   
4          False     False   False  False  False  False  False  False   False   
..           ...       ...     ...    ...    ...    ...    ...    ...     ...   
886        False     False   False  False  False  False  False  False   False   
887        False     False   False  False  False  False  False  False   False   
888        False     False   False  False  False   True  False  False   False   
889        False     False   False  False  False  False  False  False   False   
890        False     False   False  False  False  False  False  False   False   

      Fare  Cabin  Embarked

In [None]:
#clean data: remove cabin too sparse, age replace with median-wont skew the mean preserving distrubtion w.o bias and is robust to outliers
data = data.drop ("Cabin", axis =1) #axis=1 says column, and "Cabin" is its name
print(data.columns) #Cabin has been removed from data so getting errors if i run this, should've renamed it to data_noCabin

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')


In [None]:
data["Age"] = data["Age"].fillna(data["Age"].median()) #fillna is replace NaN with () in this case the median of the column data[Age] and replaces that

In [None]:
#Embarked only has 2 missing values so let's pick the most common location
data["Embarked"] = data["Embarked"].fillna(data["Embarked"].mode()[0]) #instead of just picking S we use mode as I don;t know [0] after mode just selects first if multiple

In [None]:
data.dtypes #figure out what strs needed to be encoded numerically: name? sex? ticket? embarked?
#sex and embarked are easy to encode, name doesnt matter so can be dropped, ticket is one I'm unsure of
print(data['Sex'].unique()) #all sexes were NaN with my mistake luckily recoverable but once again should be using new names!!!!!
data['Sex'] = data['Sex'].str.strip().str.lower().map({'male': 0, 'female': 1}) #strip gets rid of spaces if any and str.lower makes all lower case so fits with my map
#Error if run again as now they're all numbers so cant do the conversion, really have to start renaming as a habit!!!!!!!!!

['male' 'female']


In [None]:
print(data['Sex'][0:5])

0    0
1    1
2    1
3    1
4    0
Name: Sex, dtype: int64


In [None]:
embarked_dummies = pd.get_dummies(data["Embarked"], prefix="Embarked") #selects embarked, contains S,Q,C
#pd.get_dummies() converts categorical values into binary for every unique in Embarked and creates new columns
#prefix=Embarked gives columns a name starting with Embarked then _S or Q or C
data = pd.concat([data, embarked_dummies], axis=1) #concatenate/merge new dataframes, axis=1 does it horizontally as we're adding a column
data.drop("Embarked", axis=1, inplace=True) #remove Embarked, it's a column. inplace=True means adjust the dataframe itself

In [None]:
#convert to numbers
data['Embarked_S'] = data['Embarked_S'].astype(int)
data['Embarked_Q'] = data['Embarked_Q'].astype(int)
data['Embarked_C'] = data['Embarked_C'].astype(int)
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare  Embarked_C  Embarked_Q  Embarked_S  
0         A/5 21171   7.2500           0           0           1  
1          PC 17599  71.2833           1           0           0  
2  STON/O2. 3101282   7.9250           0           0           1  
3            113803  53.1000

In [None]:
data=data.drop("Name", axis=1)
print(data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [None]:
print(data["Ticket"].nunique()) #681 unique tickets of 890

681


In [None]:
data_nt = data.drop("Ticket", axis = 1)
print(data_nt.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [None]:
#I want to separate target from features
x=data_nt.drop("Survived", axis = 1)
y = data_nt["Survived"]

#I want to set up training and test before
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 22) #test is 20% and set 1 random thats recurring

In [None]:
#Build Logistic Regression from scratch-ish
import numpy as np
def sigmoid(z):
    z = np.array(z, dtype=np.float64)
    return np.where(z >= 0, 1 / (1 + np.exp(-z)), np.exp(z) / (1 + np.exp(z)))

#This defines the sigmoid function using np.exp, this is also the log-partition function it is the inverse of the log-odds
#probability of a binary outcome live or die, say 30% and 70% output is 0-1 (lives on simplex) Probability was historically mean of distrubtion?
#odds= p(1)/p(0) output 0-infinte gives multiples
#log odds= log (odds) output is Real number (logit function)
#Therefore sigmoid is mapping of real number to simplex, and z is linear combination of features wTx+b=z

In [None]:
def initialize_weights(n_features): #this defines a function called i_n and takes in 1 arg(n_f) which is expceted to be an integer
  weights=np.zeros(n_features) #creates numpy array of features [0,0,..,0] based on int=n_features
  bias = 0
  return weights, bias #returns 2 values
  #represents weight coefficients we will learn

In [None]:
#Maximize likelihood:  train a model (logistic regression) to predict survival, need best parameters (w)
#MLE:“Given the parameters w, what’s the probability of observing true label
#MLE=P(y|x;w) generally MLE=yh^y * (1-yh)^1-y where y=1 or 0 if 1 MLE=y^h or 1-yh if samples independent, then take product of all passengers (this was just one)
#log MLE turns into addition: Sum of all ylogyh + (1-y)log(1-yh)
#now -logMLE=binary cross entropy loss, minimize negative is same as maximizing positive so just take -logMLE -Sigma(ylogyh)+(1-y)log(1-yh)
#when y=1 then loss is -logyh and y=0 then -log(1-yh)
#normalize BCE by doing: 1/n *Sigma ylogyh + (1-y)log(1-yh)

In [None]:
#We have a curved parameter manifold, each point is a logistic regression model=probability distrubtion
#Above manifold have your loss function defined by BCE which gives a scalar field
#Jacobian is first derivative gives gradient of loss function w/ steepest ascent, this is a covector field, 1-form. In coordinates partial derivatives of L/partial parameter co
#Hessian is 2nd derivative of loss fnx how the slope changes, 0,2 tensor, gives us curvature at each local point and defines a Riemannian metric (not 2form as not antisymmetric)
#NOTE: Minimizing BCE is equivalent to minimizing KL Divergence,  both of their 2nd derivatives give Fisher metric
#Also have a simplex, with points b/w by KL Divergence, strictly a different manifold than parameter manifold and the fisher metric is pulled back onto parameter

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
#THE FIX!

In [None]:
def BCE_loss(y_pred, y_true):
    eps = 1e-9
    y_pred = np.array(y_pred).astype(np.float64)  # force NumPy float array
    y_true = np.array(y_true).astype(np.float64)  # force NumPy float array

    # Use NumPy clip — not pandas — to avoid log(0) and future warnings, as well as infinite from logs
    y_pred = np.clip(y_pred, eps, 1 - eps)

    loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss


In [None]:
#Forward pass
def forward(x, weights, bias):
  z= np.dot(x, weights) + bias #wTx+b
  y_pred= sigmoid(z) #sigmoid(0) gives output of 0.5, as initial predictions
  return y_pred

In [None]:
print(x_train)

[[ 1.34783679 -1.57799795 -0.73561236 ...  2.10597115 -0.30610782
  -1.64570147]
 [ 1.42888881  0.81759809 -0.73561236 ... -0.47484031 -0.30610782
   0.60764362]
 [-1.54687826 -0.38019993  1.35941164 ...  2.10597115 -0.30610782
  -1.64570147]
 ...
 [ 1.42116957 -0.38019993 -0.73561236 ... -0.47484031 -0.30610782
   0.60764362]
 [-1.20337208  0.81759809  1.35941164 ... -0.47484031 -0.30610782
   0.60764362]
 [ 1.70292184  0.81759809  1.35941164 ... -0.47484031  3.26682279
  -1.64570147]]


In [None]:
n_features = x_train.shape[1] #num of features/columns
weights, bias = initialize_weights(n_features)
print(weights, bias)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 0


In [None]:
def compute_gradients(x_train, y_true, y_pred):
    m = x_train.shape[0]
    dw = (1 / m) * np.dot(x_train.T, (y_pred - y_true)) #dL/dw shows how each feature contributes to the error
    db = (1 / m) * np.sum(y_pred - y_true) #dL/db same for every output, how bias contributes to error
    return dw, db


In [None]:
#Update gradients
def update_parameters(weights, bias, dw, db, learning_rate):
    weights = weights - learning_rate * dw
    bias = bias - learning_rate * db
    return weights, bias


In [None]:
def train(x_train, y_train, weights, bias, learning_rate=0.005, num_epochs=5000):
    y_train = np.array(y_train).astype(np.float64)  # ensure clean format

    for i in range(num_epochs):
        y_pred = forward(x_train, weights, bias)
        loss = BCE_loss(y_pred, y_train)

        dw, db = compute_gradients(x_train, y_train, y_pred)
        weights, bias = update_parameters(weights, bias, dw, db, learning_rate)

        if i % 100 == 0:
            print(f"Epoch {i}, Loss: {loss:.6f}")

    return weights, bias


In [None]:
#Actually Run and Train:
weights, bias = train(x_train, y_train, weights, bias, learning_rate=0.01, num_epochs=1000)

Epoch 0, Loss: 0.693147
Epoch 100, Loss: 0.593690
Epoch 200, Loss: 0.541586
Epoch 300, Loss: 0.510830
Epoch 400, Loss: 0.491124
Epoch 500, Loss: 0.477765
Epoch 600, Loss: 0.468331
Epoch 700, Loss: 0.461459
Epoch 800, Loss: 0.456331
Epoch 900, Loss: 0.452426


In [None]:
# Evaluate the model
y_pred_test = forward(x_test, weights, bias)>= 0.5  # predicted labels on test set
accuracy = np.mean(y_pred_test == y_test)            # compare with true test labels
print("Test accuracy:", accuracy)

Test accuracy: 0.770949720670391
