In [57]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

In [58]:
# Load the Titanic passenger data and keep only the columns used by the model.
titanic_csv = "https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv"
titanic_df = pd.read_csv(titanic_csv)
features = ['pclass','survived','sex','age']

# .copy() gives an independent frame so the column assignments below never
# write into (or warn about) a slice of the frame returned by read_csv.
titanic_df = titanic_df.loc[:,features].copy()
display(titanic_df)

# Series.mode() returns a Series (several values can tie). Passing that Series
# to fillna aligns it on the index, so only row 0 could ever be filled; take
# the first modal value so every missing row receives the same scalar.
titanic_df['pclass'] = titanic_df['pclass'].fillna(titanic_df['pclass'].mode().iloc[0]).astype(int)
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].median())
# Bin age into decades (29 -> 20, 0.9167 -> 0) with floor division. Reading the
# first character of the value as a string gives the wrong decade for ages of
# 100 or more and for tiny values printed in scientific notation.
titanic_df['age'] = (titanic_df['age'] // 10 * 10).astype(int)
# Encode sex numerically: male -> 1, female -> 0.
titanic_df["sex"] = titanic_df["sex"].map(dict(male=1, female=0))
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age
0,1,1,female,29.0000
1,1,1,male,0.9167
2,1,0,female,2.0000
3,1,0,male,30.0000
4,1,0,female,25.0000
...,...,...,...,...
1304,3,0,female,14.5000
1305,3,0,female,
1306,3,0,male,26.5000
1307,3,0,male,27.0000


Unnamed: 0,pclass,survived,sex,age
0,1,1,0,20
1,1,1,1,0
2,1,0,0,0
3,1,0,1,30
4,1,0,0,20


In [59]:
# Estimate the class priors from the response column.
def compute_priors(df, response):
    """Return the relative frequency of every class in ``df[response]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the response column.
    response : str
        Name of the response (label) column.

    Returns
    -------
    dict
        Maps each distinct label to the fraction of rows carrying it, as
        produced by ``value_counts(normalize=True)``.
    """
    class_shares = df[response].value_counts(normalize=True)
    return dict(class_shares)

# Class priors for the cleaned Titanic frame; the bare name on the last line
# makes the notebook display the resulting dict.
priors = compute_priors(titanic_df, "survived")
priors

{0: 0.6180290297937356, 1: 0.3819709702062643}

In [60]:
# Per-class mean of every predictor column.
def compute_mu_vectors(df, response):
    """Return the mean of each predictor column, computed separately per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictors plus the response column.
    response : str
        Name of the response (label) column; it is excluded from the means.

    Returns
    -------
    dict
        ``{label: {column: mean}}`` with one entry per distinct label, in the
        order the labels first appear in ``df[response]``.
    """
    labels = df[response]
    predictors = df.drop(columns=response)
    return {
        label: dict(predictors[labels == label].mean())
        for label in labels.unique()
    }

# Per-class predictor means for the cleaned Titanic frame, keyed by the
# "survived" label; the bare name displays the nested dict.
mean_dict = compute_mu_vectors(titanic_df, "survived")
mean_dict

{1: {'pclass': 1.962, 'sex': 0.322, 'age': 23.86},
 0: {'pclass': 2.50061804697157,
  'sex': 0.8430160692212608,
  'age': 24.709517923362174}}

In [61]:
# Inverse of the predictors' covariance matrix.
def compute_inv_sigma(df, response):
    """Return the inverse of the covariance matrix of the predictor columns.

    The covariance is taken over all rows of ``df`` after removing the
    response column; rows are not split by class.

    NOTE(review): textbook LDA uses a pooled within-class covariance rather
    than the covariance of the whole sample -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictors plus the response column.
    response : str
        Name of the response column to exclude.

    Returns
    -------
    The result of ``numpy.linalg.inv`` applied to ``DataFrame.cov()``; the
    rows and columns follow the order of the predictor columns.
    """
    predictors = df.drop(columns=response)
    sigma = predictors.cov()
    return inv(sigma)
    
# Display the inverse covariance matrix of the predictor columns (every column
# of titanic_df except "survived").
compute_inv_sigma(titanic_df, "survived")

array([[ 1.709314  , -0.43131994,  0.04162978],
       [-0.43131994,  4.47844087, -0.01839694],
       [ 0.04162978, -0.01839694,  0.00660763]])

In [62]:
def classify_new_observation(df, response, new_x):
    """Classify one observation with a linear discriminant estimated from ``df``.

    The priors, class means and inverse covariance are recomputed from ``df``
    on every call. For each class ``k`` the score is

        new_x' * inv_sigma * mu_k  -  0.5 * mu_k' * inv_sigma * mu_k  +  log(prior_k)

    and the class with the largest score is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame holding the predictors plus the response column.
    response : str
        Name of the response column.
    new_x : numpy.ndarray
        Predictor values of the observation. They are matched to the class
        means by position, so they must follow the predictor column order
        of ``df``.

    Returns
    -------
    The label with the highest score; on a tie, the label that appears first
    in ``df[response]`` wins.
    """
    priors = compute_priors(df, response)
    mu_vectors = compute_mu_vectors(df, response)
    inv_sigma = compute_inv_sigma(df, response)

    scores = {}
    for label in df[response].unique():
        mu = pd.Series(mu_vectors[label]).to_numpy()
        linear_part = new_x.transpose().dot(inv_sigma).dot(mu)
        quadratic_part = .5 * mu.transpose().dot(inv_sigma).dot(mu)
        scores[label] = linear_part - quadratic_part + np.log(priors[label])

    # Seed the running maximum with the first class, then let a later class
    # take over only when its score is strictly larger.
    remaining = iter(scores.items())
    best_label, best_score = next(remaining)
    for label, score in remaining:
        if score > best_score:
            best_label, best_score = label, score

    return best_label
        
# Smoke test: classify the first passenger from their own predictor values.
X = titanic_df.drop(columns="survived")
new_x = X.iloc[0].to_numpy()

class_k = classify_new_observation(titanic_df, "survived", new_x)
class_k
    
        



1

In [82]:
# returns a dictionary of priors (one key, value pair for each category)
def compute_priors(df, response):
    """Return the relative frequency of every class in ``df[response]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the response column.
    response : str
        Name of the response (label) column.

    Returns
    -------
    dict
        Maps each distinct label to the fraction of rows carrying it.
    """
    y = df[response]
    # normalize=True turns the per-class counts into proportions.
    return dict(y.value_counts(normalize=True))

# returns a dictionary of mu vectors (one key, value pair for each category)
def compute_mu_vectors(df, response):
    """Return the per-class mean of every predictor column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictors plus the response column.
    response : str
        Name of the response column; it is dropped before averaging.

    Returns
    -------
    dict
        ``{label: {column: mean}}``, one entry per distinct label in order of
        first appearance.
    """
    labels = df[response]
    class_means = {}
    for label in labels.unique():
        in_class = labels == label  # boolean mask selecting this class's rows
        class_rows = df.loc[in_class].drop(columns=[response])
        class_means[label] = dict(class_rows.mean())
    return class_means

# returns the inverse of the covariance matrix
def compute_inv_sigma(df, response):
    """Invert the covariance matrix of the predictor columns.

    The covariance is computed over all rows of ``df`` once the response
    column has been removed.

    NOTE(review): this is the whole-sample covariance, not a pooled
    within-class estimate -- confirm that is the intended Sigma.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictors plus the response column.
    response : str
        Name of the response column to exclude.

    Returns
    -------
    The result of ``numpy.linalg.inv`` applied to ``DataFrame.cov()``.
    """
    predictor_cov = df.drop(columns=[response]).cov()
    return inv(predictor_cov)

# returns the classification of a single obs
def classify_obs(x_i, y, mu_vectors, priors, inv_sigma):
    """Return the class whose linear discriminant score is largest for ``x_i``.

    For each class ``k`` the score is

        x_i' * inv_sigma * mu_k  -  0.5 * mu_k' * inv_sigma * mu_k  +  log(prior_k)

    Parameters
    ----------
    x_i : numpy.ndarray
        Predictor values of one observation, matched to the class means by
        position.
    y : pandas.Series
        Label column; only its distinct values (``y.unique()``) are used, to
        enumerate the classes.
    mu_vectors : dict
        ``{label: {column: mean}}`` per-class predictor means.
    priors : dict
        ``{label: prior probability}``.
    inv_sigma : array-like
        Inverse covariance matrix of the predictors.

    Returns
    -------
    The label with the highest score; on a tie, the label that appears first
    in ``y`` wins.
    """
    scores = {}
    for label in y.unique():
        mu = pd.Series(mu_vectors[label]).to_numpy()
        linear_part = x_i.transpose().dot(inv_sigma).dot(mu)
        quadratic_part = .5 * mu.transpose().dot(inv_sigma).dot(mu)
        scores[label] = linear_part - quadratic_part + np.log(priors[label])

    # Start from the first class and replace it only on a strictly larger
    # score, so the earliest class is kept when scores tie.
    candidates = iter(scores.items())
    winner, top_score = next(candidates)
    for label, score in candidates:
        if score > top_score:
            winner, top_score = label, score
    return winner

class LDA:
    """Linear discriminant classifier with a fit/predict interface.

    ``fit`` estimates the class priors, per-class means and inverse covariance
    with the module-level helpers; ``predict`` scores each row with
    ``classify_obs``.

    Attributes set by ``fit``:
        y           -- the training labels (used to enumerate the classes)
        priors      -- {label: prior probability}
        mu_vectors  -- {label: {column: mean}}
        inv_sigma   -- inverse covariance matrix of the predictors
    """

    def fit(self, X_train, y_train):
        """Estimate the model parameters from training data.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Predictor columns. It is copied, so the caller's frame is left
            unchanged.
        y_train : pandas.Series
            Labels, aligned to ``X_train`` by index.
        """
        df_train = X_train.copy()
        # An unnamed Series would produce a `None` column label, which
        # DataFrame.drop cannot target; use a placeholder name instead.
        response = y_train.name if y_train.name is not None else "__response__"
        df_train[response] = y_train
        self.y = df_train[response]
        self.priors = compute_priors(df_train, response)
        self.mu_vectors = compute_mu_vectors(df_train, response)
        self.inv_sigma = compute_inv_sigma(df_train, response)

    def predict(self, df_test):
        """Predict a label for every row of ``df_test``.

        Parameters
        ----------
        df_test : pandas.DataFrame
            Predictor columns in the same order as those passed to ``fit``
            (values are matched to the class means by position).

        Returns
        -------
        pandas.Series
            Predicted labels, indexed like ``df_test``.
        """
        # Walk the rows positionally: label-based `.loc[i]` with i in
        # range(len(df_test)) raises or picks the wrong row whenever the index
        # is not the default 0..n-1 (e.g. after a train/test split).
        predictions = [
            classify_obs(x_i, self.y, self.mu_vectors, self.priors, self.inv_sigma)
            for x_i in df_test.to_numpy()
        ]
        # Keep the caller's index so predictions line up with the true labels.
        return pd.Series(predictions, index=df_test.index)
    

In [83]:
# Split the cleaned frame into predictors and the response column.
X_train = titanic_df.drop(columns="survived")
y_train = titanic_df["survived"]

# Fit on the full data set, then predict those same rows back.
model = LDA()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [84]:
# Share of rows where the prediction equals the true label. The model was fit
# on these same rows, so this is training accuracy, not held-out accuracy.
(y_train == y_pred).mean()

0.7838044308632544