In [19]:
import pandas as pd
import os
import numpy as np
from numpy.linalg import inv

In [20]:
titanic_csv = "https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv"
titanic_df = pd.read_csv(titanic_csv)
features = ['pclass','survived','sex','age']

titanic_df = titanic_df.loc[:,features]
display(titanic_df)
titanic_df.loc[:,'pclass']=titanic_df['pclass'].fillna(titanic_df['pclass'].mode()).astype(int)
titanic_df.loc[:,'age']=titanic_df['age'].fillna(titanic_df['age'].median())
titanic_df.loc[:,'age']=(titanic_df['age']/10).astype(str).str[0].astype(int)*10
titanic_df["sex"] = titanic_df["sex"].map(dict(male=1, female=0))
titanic_df.head()

Unnamed: 0,pclass,survived,sex,age
0,1,1,female,29.0000
1,1,1,male,0.9167
2,1,0,female,2.0000
3,1,0,male,30.0000
4,1,0,female,25.0000
...,...,...,...,...
1304,3,0,female,14.5000
1305,3,0,female,
1306,3,0,male,26.5000
1307,3,0,male,27.0000


Unnamed: 0,pclass,survived,sex,age
0,1,1,0,20
1,1,1,1,0
2,1,0,0,0
3,1,0,1,30
4,1,0,0,20


In [21]:
# computes priors
def compute_priors(df, response):
    y = df[response]
    priors = dict(y.value_counts(normalize=True))
    return priors

priors = compute_priors(titanic_df, "survived")
priors

{0: 0.6180290297937356, 1: 0.3819709702062643}

In [22]:
# computes the mu vector for each category
def compute_mu_vectors(df, response):
    y = df[response]
    mu_vectors = {}
    for category_k in y.unique():
        df_k = df[y == category_k] # df with y = category_k
        X_k = df_k.drop(response, axis=1)
        mu_vectors[category_k] = dict(X_k.mean())
    return mu_vectors

mean_dict = compute_mu_vectors(titanic_df, "survived")
mean_dict

{1: {'pclass': 1.962, 'sex': 0.322, 'age': 23.86},
 0: {'pclass': 2.50061804697157,
  'sex': 0.8430160692212608,
  'age': 24.709517923362174}}

In [23]:
# computes covariance matrix
def compute_inv_sigma(df, response):
    X = df.drop(response, axis=1)
    return inv(X.cov())
    
compute_inv_sigma(titanic_df, "survived")

array([[ 1.709314  , -0.43131994,  0.04162978],
       [-0.43131994,  4.47844087, -0.01839694],
       [ 0.04162978, -0.01839694,  0.00660763]])

In [24]:
def classify_new_observation(df, response, new_x):
    y = df[response]
    
    priors = compute_priors(df, response)
    mu_vectors = compute_mu_vectors(df, response)
    inv_sigma = compute_inv_sigma(df, response)
    
    prob_dict = {}
    for category_k in y.unique():
        mu_k = pd.Series(mu_vectors[category_k]).to_numpy()
        first_term = new_x.transpose().dot(inv_sigma).dot(mu_k)
        second_term = .5 * mu_k.transpose().dot(inv_sigma).dot(mu_k)
        third_term = np.log(priors[category_k])
        prob_k = first_term - second_term + third_term
        
        prob_dict[category_k] = prob_k
        
    best_class, max_prob = next(iter(prob_dict.items()))
    for class_k, prob_k in prob_dict.items():
        if max_prob < prob_k:
            max_prob = prob_k
            best_class = class_k
      
    return best_class
        
X = titanic_df.drop("survived", axis=1)
new_x = X.iloc[0, :]
new_x = new_x.to_numpy()

class_k = classify_new_observation(titanic_df, "survived", new_x)
class_k
    
        



1

In [25]:
# returns a dictionary of priors (one key, value pair for each category)
def compute_priors(df, response):
    y = df[response]
    priors_dict = {}
    priors = dict(y.value_counts(normalize=True))
    return priors

# returns a dictionary of mu vectors (one key, value pair for each category)
def compute_mu_vectors(df, response):
    y = df[response]
    mu_vectors = {}
    for category_k in y.unique():
        df_k = df[y == category_k] # df with y = category_k
        X_k = df_k.drop(response, axis=1)
        mu_vectors[category_k] = dict(X_k.mean())
    return mu_vectors

# returns the inverse of the covariance matrix
def compute_inv_sigma(df, response):
    X = df.drop(response, axis=1)
    return inv(X.cov())

# returns the classification of a single obs
def classify_obs(x_i, y, mu_vectors, priors, inv_sigma):
        prob_dict = {}
        for category_k in y.unique():
            mu_k = pd.Series(mu_vectors[category_k]).to_numpy()
            first_term = x_i.transpose().dot(inv_sigma).dot(mu_k)
            second_term = .5 * mu_k.transpose().dot(inv_sigma).dot(mu_k)
            third_term = np.log(priors[category_k])
            prob_k = first_term - second_term + third_term
            prob_dict[category_k] = prob_k

        best_class, max_prob = next(iter(prob_dict.items()))
        for class_k, prob_k in prob_dict.items():
            if max_prob < prob_k:
                max_prob = prob_k
                best_class = class_k
        return best_class

class LDA:
    
    def fit(self, X_train, y_train):
        df_train = X_train.copy()
        response = y_train.name
        df_train[response] = y_train
        self.y = df_train[response]
        self.priors = compute_priors(df_train, response)
        self.mu_vectors = compute_mu_vectors(df_train, response)
        self.inv_sigma = compute_inv_sigma(df_train, response)
        
    def predict(self, df_test):
        y_pred = {}
        for i in range(len(df_test)):
            x_i = df_test.loc[i, :].to_numpy()
            y_pred[i] = classify_obs(x_i, self.y, self.mu_vectors, self.priors, self.inv_sigma)
        return pd.Series(y_pred)
    

In [26]:
y_train = titanic_df["survived"]
model = LDA()

X_train = titanic_df.drop("survived", axis=1)
y_train = titanic_df["survived"]

model.fit(X_train, y_train)
y_pred = model.predict(X_train)

In [27]:
(y_train == y_pred).mean()

0.7838044308632544

In [28]:
df_train = pd.read_csv("Training_data.csv").drop("Unnamed: 0", axis=1)
df_train.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,Walt Disney's CINDERELLA takes a story everybo...,1
1,7944_9.txt,"Have you ever, or do you have, a pet who's bee...",1
2,11725_10.txt,"I suck at gratuitous Boob references, so i'm j...",1
3,1587_10.txt,"Does anyone know, where I can see or download ...",1
4,10297_8.txt,Well not actually. This movie is very entertai...,1


In [29]:
df_test = pd.read_csv("Test_data.csv").drop("Unnamed: 0", axis=1)
df_test.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,"""Rush in Rio"" is, no doubt, one of the most ex...",1
1,8705_10.txt,I have seen a number of horror movies to know ...,1
2,11725_10.txt,I'm a fan of B grade 80s films in which the he...,1
3,9859_8.txt,"I think that Pierre Léaud, or his character, t...",1
4,12409_10.txt,This picture doesn't have any big explosions o...,1


In [34]:
os.chdir("/../../datasets/aclImdb/train")

In [35]:
os.listdir()

['pos',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt',
 'neg',
 'labeledBow.feat',
 'unsupBow.feat',
 'unsup']

In [36]:
f_labels = open("labeledBow.feat",'r')
labels_dirty = f_labels.read().split(" ")
labels = []
for i in labels_dirty:
    labels.append(i.replace("\n",''))
(f_labels).close()

In [37]:
labels

['9',
 '0:9',
 '1:1',
 '2:4',
 '3:4',
 '4:6',
 '5:4',
 '6:2',
 '7:2',
 '8:4',
 '10:4',
 '12:2',
 '26:1',
 '27:1',
 '28:1',
 '29:2',
 '32:1',
 '41:1',
 '45:1',
 '47:1',
 '50:1',
 '54:2',
 '57:1',
 '59:1',
 '63:2',
 '64:1',
 '66:1',
 '68:2',
 '70:1',
 '72:1',
 '78:1',
 '100:1',
 '106:1',
 '116:1',
 '122:1',
 '125:1',
 '136:1',
 '140:1',
 '142:1',
 '150:1',
 '167:1',
 '183:1',
 '201:1',
 '207:1',
 '208:1',
 '213:1',
 '217:1',
 '230:1',
 '255:1',
 '321:5',
 '343:1',
 '357:1',
 '370:1',
 '390:2',
 '468:1',
 '514:1',
 '571:1',
 '619:1',
 '671:1',
 '766:1',
 '877:1',
 '1057:1',
 '1179:1',
 '1192:1',
 '1402:2',
 '1416:1',
 '1477:2',
 '1940:1',
 '1941:1',
 '2096:1',
 '2243:1',
 '2285:1',
 '2379:1',
 '2934:1',
 '2938:1',
 '3520:1',
 '3647:1',
 '4938:1',
 '5138:4',
 '5715:1',
 '5726:1',
 '5731:1',
 '5812:1',
 '8319:1',
 '8567:1',
 '10480:1',
 '14239:1',
 '20604:1',
 '22409:4',
 '24551:1',
 '47304:17',
 '0:7',
 '1:4',
 '2:2',
 '3:2',
 '5:4',
 '6:1',
 '8:2',
 '9:2',
 '14:1',
 '16:1',
 '18:1',
 '20: