In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

## Categorical Naive Bayes Classifier

In [2]:
data=pd.read_csv('https://raw.githubusercontent.com/huynhthanh98/ML/master/lab-04/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data=data.drop(["Cabin"],axis=1)
data=data.dropna()

In [4]:
# Prepare data
features = ['Pclass', 'Sex', 'Embarked']
df = data[features]
y = data.Survived
df.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [6]:
print(X_train.shape)
X_train.head()

(569, 3)


Unnamed: 0,Pclass,Sex,Embarked
472,2,female,S
432,2,female,S
666,2,male,S
30,1,male,C
291,1,female,C


In [7]:
class Categorical_NB:
    def __init__(self, alpha=1):
        pass
        self.alpha = alpha
        self._likelihood = {}
        self._classes = []
    def fit(self, X_train, y_train):
        if type(X_train) != np.ndarray:
            X_train = X_train.to_numpy()
        if type(y_train) != np.ndarray:
            y_train = y_train.to_numpy()

        self._classes = np.unique(y_train)
        self._prior = [np.sum(y_train == j)/len(y_train) for j in self._classes]
        self._compute_likelihood(X_train, y_train)
    def _compute_likelihood(self, X_train, y_train):
        pass
        if type(X_train) != np.ndarray:
            X_train = X_train.to_numpy()
        if type(y_train) != np.ndarray:
            y_train = y_train.to_numpy()
        
        for c in self._classes:
            tmp_list = []
            for col in range(X_train.shape[1]):
                tmp_set = {}
                for f in np.unique(X_train[:,col]):
                    tmp_set[f] = (np.sum(X_train[:,col][y_train == c]==f) + self.alpha) /\
                                 (np.sum(y_train==c) + len(self._classes)*self.alpha)
                tmp_list.append(tmp_set)
            self._likelihood[c] = tmp_list
        
    def P_Xtest_giv_y(self, X_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()

        X_test_proba = np.zeros_like(X_test)
        P_Xtest_giv_y = {}
        for c in self._classes:
            for j in range(X_test.shape[1]):
                for i in range(X_test.shape[0]):
                    k = X_test[i, j]
                    X_test_proba[i, j] = self._likelihood[c][j][k]
            P_Xtest_giv_y[c] = np.prod(X_test_proba,axis=1)
        return P_Xtest_giv_y

    def _predict(self, X_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
            
        P_X_giv_y = self.P_Xtest_giv_y(X_test)

        Postorior0 = self._prior[0]*P_X_giv_y[0]
        Postorior1 = self._prior[1]*P_X_giv_y[1]
       
        return (Postorior1 > Postorior0).astype(float)

    def predict(self, X_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
        return self._predict(X_test)

    def score(self, X_test, y_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
        pred = self.predict(X_test)
        return accuracy_score(y_test, pred)

In [8]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [9]:
# My model
model = Categorical_NB()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

0.7622377622377622


In [10]:
# Sklearn model
from sklearn.naive_bayes import CategoricalNB
X_train_enc = MultiColumnLabelEncoder(columns = ['Sex','Embarked']).fit_transform(X_train)
X_test_enc = MultiColumnLabelEncoder(columns = ['Sex','Embarked']).fit_transform(X_test)
sklearn_model = CategoricalNB()
sklearn_model.fit(X_train_enc, y_train)
sklearn_pred = sklearn_model.predict(X_test_enc)
# print(sklearn_pred)
print(sklearn_model.score(X_test_enc, y_test))

0.7622377622377622


## Gaussian Naive Bayes Classifier

In [11]:
class Gaussian_NB:
    def __init__(self):
        pass
        self._means = {}
        self._stds = {}
        self._classes = []

    def fit(self, X_train, y_train):
        if type(X_train) != np.ndarray:
            X_train = X_train.to_numpy()
        if type(y_train) != np.ndarray:
            y_train = y_train.to_numpy()

        self._classes = np.unique(y_train)
        self._prior = {c:np.sum(y_train==c)/len(y_train) for c in self._classes}
        self._compute_params(X_train, y_train)

    def _compute_params(self, X_train, y_train):
        pass
        self._means = {c: X_train[y_train == c].mean(axis=0) for c in self._classes}
        self._stds = {c: X_train[y_train == c].std(axis=0) for c in self._classes}

    def _pdf(self, x, mean, std):
        return (1.0/(np.sqrt(2*np.pi)*std))*np.exp((-(x-mean)**2)/(2*std**2))

    def P_Xtest_giv_y(self, X_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
        
        P_X_test_giv_y = {}

        for c in self._classes:
            X_test_proba = np.zeros(X_test.shape)
            for j in range(X_test.shape[1]):
                mean = self._means[c][j]
                std = self._stds[c][j]
                for i in range(X_test.shape[0]):
                    x = X_test[i,j]
                    X_test_proba[i,j] = self._pdf(x, mean, std) # Bug is here, cannot assign to X_test_proba[i, j]
            P_X_test_giv_y[c] = np.prod(X_test_proba, axis=1)
        
        return P_X_test_giv_y

    def predict(self, X_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
        return self._predict(X_test)

    def _predict(self, X_test):
        updated_likelihood = self.P_Xtest_giv_y(X_test)
        Postorior = {c: self._prior[c] * updated_likelihood[c] for c in self._classes}
        return (Postorior[1] > Postorior[0]).astype(float)

    def score(self, X_test, y_test):
        if type(X_test) != np.ndarray:
            X_test = X_test.to_numpy()
        if type(y_test) != np.ndarray:
            y_test = y_test.to_numpy()
        pred = self.predict(X_test)
        return accuracy_score(pred, y_test)

In [12]:
# Prepare data
cols = ['SibSp', 'Parch', 'Fare']
X = data[cols]
y = data.Survived
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# My model
model = Gaussian_NB()
model.fit(X_train, y_train)
print(model.score(X_test,y_test))

0.6363636363636364


In [14]:
from sklearn.naive_bayes import GaussianNB
# Sklearn model
clf = GaussianNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(clf.score(X_test, y_test))
# print(pred)

0.6363636363636364


## Mix model

In [15]:
class Mix_Model:
    def __init__(self):
        pass
        self._classes = []
        self._prior = {}
        self._cate_model = None
        self._num_model = None

    def fit(self, X_train, y_train):
        X_train_cat = X_train.select_dtypes(include=['object'])
        X_train_num = X_train.select_dtypes(exclude=['object'])

        if type(X_train_cat) != np.ndarray:
            X_train_cat = X_train_cat.to_numpy() 
        if type(X_train_num) != np.ndarray:
            X_train_num = X_train_num.to_numpy() 
        if type(y_train) != np.ndarray:
            y_train = y_train.to_numpy()

        self._cate_model = Categorical_NB()
        self._cate_model.fit(X_train_cat, y_train)

        self._num_model = Gaussian_NB()
        self._num_model.fit(X_train_num, y_train)

        self._classes = np.unique(y_train)
        self._prior = {c:np.sum(y_train==c)/len(y_train) for c in self._classes}
        
    def _predict(self, X_test_num, X_test_cat):

        cat_likelihood = self._cate_model.P_Xtest_giv_y(X_test_cat)
        num_likelihood = self._num_model.P_Xtest_giv_y(X_test_num)
        Postorior = {c: cat_likelihood[c] * num_likelihood[c] / self._prior[c] for c in self._classes}
        return (Postorior[1] > Postorior[0]).astype(float)
    
    def predict(self, X_test):

        X_test_num = X_test.select_dtypes(exclude=['object'])
        X_test_cat = X_test.select_dtypes(include=['object'])

        if type(X_test_num) != np.ndarray:
            X_test_num = X_test_num.to_numpy()
        if type(X_test_cat) != np.ndarray:
            X_test_cat = X_test_cat.to_numpy()

        return self._predict(X_test_num, X_test_cat)

    def score(self, X_test, y_test):
        pred = self.predict(X_test)
        return accuracy_score(pred, y_test)

In [16]:
# Read data
df = pd.read_csv("https://raw.githubusercontent.com/dinhvietcuong1996/Lab-MachineLearningCourse/master/Lab04/lienminh.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   killsDiff          9879 non-null   int64 
 1   minionsKilledDiff  9879 non-null   int64 
 2   wardPlacedDiff     9879 non-null   int64 
 3   firstBlood         9879 non-null   object
 4   heralds            9879 non-null   object
 5   dragons            9879 non-null   object
 6   teamWins           9879 non-null   object
dtypes: int64(3), object(4)
memory usage: 540.4+ KB


Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red


In [17]:
df = df.dropna()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9879 entries, 0 to 9878
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   killsDiff          9879 non-null   int64 
 1   minionsKilledDiff  9879 non-null   int64 
 2   wardPlacedDiff     9879 non-null   int64 
 3   firstBlood         9879 non-null   object
 4   heralds            9879 non-null   object
 5   dragons            9879 non-null   object
 6   teamWins           9879 non-null   object
dtypes: int64(3), object(4)
memory usage: 617.4+ KB


Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,red
1,0,-66,0,red,red,red,red
2,-4,-17,0,red,none,blue,red
3,-1,-34,28,red,blue,none,red
4,0,-15,58,red,none,red,red


In [18]:
df_enc = MultiColumnLabelEncoder(columns=['teamWins']).fit_transform(df)
df_enc.head()

Unnamed: 0,killsDiff,minionsKilledDiff,wardPlacedDiff,firstBlood,heralds,dragons,teamWins
0,3,-2,13,blue,none,none,1
1,0,-66,0,red,red,red,1
2,-4,-17,0,red,none,blue,1
3,-1,-34,28,red,blue,none,1
4,0,-15,58,red,none,red,1


In [19]:
X = df_enc.drop('teamWins',axis=1)
y = df_enc.teamWins

In [20]:
# My mix model
mix_model = Mix_Model()
mix_model.fit(X, y)
pred_mixmodel = mix_model.predict(X)
print(mix_model.score(X, y))

0.7116104868913857
