# Gaussian Naive Bayes Classification

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_iris

In [3]:
# Load the Iris dataset from sklearn.datasets; later cells read iris.DESCR,
# iris.data and iris.target from the returned object.
iris = load_iris()

In [4]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

* 설명변수: 꽃받침 길이, 꽃받침 폭 , 꽃잎 길이 , 꽃잎 폭  <br> 
(sepal length , sepal width , petal length , petal width)


* 타겟변수: 붓꽃의 품종 - 총 3가지 종류  <br>
(0: Iris-Setosa, 1: Iris-Versicolor, 2: Iris-Virginica)

In [5]:
# Wrap the raw arrays in DataFrames: X holds the four measurements per flower.
X = pd.DataFrame(iris.data)
# y holds the integer class label (0, 1 or 2). Note that this is a
# single-column DataFrame, i.e. 2-D, not a 1-D Series.
y = pd.DataFrame(iris.target)

In [6]:
X

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [7]:
y

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# 1-1) assignment

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)

* X와 y를 train set과 test set으로 split 
* test_size: 나누고자 하는 비율 (test 0.2, train=1-0.2=0.8)
* random_state: seed 고정
    - seed를 지정해 주지 않으면, 데이터를 랜덤으로 바꿔가면서 split하게 됨
    - seed를 지정해 주면, 같은 값이 추출되도록 고정시킬 수 있음

# 1-2) assignment
* 가우시안 나이브 베이즈

In [10]:
# Fit Gaussian Naive Bayes on the training split, then predict the held-out rows.
gnb = GaussianNB()
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
gnb.fit(X_train, y_train.values.ravel())
y_pred = gnb.predict(X_test)
# Accuracy on the test split (last expression, so the notebook displays it).
gnb.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.9

* fit: X_train과 y_train으로 클래스별 사전확률(priors)과 각 설명변수의 클래스별 평균·분산을 추정
* predict: 추정된 사전확률과 가우시안 가능도(likelihood)로 X_test 각 행의 사후확률을 계산해, 사후확률이 가장 큰 클래스를 y_pred로 예측
* score: X_test에 대한 예측값과 y_test값 사이의 정확도(accuracy)   

# Assignment 2 : Naive Bayes Classification 해보기
- gamble, money, hi: 메세지 내에 해당 단어가 존재하는지 여부 (있으면 1, 없으면 0)   
spam: spam 메세지이면 1, 아니면 0 

- 설명변수: gamble, money, hi / 종속변수: spam

In [11]:
# Toy spam dataset, one list per column (all lists have the same length).
# 'gamble', 'money' and 'hi' are 0/1 flags for whether the word appears in a
# message; 'spam' is the 0/1 label (1 = spam).
gamble_spam = {'gamble' : [1,0,1,0,1,0,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,
                           1,0,1,1,0,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,1,0,1,0,1,
                           0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,1,0,1,1,1,1,1,0,0,
                           1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,1,0,1,1,1,0,1,1],
               'money' : [1,1,1,0,1,0,0,0,1,0,0,0,1,0,1,1,0,1,1,0,1,1,1,1,1,
                          0,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,1,1,0,1,0,0,1,0,1,
                          1,0,1,1,0,1,0,1,0,1,1,0,0,0,1,1,0,0,0,1,1,1,1,1,1,
                          1,1,0,1,0,1,1,0,0,1,0,1,1,1,1,0,0,1,0,0,1,0,0,1,0],
               'hi' : [0,1,0,1,0,1,0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,0,0,0,
                       1,0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,0,
                       1,0,0,0,1,1,1,1,0,1,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0,
                       1,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0],
                'spam' : [1,0,1,0,1,0,0,0,1,0,0,0,1,1,1,1,0,1,1,0,0,0,1,1,0,
                          1,0,1,1,0,0,1,1,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,1,
                          0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,
                          1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,1,0,1,1,1,0,1,1]}

In [12]:
# Build the table with an explicit column order: the three word flags first,
# then the label, so the label ends up in the last column.
df = pd.DataFrame(
    gamble_spam,
    columns=['gamble', 'money', 'hi', 'spam'],
)

# 2-1) assignment

In [13]:
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same 2-D NumPy array of the cell values.
spam_data = df.values

  """Entry point for launching an IPython kernel.


In [14]:
spam_data

array([[1, 1, 0, 1],
       [0, 1, 1, 0],
       [1, 1, 0, 1],
       [0, 0, 1, 0],
       [1, 1, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [1, 0, 0, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 0, 1, 0],
       [1, 1, 0, 1],
       [0, 1, 1, 1],
       [0, 0, 1, 0],
       [0, 1, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 0],
       [1, 0, 1, 1],
       [0, 0, 0, 0],
       [1, 0, 0, 1],
       [1, 1, 1, 1],
       [0, 1, 0, 0],
       [1, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 1, 1],
       [0, 0, 0, 0],
       [1, 1, 1, 0],
       [1, 1, 1, 0],
       [1, 0, 1, 0],
       [1, 0, 0, 1],
       [0, 0, 1, 1],
       [0, 1, 1, 0],
       [0, 0, 1, 1],
       [1, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 1, 1],
       [0, 1, 0, 0],
       [1, 0, 1, 1],
       [0, 0, 0, 0],
       [1, 1,

# 2-2) assignment

#### 1) P(spam=1), P(spam=0)

In [15]:
# Class priors estimated from the label column (index 3): the fraction of rows
# labelled spam, and its complement.
p_spam = (spam_data[:, 3] == 1).sum() / spam_data.shape[0]  # P(spam=1)
p_spam_not = 1.0 - p_spam  # P(spam=0)

#### 2) gamble , money , hi의 조건부 확률
* P(gamble=1|spam=1), P(money=1|spam=1), P(hi=1|spam=1)
* P(gamble=1|spam=0), P(money=1|spam=0), P(hi=1|spam=0)

In [16]:
def _word_rate(word_col, label):
    """Estimate P(word=1 | spam=label) by counting rows of spam_data.

    word_col is the column index of the word flag (0=gamble, 1=money, 2=hi);
    the class label is read from column 3.
    """
    in_class = spam_data[:, 3] == label
    has_word = spam_data[:, word_col] == 1
    # Rows in the class that contain the word, divided by all rows in the class.
    return (has_word & in_class).sum() / in_class.sum()


p_gamble_spam = _word_rate(0, 1)  # P(gamble=1|spam=1)
p_gamble_spam_not = _word_rate(0, 0)  # P(gamble=1|spam=0)

p_money_spam = _word_rate(1, 1)  # P(money=1|spam=1)
p_money_spam_not = _word_rate(1, 0)  # P(money=1|spam=0)

p_hi_spam = _word_rate(2, 1)  # P(hi=1|spam=1)
p_hi_spam_not = _word_rate(2, 0)  # P(hi=1|spam=0)

#### 3) P(* |spam=1)값 리스트, P(* |spam=0)값 리스트 생성

In [17]:
# Per-word conditional probabilities, in column order (gamble, money, hi).
proba = [
    p_gamble_spam,
    p_money_spam,
    p_hi_spam,
]  # P(*|spam=1)
proba_not = [
    p_gamble_spam_not,
    p_money_spam_not,
    p_hi_spam_not,
]  # P(*|spam=0)

In [18]:
proba, proba_not

([0.8333333333333334, 0.5476190476190477, 0.4523809523809524],
 [0.1896551724137931, 0.5344827586206896, 0.4482758620689655])

#### 4) test set 
* ex) [0,1,0]인 경우 = (gamble=0,money=1,hi=0인 경우) -> spam인지 아닌지 확률 계산
*  [0,0,0] ~ [1,1,1] 8가지 모든 경우에 대해 사후확률 P(spam=1| * ) 구함

In [19]:
# All 2**3 combinations of the three 0/1 flags, in binary counting order:
# the bits of n (most significant first) are gamble, money, hi.
test = [[(n >> 2) & 1, (n >> 1) & 1, n & 1] for n in range(8)]

In [20]:
test

[[0, 0, 0],
 [0, 0, 1],
 [0, 1, 0],
 [0, 1, 1],
 [1, 0, 0],
 [1, 0, 1],
 [1, 1, 0],
 [1, 1, 1]]

# 2-3) assignment

#### 1) 조건부 확률 계산 
* x: 해당 독립변수가 0인지 1인지를 받는 인자
* p: 해당 독립변수가 1일때의 조건부 확률

In [21]:
# P(X=x|Y=1) = xP(X=1|Y=1)+(1-x)P(X=0|Y=1) 응용

def con_proba(x, p):
    """Return the likelihood of observing value x for one binary feature.

    x: observed feature value, 0 or 1.
    p: conditional probability that the feature equals 1.

    Evaluates to p when x is 1 and to 1 - p when x is 0.
    """
    present_term = x * p
    absent_term = (1 - x) * (1 - p)
    return present_term + absent_term

#### 2) 확률값 반환 
* test set 각각의 경우에 대해 확률값을 반환해주는 함수 생성 

In [22]:
def process(p_spam, p_spam_not, test, proba, proba_not):
    """Return the Naive Bayes posterior for every case in test.

    p_spam, p_spam_not: class priors P(spam=1) and P(spam=0).
    test: iterable of cases, each a sequence of 0/1 feature values.
    proba, proba_not: P(feature=1|spam=1) and P(feature=1|spam=0), one entry
        per feature, in the same order as the values inside a case.

    Returns a list with one [P(spam=1|case), P(spam=0|case)] pair per case.

    Cases and features are iterated directly rather than through fixed
    range(8) / range(3) indices, so any number of cases (including none) and
    any number of features is accepted.
    """
    result = []
    for case in test:
        # Unnormalised joint probabilities: the prior times the product of the
        # per-feature likelihoods (the naive independence assumption).
        joint_spam = p_spam
        joint_not = p_spam_not
        for x, p_word_spam, p_word_not in zip(case, proba, proba_not):
            joint_spam = joint_spam * con_proba(x, p_word_spam)  # P(case, spam=1)
            joint_not = joint_not * con_proba(x, p_word_not)  # P(case, spam=0)
        # Dividing by the total evidence P(case) makes the pair sum to 1.
        evidence = joint_spam + joint_not
        result.append([joint_spam / evidence, joint_not / evidence])
    return result

#### 3) 결과
* 왼쪽이 spam 메세지일 확률, 오른쪽이 spam 메세지가 아닐 확률
* gamble, money, hi 단어가 포함될수록 spam 메세지일 확률이 높아지지만, money와 hi의 영향은 매우 작음 
* 특히 5,6,7,8 행에서 spam 메세지일 확률이 높음 (gamble = 1)   
-> gamble 이라는 단어가 있으면 spam 메세지로 분류될 확률이 높음   

In [23]:
process(p_spam,p_spam_not,test,proba,proba_not)

[[0.12561158047604412, 0.874388419523956],
 [0.12744440801721746, 0.8725555919827825],
 [0.1315383089295994, 0.8684616910704007],
 [0.13344441688939654, 0.8665555831106033],
 [0.7542408952456083, 0.24575910475439158],
 [0.7573019784074859, 0.24269802159251408],
 [0.7639150506833852, 0.23608494931661478],
 [0.7668928774284339, 0.23310712257156604]]