##나이브 베이즈 분류기 구현

<br>
1. 베이즈 정리. <br>

 $P(A| W_1, W_2, W_3,...) = {P(W_1, W_2, W_3,...|A) \cdot P(A) \over P(W_1, W_2, W_3,....)}$ <br>
 $P(B| W_1, W_2, W_3,...) = {P(W_1, W_2, W_3,...|B) \cdot P(B) \over P(W_1, W_2, W_3,....)}$

<br>
2. 동일한 분모를 무시하면 다음 비례관계가 성립된다. <br>

 $P(A| W_1, W_2, W_3, \ldots) \sim P(W_1, W_2, W_3,...|A) \cdot P(A) $ <br>
 $P(B| W_1, W_2, W_3, \ldots) \sim P(W_1, W_2, W_3,...|B) \cdot P(B) $
 
<br>
3. 독립적인 단어 분포를 전제하면 다음과 같이 분해할 수 있다. <br>

 $P(A| W_1, W_2, W_3, \ldots) \sim  P(W_1|A)\cdot P(W_2|A)\cdot P(W_3|A) \cdots P(A) $ <br>
 $P(B| W_1, W_2, W_3, \ldots) \sim P(W_1|B)\cdot P(W_2|B)\cdot P(W_3|B) \cdots P(B) $

<br>
4. 이제는 로그함수를 적용해 본다. <br>

$Log_A \sim  Log(P(W_1|A)) +  Log(P(W_2|A)) + Log(P(W_3|A)) + \ldots + Log(P(A)) $ <br>
$Log_B \sim  Log(P(W_1|B)) +  Log(P(W_2|B)) + Log(P(W_3|B)) + \ldots + Log(P(B)) $
<br>

**<span style="color:blue">결론</span>**:  $Log_A$와 $Log_B$를 비교해서 큰 쪽으로 인식!

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# A 유형의 트윗을 읽어온다. 
f = open("/content/drive/MyDrive/인공지능사관학교/빅데이터/03_확률 모델링_업데이트/data/tweets_A.txt","r",encoding="ms949")    # Encoding 주의!
ta = f.readlines()
f.close()

# B 유형의 트윗을 읽어온다. 
f = open("/content/drive/MyDrive/인공지능사관학교/빅데이터/03_확률 모델링_업데이트/data/tweets_B.txt","r", encoding="ms949")   # Encoding 주의!
tb = f.readlines()
f.close()

In [3]:
print(len(ta))
print(len(tb))

150
150


##데이터 전처리 

In [4]:
def preprocessor(tweet):
  freq_dict = {}
  for a_line in tweet:
        a_line = a_line.lower()               # 소문자화.
        a_line = re.sub(r"\W"," ",a_line)     # 특수문자 제거.
        a_line = re.sub(r"\d", " ", a_line)   # 숫자 제거.
        a_line = re.sub("a|the|and|or|because|at", " ",a_line)  #  불용어 제거.
        a_line = a_line.split()  
        for a_word in a_line:
          if len(a_word) > 3:
            if a_word in freq_dict:
              freq_dict[a_word]+=1
            else:
              freq_dict[a_word] = 2 #처음 있지만 이 단어가 없다고 해서 도수를 0 으로 세팅하면 확률이 0 이된다  = log 0 은 -inf 에러가 발생할수 있음
  return freq_dict
              

##학습 모형 준비 

In [5]:
# Series로 변환.
freq_a = pd.Series(preprocessor(ta)).sort_values(ascending=False)
freq_b = pd.Series(preprocessor(tb)).sort_values(ascending=False) 

In [6]:
n_voca = 300
freq_a = freq_a.iloc[:n_voca]
freq_b = freq_b.iloc[:n_voca]

In [7]:
freq_a_sum = freq_a.sum()
log_prob_a = dict(np.log(freq_a/freq_a.sum()))
log_prob_a

{'been': -5.927592470513571,
 'best': -6.215274542965353,
 'bhijeetmk': -6.620739651073516,
 'bien': -6.215274542965353,
 'biggoldring': -6.620739651073516,
 'bility': -6.620739651073516,
 'blocked': -6.620739651073516,
 'blog': -5.011301738639416,
 'blueh': -6.620739651073516,
 'both': -6.620739651073516,
 'bounce': -5.367976682578148,
 'bout': -5.011301738639416,
 'bugs': -6.620739651073516,
 'bulk': -6.620739651073516,
 'bywbf': -6.215274542965353,
 'ccion': -6.620739651073516,
 'ccount': -5.116662254297243,
 'cemsism': -6.620739651073516,
 'ceofsp': -6.620739651073516,
 'check': -6.215274542965353,
 'checked': -6.620739651073516,
 'cher': -5.927592470513571,
 'chez': -6.620739651073516,
 'chrislem': -5.522127362405407,
 'client': -5.704448919199361,
 'clients': -6.620739651073516,
 'comp': -6.215274542965353,
 'confusing': -6.620739651073516,
 'consider': -6.620739651073516,
 'cont': -6.620739651073516,
 'ctcode': -6.620739651073516,
 'ction': -4.674829502018204,
 'currently': -6.2

In [8]:
freq_b_sum = freq_b.sum()
log_prob_b = dict(np.log(freq_b/freq_b_sum))
log_prob_b

{'become': -6.26530121273771,
 'been': -5.8598361046295455,
 'behind': -6.26530121273771,
 'best': -5.8598361046295455,
 'bico': -6.26530121273771,
 'body': -6.26530121273771,
 'bouin': -6.26530121273771,
 'bout': -5.349010480863555,
 'brit': -6.26530121273771,
 'burnziey': -6.26530121273771,
 'butts': -5.8598361046295455,
 'cccp': -6.26530121273771,
 'ccount': -6.26530121273771,
 'ccounts': -6.26530121273771,
 'ccp_gu': -5.8598361046295455,
 'ccp_m': -6.26530121273771,
 'ccpg': -5.349010480863555,
 'ceships': -5.8598361046295455,
 'chful': -6.26530121273771,
 'choice': -5.8598361046295455,
 'chrisjboyl': -6.26530121273771,
 'christm': -6.26530121273771,
 'city': -6.26530121273771,
 'cket': -6.26530121273771,
 'ckground': -6.26530121273771,
 'clone': -6.26530121273771,
 'coffee': -6.26530121273771,
 'come': -6.26530121273771,
 'comme': -6.26530121273771,
 'como': -5.8598361046295455,
 'compl': -6.26530121273771,
 'compren': -6.26530121273771,
 'condition': -6.26530121273771,
 'cono': -

##test data 가져오기 

In [9]:
f = open("/content/drive/MyDrive/인공지능사관학교/빅데이터/03_확률 모델링_업데이트/data/tweets_test.txt","r", encoding="ms949")   # Encoding 주의!
tt = f.readlines()
f.close()
tt

['Just love @Hyperionapp transactional email service - http://Hyperion.com Sorry @SendGrid and @mailjet #timetomoveon\n',
 "@rossdeane Mind submitting a request at http://help.Hyperion.com with account details if you haven't already? Glad to take a look!\n",
 "@veroapp Any chance you'll be adding Hyperion support to Vero?\n",
 '@Elie__ @camj59 jparle de relai SMTP!1 million de mail chez Hyperion / mois compare a 1 million sur lite sendgrid y a pas photo avec mailjet\n',
 'would like to send emails for welcome, password resets, payment notifications, etc. what should i use? was looking at mailgun/Hyperion\n',
 'From Coworker about using Hyperion:  "I would entrust email handling to a Pokemon".\n',
 '@Hyperion Realised I did that about 5 seconds after hitting send!\n',
 'Holy shit. It’s here. http://www.Hyperion.com/ \n',
 'Our new subscriber profile page: activity timeline, aggregate engagement stats, and Hyperion integratio #BJCBranding http://bit.ly/13waU5c \n',
 '@Hyperionapp increas

In [13]:
# 테스트 트윗의 유형 정보를 읽어온다 (Y_test).
f = open("/content/drive/MyDrive/인공지능사관학교/빅데이터/03_확률 모델링_업데이트/data/tweets_test_class.txt","r")
Y_test_raw = f.read()                 # 한 덩어리로 읽어온다.
Y_test = Y_test_raw.split()           # 분절을 통해서 깔끔히!
f.close()

##예측 실시

In [16]:
Y_pred = []
for a_sentence in tt:
    log_prob_sum_a = 0.
    log_prob_sum_b = 0.
    a_sent_preprocessed = preprocessor([a_sentence])
    for a_word, a_freq in a_sent_preprocessed.items():
        if a_word in log_prob_a:
            log_prob_sum_a += log_prob_a[a_word]
        else:
            log_prob_sum_a += np.log(1.0/freq_a_sum)
            
        if a_word in log_prob_b:
            log_prob_sum_b += log_prob_b[a_word]
        else:
            log_prob_sum_b += np.log(1.0/freq_b_sum)
            
    if (log_prob_sum_a > log_prob_sum_b):
        Y_pred.append("A")
    else:
        Y_pred.append("B")

In [21]:
correct = pd.Series([y1==y2] for (y1,y2) in zip(Y_pred,Y_test))
print(correct)

0      [True]
1      [True]
2      [True]
3      [True]
4      [True]
5      [True]
6      [True]
7     [False]
8      [True]
9      [True]
10     [True]
11     [True]
12     [True]
13     [True]
14     [True]
15     [True]
16     [True]
17     [True]
18     [True]
19     [True]
dtype: object
