In [3]:
import pandas as pd

#Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table(r'C:\Users\digus\Desktop\MachineLearning\smsspamcollection\SMSSpamCollection',
                  sep = '\t',
                  header = None,
                  names=['label', 'sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Map applies a function to all the items in an input list or df column.
df['label'] = df.label.map({'ham':0, 'spam':1}) #ham => 0, spam => 1로 분류
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [6]:
message = df.sms_message.tolist()  #df의 sms_message열을 list로 반환
message

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had you

In [7]:
#각 단어의 발생 빈도를 사전형식으로 반환
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer() #set the variable

count_vector.fit(message)
count_vector.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [8]:
mes_array = count_vector.transform(message).toarray()
mes_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                   df['label'],
                                                   random_state=1)
print ("Our Original set contains", df.shape[0], "observations")
print ("Our training set contains", X_train.shape[0], "observations")
print ("Our testing set contains", X_test.shape[0], "observations")

Our Original set contains 5572 observations
Our training set contains 4179 observations
Our testing set contains 1393 observations


In [10]:
train = count_vector.fit_transform(X_train)
test = count_vector.transform(X_test)

## Bernoulli document model 
* 시험 항목 W = (label = spam)의 클래스 레이블 예측
* 사후 확률 P(Ci|W) ->  P(spam|W) vs P(ham|W). 즉, P(W|spam)P(spam) vs P(W|ham)P(ham)
* 우도 P(W|Ci) -> P(W|spam), P(W|ham)
* 사전확률 P(Ci) = P(spam), P(ham)
* 사후 확률을 직접 구하는건 어렵기 때문에 사전확률과 우도를 미리 구한 다음 계산한다

### 1. 직접 구현하기

In [120]:
import os, math, random, re, glob
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

In [247]:
import pandas as pd

#Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table(r'C:\Users\digus\Desktop\MachineLearning\smsspamcollection\SMSSpamCollection',
                  sep = '\t',
                  header = None,
                  names=['label', 'sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [236]:
#Map applies a function to all the items in an input list or df column.
#df['label'] = df.label.map({'ham':0, 'spam':1}) #ham => 0, spam => 1로 분류
#df.head()

In [237]:
#messageList = df.sms_message.tolist()  #df의 sms_message열을 list로 반환
#messageList

In [248]:
def split_train_test(data, prob):
    np.random.seed(42)        
    shuffled_indices = np.random.permutation(len(data))
    #np.random.shuffle : array를 셔플해서 inplace한다. : 원본 array 자체가 변함
    test_set_size = int(len(data) * prob) #테스트 세트의 크기 = 전체 data 개수 * 테스트세트의 비율
    test_indices = shuffled_indices[:test_set_size] #suffle된 data array에서 test_set_size 크기 만큼을 잘라 test set의 data로 받자
    train_indices = shuffled_indices[test_set_size:] #suffle된 data array에서 위에서 자르고 남은 뒷부분을 train set의 data로 받자
    return data.iloc[train_indices], data.iloc[test_indices]

def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9]+", message)
    return set(all_words)


In [282]:
#for fn in df['label']:
#    is_spam = "ham" not in fn

False

In [249]:
print("전체 데이터 개수 : {}".format(len(df)))
print("\n")
train_data, test_data = split_train_test(df, 0.25)
print("훈련 데이터 개수 : {}".format(len(train_data)))
print("테스트 데이터 개수 : {}".format(len(test_data)))


전체 데이터 개수 : 5572


훈련 데이터 개수 : 4179
테스트 데이터 개수 : 1393


In [250]:
spamNum = len(train_data[train_data['label']=="spam"])
hamNum = len(train_data[train_data['label']=="ham"])
spam_train = train_data[train_data['label']=="spam"]
ham_train = train_data[train_data['label']=="ham"]
print("훈련 데이터 중 스팸 메일 개수 : {}".format(spamNum))
print("훈련 데이터 중 일반 메일 개수 : {}".format(hamNum))

훈련 데이터 중 스팸 메일 개수 : 561
훈련 데이터 중 일반 메일 개수 : 3618


In [251]:
test_spamNum = len(test_data[test_data['label']=="spam"])
test_hamNum = len(test_data[test_data['label']=="ham"])
test_spam = test_data[test_data['label']=="spam"]
test_ham = test_data[test_data['label']=="ham"]
print("테스트 데이터 중 스팸 메일 개수 : {}".format(test_spamNum))
print("테스트 데이터 중 일반 메일 개수 : {}".format(test_hamNum))

테스트 데이터 중 스팸 메일 개수 : 186
테스트 데이터 중 일반 메일 개수 : 1207


In [252]:
train_data_list = train_data.sms_message.tolist()
test_data_list = test_data.sms_message.tolist()
spam_train_list = spam_train.sms_message.tolist()
test_spam_list = test_spam.sms_message.tolist()

In [253]:
# 사전확률 계산
prior_spam = spamNum / (spamNum + hamNum)
prior_ham = hamNum / (spamNum + hamNum)

print("스팸메일의 사전확률 P(spam): ", prior_spam)
print("일반메일의 사전확률 P(ham): ", prior_ham)


스팸메일의 사전확률 P(spam):  0.1342426417803302
일반메일의 사전확률 P(ham):  0.8657573582196698


In [316]:
for i in train_data:
     is_spam = df[i][df["label"]=="spam"]
is_spam

2       Free entry in 2 a wkly comp to win FA Cup fina...
5       FreeMsg Hey there darling it's been 3 week's n...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
15      XXXMobileMovieClub: To use your credit, click ...
19      England v Macedonia - dont miss the goals/team...
34      Thanks for your subscription to Ringtone UK yo...
42      07732584351 - Rodger Burns - MSG = We tried to...
54      SMS. ac Sptv: The New Jersey Devils and the De...
56      Congrats! 1 year special cinema pass for 2 is ...
65      As a valued customer, I am pleased to advise y...
67      Urgent UR awarded a complimentary trip to Euro...
68      Did you hear about the new "Divorce Barbie"? I...
93      Please call our customer service representativ...
95      Your free ringtone is waiting to be collected....
114     GENT! 

In [318]:
all_words = {'spam':[], 'ham':[]}
bern_counts = defaultdict(lambda: [0,0])

for message in train_data:
    for word in tokenize(message):
        bern_counts[word][0 if is_spam.items() else 1] += 1

In [307]:
# 우도 계산
# spam : w라는 단어가 포함된 스팸문서 수
# 분자에 k, 분모에 2k만큼 더해준다
k=0.5 #스무딩
bern_likelihood = [(w, (spam + k)/(spamNum + 2*k),
                  (ham + k)/(hamNum + 2*k))
                  for w, (spam, ham) in bern_counts.items()]

print("학습 단어장의 크기:{}".format(len(bern_counts)))

학습 단어장의 크기:3


In [308]:
def bernoulliModel(message):
    global prior_spam, prior_ham
    
    message_words = tokenize(message)
    #log_prob_spam = log_prob_ham = 0.0
    
    log_prob_spam, log_prob_ham = math.log(prior_spam), math.log(prior_ham)
    
    for word, prob_spam, prob_ham in bern_likelihood:
        if word in message_words:
            log_prob_spam += math.log(prob_spam)
            log_prob_ham += math.log(prob_ham)
            
        else:
            log_prob_spam += math.log(1.0 - prob_spam)
            log_prob_ham += math.log(1.0 - prob_ham)
    prob_spam = math.exp(log_prob_spam)
    prob_ham = math.exp(log_prob_ham)
    return prob_spam / (prob_spam + prob_ham)

In [309]:
bern_classified = [(subject,is_spam, bernoulliModel(subject))
                  for subject in test_data]

In [314]:
counts = Counter((is_spam, spam_prob > 0.5)
                for _, spam_prob, is_spam in bern_classified)
counts

TypeError: '>' not supported between instances of 'str' and 'float'

In [137]:
print("정밀도 : {:f}".format(counts[(True,True)] / (counts[(True,True)]+counts[(False,True)])))
print("재현율 : {:f}".format(counts[(True,True)] / (counts[(True,True)]+counts[(True,False)])))

ZeroDivisionError: division by zero

### 2. sklearn 모듈을 이용한 Bernoulli

In [43]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
message = df.sms_message.tolist()  #df의 sms_message열을 list로 반환
message

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
 'Had you

In [45]:
#각 단어의 발생 빈도를 사전형식으로 반환
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer() #set the variable

count_vector.fit(message)
count_vector.get_feature_names()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

In [46]:
X_data = count_vector.transform(message).toarray()
X_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                   df['label'],
                                                   random_state=1)
model = BernoulliNB()
#class sklearn.naive_bayes.BernoulliNB(alpha=1.0,binarize=0.0, fit_prior=True, class_prior=None)
# alpha -> 라플라스 스무딩 (default = 1.0).
# 각 단어에 대한 확률의 분모, 분자에 전부 숫자를 더해서 분자가 0이 되는 것을 방지
model.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [48]:
print("테스트 데이터의 분류 성능: ", model.score(X_test, y_test))

테스트 데이터의 분류 성능:  0.9813352476669059


## Multinomial document model

### 1. 직접 구현하기

In [49]:
import os, math, random, re, glob
from collections import Counter, defaultdict

In [50]:
import pandas as pd

#Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table(r'C:\Users\digus\Desktop\MachineLearning\smsspamcollection\SMSSpamCollection',
                  sep = '\t',
                  header = None,
                  names=['label', 'sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
def split_train_test(data, prob):
    np.random.seed(42)        
    shuffled_indices = np.random.permutation(len(data))
    #np.random.shuffle : array를 셔플해서 inplace한다. : 원본 array 자체가 변함
    test_set_size = int(len(data) * prob) #테스트 세트의 크기 = 전체 data 개수 * 테스트세트의 비율
    test_indices = shuffled_indices[:test_set_size] #suffle된 data array에서 test_set_size 크기 만큼을 잘라 test set의 data로 받자
    train_indices = shuffled_indices[test_set_size:] #suffle된 data array에서 위에서 자르고 남은 뒷부분을 train set의 data로 받자
    return data.iloc[train_indices], data.iloc[test_indices]

def tokenize2(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9]+", message)
    return all_words

In [52]:
for fn in df:
    is_spam = 'label'=="ham" not in fn

In [53]:
print("전체 데이터 개수 : {}".format(len(df)))
print("\n")
train_data, test_data = split_train_test(df, 0.25)
print("훈련 데이터 개수 : {}".format(len(train_data)))
print("테스트 데이터 개수 : {}".format(len(test_data)))


전체 데이터 개수 : 5572


훈련 데이터 개수 : 4179
테스트 데이터 개수 : 1393


In [55]:

spamNum = len(train_data[train_data['label']=="spam"])
hamNum = len(train_data) - spamNum
spam_train = train_data[train_data['label']=="spam"]
ham_train = train_data[train_data['label']=="ham"]
print("훈련 데이터 중 스팸 메일 개수 : {}".format(spamNum))
print("훈련 데이터 중 일반 메일 개수 : {}".format(hamNum))

훈련 데이터 중 스팸 메일 개수 : 561
훈련 데이터 중 일반 메일 개수 : 3618


In [56]:
test_spamNum = len(test_data[test_data['label']=="spam"])
test_hamNum = len(test_data) - test_spamNum
test_spam = test_data[test_data['label']=="spam"]
test_ham = test_data[test_data['label']=="ham"]
print("테스트 데이터 중 스팸 메일 개수 : {}".format(test_spamNum))
print("테스트 데이터 중 일반 메일 개수 : {}".format(test_hamNum))

테스트 데이터 중 스팸 메일 개수 : 186
테스트 데이터 중 일반 메일 개수 : 1207


In [57]:
train_data_list = train_data.sms_message.tolist()
test_data_list = test_data.sms_message.tolist()
spam_train_list = spam_train.sms_message.tolist()
test_spam_list = test_spam.sms_message.tolist()

In [58]:
# 사전확률 계산
prior_spam = spamNum / (spamNum + hamNum)
prior_ham = hamNum / (spamNum + hamNum)

print("스팸메일의 사전확률 P(spam): ", prior_spam)
print("일반메일의 사전확률 P(ham): ", prior_ham)


스팸메일의 사전확률 P(spam):  0.1342426417803302
일반메일의 사전확률 P(ham):  0.8657573582196698


In [59]:
#is_trainset_spam = train_data["label"]=="spam"
#is_trainset_spam

In [60]:
for ln in train_data["label"]:
    is_trainset_spam = "ham" not in ln

In [61]:
all_words = {'spam':[], 'ham':[]}
multi_counts = defaultdict(lambda: [0,0])
for message in train_data:
    for word in tokenize2(message):
        multi_counts[word][0 if is_trainset_spam else 1] += 1
        if is_trainset_spam:
            all_words['spam'].append(word)
        else:
            all_words['ham'].append(word)

In [319]:
Vfs = len(all_words['spam'])
Vfh = len(all_words['ham'])
    
Vs = len(set(all_words['spam']))
Vh = len(set(all_words['ham']))

print("스팸메일에 있는 단어수(중복X) : {}".format(Vs))
print("일반메일에 있는 단어수(중복X) : {}".format(Vh))      

스팸메일에 있는 단어수(중복X) : 0
일반메일에 있는 단어수(중복X) : 0


In [320]:
k=0.1
multi_likelihood = [(w, (spam + k)/(Vfs + k*Vs),
                    (ham + k)/ (Vfh + 2*Vh))
                   for w, (spam, ham) in multi_counts.items()]

ZeroDivisionError: float division by zero

In [None]:
def multinomialModel(message):

    global prior_spam, prior_ham
   
    message_words = tokenize2(message)

    log_prob_spam ,log_prob_ham = math.log(prior_spam),math.log(prior_ham)
    
    for word, prob_spam, prob_ham in multi_likelihood:
        if word in message_words:
            log_prob_spam += math.log(prob_spam)
            log_prob_ham += math.log(prob_ham)    
        
    prob_spam     = math.exp(log_prob_spam)
    prob_ham = math.exp(log_prob_ham)

    return prob_spam / (prob_spam + prob_ham) 

In [None]:
multi_classified = [(subject, is_spam, multinomialModel(subject))
              for subject, is_spam in test_data]

In [None]:
counts = Counter((is_spam, spam_prob > 0.5) 
                for _, is_spam, spam_prob in mtnm_classified)

In [None]:
print("정밀도 : {:f}".format( counts[(True,True)] / (counts[(True,True)]+counts[(False,True)]) ) )
print("재현율 : {:f}".format( counts[(True,True)] / (counts[(True,True)]+counts[(True,False)]) ) )

### 2. sklearn 모듈을 이용한 Multinomial

In [80]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                   df['label'],
                                                   random_state=1)
model2 = MultinomialNB()
model2.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
print("테스트 데이터의 분류 성능: ", model2.score(X_test, y_test))

테스트 데이터의 분류 성능:  0.9834888729361091


* 중간 코드의 에러를 해결하지 못하여 결과를 끝까지 내지 못하였습니다....