In [26]:
import math, sys
#from konlpy.tag import Twitter
class BayesianFilter:
    """ 베이지안 필터 """
    def __init__(self):
        self.words = set() # 출현한 단어 기록
        self.word_dict = {} # 카테고리마다의 출현 횟수 기록
        self.category_dict = {} # 카테고리 출현 횟수 기록
    # 형태소 분석하기 --- (※1)
    def split(self, text):
        results = []
        list = text.split()
        for word in list:
            results.append(word)
        return results
    # 단어와 카테고리의 출현 횟수 세기 --- (※2)
    def inc_word(self, word, category):
        # 단어를 카테고리에 추가하기
        if not category in self.word_dict:
            self.word_dict[category] = {}
        if not word in self.word_dict[category]:
            self.word_dict[category][word] = 0
        self.word_dict[category][word] += 1
        self.words.add(word)
    def inc_category(self, category):
        # 카테고리 계산하기
        if not category in self.category_dict:
            self.category_dict[category] = 0
        self.category_dict[category] += 1
    
    # 텍스트 학습하기 --- (※3)
    def fit(self, text, category):
        """ 텍스트 학습 """
        word_list = self.split(text)
        for word in word_list:
            self.inc_word(word, category)
        self.inc_category(category)
    
    # 단어 리스트에 점수 매기기--- (※4)
    def score(self, words, category):
        score = math.log(self.category_prob(category))
        for word in words:
            score += math.log(self.word_prob(word, category))
        return score
    
    # 예측하기 --- (※5)
    def predict(self, text):
        best_category = None
        max_score = -sys.maxsize 
        words = self.split(text)
        for word in words:
            if word in harmful_url_dic:
                score_list = []
                for category in self.category_dict.keys():
                    score = self.score(words, category)
                    score_list.append((category, score))
                    if score > max_score:
                        max_score = score
                        best_category = category
                
        
        return best_category, score_list
    # 카테고리 내부의 단어 출현 횟수 구하기
    def get_word_count(self, word, category):
        if word in self.word_dict[category]:
            return self.word_dict[category][word]
        else:
            return 0
    # 카테고리 계산
    def category_prob(self, category):
        sum_categories = sum(self.category_dict.values())
        category_v = self.category_dict[category]
        return category_v / sum_categories
        
    # 카테고리 내부의 단어 출현 비율 계산 --- (※6)
    def word_prob(self, word, category):
        n = self.get_word_count(word, category) + 1 # ---(※6a)
        d = sum(self.word_dict[category].values()) + len(self.words)
        return n / d
    


In [27]:
harmful_url_dic = {'sex': 1, 'porn': 1, 'gay': 1, 'tube': 1, 'pornhub':1,
                   'xxx': 1, 'fuck': 1, 'dick': 1, 'tit': 2,
                   'anal': 1, 'asian': 1, 'adult': 1, 'shemal': 1,'lesbian': 1,

                   'teen': 2, 'big': 2, 'girl': 2, 'pussi': 2, 'ass': 2, 'amateur': 2, 'matur': 2,
                   'cock': 2,'milf': 2, 'sexi': 2, 'babe':2, 'nud':2, 'nude': 2,
                   'blond': 2, 'blowjob': 2, 'cum': 2, 'young': 2,
                   'hardcor': 2, 'hardcore': 2 ,

                   'game': 3, 'casino': 3, 'play': 3, 'onlin': 3,'online':3, 
                   'card': 3,  'race': 3,  'admin': 3,

                   'escort': 4, 'servic': 4, 'service':4, 'agenc': 4,
                   'galleri': 4,'massag': 4, 'model': 4, 
                   'vip': 4, 'profil': 4,
                   
                   
                   'cam':5, 'video':5, 'chat':5, 'webcam': 5,
                   
                   'domain':7, 'domains':7, 'gateway':7
}


In [28]:
bf = BayesianFilter()
for i in harmful_url_dic:
    bf.fit(i,harmful_url_dic[i])
    

In [29]:
pre, scorelist = bf.predict("good")
print(pre)
print(scorelist)

2
[(1, -5.726597152422373), (2, -5.391124416134243), (3, -6.137996996050351), (4, -6.035481432524756), (5, -6.767630770887971), (7, -7.038783541388542)]
