In [219]:
import re
import os
import sys
import pandas as pd

data = pd.DataFrame(columns=['word', 'spam', 'not_spam'])

In [220]:
NOT_SPAM = 0
SPAM = 1


class MetaSingleton(type):
    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]


class Spam(metaclass=MetaSingleton):
    words_freq = {}
    total_words_count = {}
    count_spam = 0
    pA = 0
    pNotA = 0
    replace_char_list = {
        '!': '',
        '"': '',
        "'": '',
        '.': '',
        ',': '',
        '(': '',
        ')': '',
        '[': '',
        ']': '',
        '/': '',
        '{': '',
        '}': '',
        '?': '',
        '*': '',
        '#': '',
        '№': '',
        ':': '',
        ';': '',
        '%': '',
        '_': '',
        '$': '',
    }
    text_arr = {}

    # обучить данные
    def train(self, train_words):
        for text in train_words:
            # считаем количество спам предложений
            if text[1] == SPAM:
                self.count_spam += 1
            self._calculate_word_frequencies(text[0], text[1])

        self.pA = self.count_spam / len(train_words)
        self.pNotA = 1 - self.pA

    # расчет количества слов в каждом предложении
    def _calculate_word_frequencies(self, body, label):
        word_list = self._clear_text(body).lower().split(' ')

        for word in word_list:
            if word == '':
                continue

            # общий подсчет слов
            if not self.total_words_count.get(word):
                self.total_words_count[word] = 2

            self.total_words_count[word] += 1

            # количество слов в спаме и нет [не спам, спам]
            if not self.words_freq.get(word):
                self.words_freq[word] = [1, 1]

            self.words_freq[word][label] += 1

    # расчет вероятности слова
    def _calculate_P_Bi_A(self, word, label):
        return self.words_freq[word][label] / self.total_words_count[word]

    # расчет вероятности предложения
    def _calculate_P_B_A(self, text, label):
        word_list = text.lower().split(' ')
        pba = 1
        for word in word_list:
            if not self.words_freq.get(word):
                continue
            val = self._calculate_P_Bi_A(word, label)

            if not self.text_arr.get(word):
                self.text_arr[word] = [word, val]
            elif len(self.text_arr[word]) == 2:
                self.text_arr[word].append(val)
                self.text_arr[word].append(self.words_freq[word][SPAM])
                self.text_arr[word].append(self.words_freq[word][NOT_SPAM])
                self.text_arr[word].append(self.total_words_count[word])

            pba *= val

        return pba

    # отчистить текст от знаков
    def _clear_text(self, text):
        # убрать числа
        text = re.sub(r'\d+', '', text)

        for i, j in self.replace_char_list.items():
            text = text.replace(i, j)

        return text

    # классифицировать
    def classify(self, email):
        self.text_arr = {}
        email = self._clear_text(email.lower())
        spam_rate = self._calculate_P_B_A(email, SPAM) * self.pA / self.pNotA

        not_spam_rate = self._calculate_P_B_A(email, NOT_SPAM) * self.pA / self.pNotA
        return spam_rate > not_spam_rate


train_data = []


def train():
    Spam().train(train_data)


In [221]:
df = pd.read_csv(r'W:\python\DataSceince\classifier\application\raw\spam_or_not_spam.csv')
df = df.dropna()

# так как в программе используется обратные значения, меняем их местами 1 - значение спама
df.label = df.label.replace(to_replace={1:0,0:1})

In [222]:
# быстрый осмотр показал, что во многих письмах встречается слово NUMBER, таким образом мы его полностью уберем
def remove_number(text):
    text = text.replace('NUMBER', '')

    # после удаления останутся лишние пробелы их тоже надо убрать
    text = re.sub(r'\s+', ' ', text)
    return text

# применим функцию
df.email = df.email.apply(remove_number)

train_data = df.values.tolist()
train()

In [252]:
texts = [
    # 'Hi, My name is Warren E. Buffett an American business magnate, investor and philanthropist. am the most successful investor in the world. I believe strongly in‘giving while living’ I had one idea that never changed in my mind? that you should use your wealth to help people and i have decided to give {$1,500,000.00} One Million Five Hundred Thousand United Dollars, to randomly selected individuals worldwide. On receipt of this email, you should count yourself as the lucky individual. Your email address was chosen online while searching at random. Kindly get back to me at your earliest convenience before i travel to japan for my treatment , so I know your email address is valid. Thank you for accepting our offer, we are indeed grateful You Can Google my name for more information: God bless you. Best Regard Mr.Warren E. Buffett Billionaire investor !',
    # "Hi guys I want to build a website like REDACTED and I wanted to get your perspective of whether that site is good from the users' perspective before I go ahead and build something similar. I think that the design of the site is very modern and nice but I am not sure how people would react to a similar site? I look forward to your feedback. Many thanks!",

]

# for text in texts:
#     print(Spam().classify(text))

text = 'As a result of your application for the position of Data Engineer, I would like to invite you to attend an interview on May 30, at 9 a.m. at our office in Washington, DC. You will have an interview with the department manager, Moris Peterson. The interview will last about 45 minutes. If the date or time of the interview is inconvenient, please contact me by phone or email to arrange another appointment. We look forward to seeing you.'

text = Spam()._clear_text(text).lower()

In [253]:
tmp = []

for item in Spam().text_arr.values():
    tmp.append(item)

data = pd.DataFrame(tmp, columns=['word', 'spam', 'not_spam', 'count_spam', 'count_not_spam', 'count'])

# data

text

'as a result of your application for the position of data engineer i would like to invite you to attend an interview on may  at  am at our office in washington dc you will have an interview with the department manager moris peterson the interview will last about  minutes if the date or time of the interview is inconvenient please contact me by phone or email to arrange another appointment we look forward to seeing you'

In [270]:
# расчет вероятности слова
def calculate_P_Bi_A(word, label):
    return Spam().words_freq[word][label] / Spam().total_words_count[word]

# расчет вероятности предложения
def calculate_P_B_A(text, label):
    word_list = text.lower().split(' ')
    pba = 1
    for word in word_list:
        if not Spam().words_freq.get(word):
            continue
        pba *= Spam()._calculate_P_Bi_A(word, label)
    return pba
data.head()

Unnamed: 0,word,spam,not_spam,count_spam,count_not_spam,count
0,as,0.753429,0.246571,2527,827,3354
1,a,0.807901,0.192099,10552,2509,13061
2,result,0.76,0.24,57,18,75
3,of,0.749246,0.749246,10679,3574,14253
4,your,0.381077,0.618923,1168,1897,3065


In [286]:
count_spam = Spam().count_spam
pA = Spam().pA
pB = Spam().pNotA
print('P(A) - {:.10f}'.format(pA))
print('P(B) - {:.10f}'.format(pB))

pBA = calculate_P_B_A(text, SPAM)
pBNotA = calculate_P_B_A(text, NOT_SPAM)
print('P(B|A) - {:.20f}'.format(pBA))
print('P(B|notA) - {:.80f}'.format(pBNotA))

pAB = pBA * pA / pB
print('P(A)P(B|A){:.20f}'.format(pBA * pA))
print('P(A|B) - {:.20f}'.format(pAB))
pNotAB = pBNotA * pA / pB
print('P(notA|B) - {:.80f}'.format(pNotAB))


P(A) - 0.8336112037
P(B) - 0.1663887963
P(B|A) - 0.00000000000461337270
P(B|notA) - 0.00000000000000000000000000000000000000000000003069700194229710996788892127098282
P(A)P(B|A)0.00000000000384575917
P(A|B) - 0.00000000002311308970
P(notA|B) - 0.00000000000000000000000000000000000000000000015379259490128814105679486294754932
