<a href="https://colab.research.google.com/github/igorjoz/machine-learning-course-and-projects/blob/main/21_Ham_%26_Spam_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import random
import math

def load_email_data(spam_path, ham_path):
  with open(spam_path, encoding="utf-8") as f1, open(ham_path, encoding="utf-8") as f2:
    spam_data = json.load(f1)
    ham_data = json.load(f2)
  return spam_data + ham_data

def train_test_split(data, test_ratio=0.2):
  random.shuffle(data)
  cut = int(len(data) * (1 - test_ratio))
  return data[:cut], data[cut:]

def preprocess(text):
  return text.lower().replace('-', ' ').replace('.', ' ').replace('?', ' ').replace('!', ' ').replace(',', ' ').split()

def train_nb(train, alpha=1.0):
  class_counts = {}
  word_counts = {}
  total_words = {}

  for record in train:
    label = record["label"]
    class_counts[label] = class_counts.get(label, 0) + 1
    word_counts.setdefault(label, {})
    total_words.setdefault(label, 0)

    words = preprocess(record["text"])

    for word in words:
      word_counts[label][word] = word_counts[label].get(word, 0) + 1
      total_words[label] += 1

  vocab = set()
  for word_count in word_counts.values():
    vocab.update(word_count.keys())

  model = {
      "class_counts": class_counts,
      "word_counts": word_counts,
      "total_words": total_words,
      "vocab": vocab,
      "alpha": alpha,
      "total_docs": len(train)
  }

  print(model)
  return model

def log_prob(model, words, class_name):
    logp = math.log(model["class_counts"][class_name] / model["total_docs"])
    V = len(model["vocab"])
    a = model["alpha"]
    for word in words:
        wc = model["word_counts"][class_name].get(word, 0)
        logp += math.log((wc + a) / (model["total_words"][class_name] + a * V))
    return logp

def predict(model, text):
    words = preprocess(text)
    best_class, best_log = None, -float("inf")
    for c in model["class_counts"]:
        lp = log_prob(model, words, c)
        if lp > best_log:
            best_class, best_log = c, lp
    return best_class

def evaluate_model(model, test_data):
    correct = 0
    for rec in test_data:
        prediction = predict(model, rec["text"])
        if prediction == rec["label"]:
            correct += 1
    accuracy = correct / len(test_data)
    print(f"Skuteczność na zbiorze testowym: {accuracy * 100:.2f}%")
    return accuracy

In [2]:
from pprint import pprint

def main():
  data = load_email_data("spam.json", "ham.json")
  train, test = train_test_split(data)

  # print(data)

  model = train_nb(train)
  # pprint(model)

  evaluate_model(model, test)

  while True:
    user_input = input("Wpisz wiadomość do klasyfikacji (lub 'q' aby zakończyć): ")
    if user_input.lower() == 'q':
      break
    prediction = predict(model, user_input)
    print(f"Klasyfikacja: {prediction.upper()}")

main()

{'class_counts': {'spam': 1472, 'ham': 1288}, 'word_counts': {'spam': {'odbierz': 72, 'bezpłatny': 27, 'e': 80, 'book': 26, 'o': 59, 'zdrowym': 9, 'odżywianiu': 9, 'twoje': 142, 'konto': 114, 'wymaga': 58, 'natychmiastowej': 35, 'weryfikacji': 36, 'tylko': 114, 'dzisiaj': 29, '–': 630, 'wyjątkowa': 28, 'oferta': 91, 'dla': 72, 'ciebie': 75, 'zostałeś': 107, 'wytypowany': 42, 'do': 202, 'udziału': 42, 'w': 172, 'ekskluzywnej': 40, 'promocji': 39, 'nie': 119, 'czekaj': 3, 'wypłać': 1, 'gotówkę': 1, 'natychmiast': 10, 'zainstaluj': 51, 'tę': 26, 'aplikację': 33, 'i': 160, 'zacznij': 26, 'zarabiać': 26, 'zainwestuj': 33, '100': 71, 'zł': 138, 'zarób': 34, '10': 35, '000': 33, 'tydzień': 32, 'zdobądź': 68, 'darmową': 37, 'kartę': 43, 'podarunkową': 40, 'już': 49, 'teraz': 199, 'twój': 93, 'sąsiad': 1, 'zarabia': 2, 'a': 1, 'ty': 1, 'numer': 36, 'został': 73, 'wybrany': 122, 'sprawdź': 101, 'szczegóły': 37, 'hasło': 15, 'paypal': 12, 'wygasło': 14, 'zresetuj': 14, 'je': 19, 'kliknij': 195, '