<a href="https://colab.research.google.com/github/igorjoz/machine-learning-course-and-projects/blob/main/22_Ham_%26_Spam_classification_gmail_integration_lekcja.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import random
import math

def load_email_data(spam_path, ham_path):
  """Read the spam and ham JSON files and return their contents joined together.

  Both files are opened as UTF-8 before either is parsed. The spam content
  comes first in the result, followed by the ham content.

  Args:
    spam_path: Path of the JSON file holding the spam examples.
    ham_path: Path of the JSON file holding the ham examples.

  Returns:
    The parsed spam data concatenated (with ``+``) with the parsed ham data.
    Presumably each file holds a list of ``{"text": ..., "label": ...}``
    records, since that is what ``train_nb`` reads -- verify against the data.
  """
  with open(spam_path, encoding="utf-8") as spam_file:
    with open(ham_path, encoding="utf-8") as ham_file:
      spam_records, ham_records = (
          json.load(handle) for handle in (spam_file, ham_file)
      )
  return spam_records + ham_records

def train_test_split(data, test_ratio=0.2):
  """Randomly split ``data`` into a training part and a test part.

  The input is copied before shuffling, so the caller's sequence keeps its
  original order (shuffling in place would silently reorder the caller's data).

  Args:
    data: Sequence (or any iterable) of records to split.
    test_ratio: Fraction of the records that goes to the test part; must be
      between 0 and 1 inclusive.

  Returns:
    A ``(train, test)`` tuple of lists. ``train`` holds the first
    ``int(len(data) * (1 - test_ratio))`` shuffled records, ``test`` the rest.

  Raises:
    ValueError: If ``test_ratio`` is outside the range [0, 1].
  """
  if not 0 <= test_ratio <= 1:
    raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

  shuffled = list(data)
  random.shuffle(shuffled)
  cut = int(len(shuffled) * (1 - test_ratio))
  return shuffled[:cut], shuffled[cut:]

def preprocess(text):
  """Lower-case ``text`` and split it into whitespace-separated tokens.

  The characters ``-``, ``.``, ``?``, ``!`` and ``,`` are turned into spaces
  first, so they act as token separators. Every other character (including
  other punctuation) stays part of its token.

  Args:
    text: The string to tokenize.

  Returns:
    A list of lower-cased tokens; empty when ``text`` has no tokens.
  """
  separators_to_space = str.maketrans({symbol: ' ' for symbol in '-.?!,'})
  lowered = text.lower()
  return lowered.translate(separators_to_space).split()

def train_nb(train, alpha=1.0):
  """Collect the counts needed by a multinomial Naive Bayes text classifier.

  Args:
    train: Sequence of records; each record is indexed with ``"label"`` and
      ``"text"``.
    alpha: Smoothing constant stored in the model for later use by scoring.

  Returns:
    A dict with these keys:
      ``class_counts``: number of training records per label,
      ``word_counts``: per label, a dict mapping token -> occurrence count,
      ``total_words``: per label, the total number of tokens seen,
      ``vocab``: set of all tokens seen under any label,
      ``alpha``: the ``alpha`` argument,
      ``total_docs``: ``len(train)``.

  Side effects:
    Prints the complete model to stdout before returning it.
  """
  class_counts, word_counts, total_words = {}, {}, {}

  for record in train:
    label = record["label"]
    class_counts[label] = class_counts.get(label, 0) + 1
    per_class = word_counts.setdefault(label, {})
    total_words.setdefault(label, 0)

    tokens = preprocess(record["text"])
    for token in tokens:
      per_class[token] = per_class.get(token, 0) + 1
    total_words[label] += len(tokens)

  # The vocabulary is the union of the tokens seen under every label.
  vocab = set()
  for per_class in word_counts.values():
    vocab.update(per_class.keys())

  model = dict(
      class_counts=class_counts,
      word_counts=word_counts,
      total_words=total_words,
      vocab=vocab,
      alpha=alpha,
      total_docs=len(train),
  )

  print(model)
  return model

def log_prob(model, words, class_name):
    """Return the unnormalized log-probability of ``words`` under ``class_name``.

    The score is the log of the class prior (records of that class divided by
    all records) plus, for every token in ``words``, the log of its smoothed
    likelihood ``(count + alpha) / (total_words_in_class + alpha * |vocab|)``.

    Args:
        model: Model dict as produced by ``train_nb``.
        words: Iterable of tokens to score.
        class_name: Label whose statistics are used.

    Returns:
        The accumulated log-probability as a float.
    """
    prior = model["class_counts"][class_name] / model["total_docs"]
    score = math.log(prior)
    vocab_size = len(model["vocab"])
    alpha = model["alpha"]
    for token in words:
        # Tokens never seen in this class count as zero and rely on smoothing.
        occurrences = model["word_counts"][class_name].get(token, 0)
        denominator = model["total_words"][class_name] + alpha * vocab_size
        score += math.log((occurrences + alpha) / denominator)
    return score

def predict(model, text):
    """Return the label whose log-probability for ``text`` is highest.

    The text is tokenized with ``preprocess`` and scored against every label
    in ``model["class_counts"]`` with ``log_prob``. On a tie the label that
    comes first in ``class_counts`` wins, because a later label must score
    strictly higher to replace it.

    Args:
        model: Model dict as produced by ``train_nb``.
        text: Raw message text to classify.

    Returns:
        The winning label, or ``None`` when no label scores above negative
        infinity (for example when the model has no classes).
    """
    tokens = preprocess(text)
    winner = None
    top_score = float("-inf")
    for label in model["class_counts"]:
        score = log_prob(model, tokens, label)
        if not score > top_score:
            continue
        winner = label
        top_score = score
    return winner

def evaluate_model(model, test_data):
    """Measure and print the classifier's accuracy on ``test_data``.

    Args:
        model: Model dict as produced by ``train_nb``.
        test_data: Sequence of records indexed with ``"text"`` and ``"label"``.

    Returns:
        The fraction of records whose predicted label equals the stored label.
        For an empty ``test_data`` a notice is printed and ``0.0`` is returned
        instead of dividing by zero.
    """
    total = len(test_data)
    if total == 0:
        # No records means accuracy is undefined; report it rather than crash.
        print("Zbiór testowy jest pusty - nie można obliczyć skuteczności.")
        return 0.0

    correct = sum(
        1 for rec in test_data if predict(model, rec["text"]) == rec["label"]
    )
    accuracy = correct / total
    print(f"Skuteczność na zbiorze testowym: {accuracy * 100:.2f}%")
    return accuracy

In [None]:
import base64
import pickle
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

def get_email_service():
  """Build a Gmail API (v1) service object from the credentials in ``token.pkl``.

  The credentials are unpickled from ``token.pkl`` in the working directory.
  If they are expired and carry a refresh token, they are refreshed before
  the service is built. The refreshed credentials are not written back.

  Returns:
    The object returned by ``build("gmail", "v1", credentials=...)``.
  """
  # NOTE(security): unpickling runs arbitrary code embedded in the file, so
  # token.pkl must only ever come from a trusted source.
  with open("token.pkl", "rb") as token_file:
    credentials = pickle.load(token_file)

  needs_refresh = credentials and credentials.expired and credentials.refresh_token
  if needs_refresh:
    credentials.refresh(Request())

  return build("gmail", "v1", credentials=credentials)

def fetch_unread_emails_from_label(model, label_name="Test"):
  """Classify unread Gmail messages under a label and relabel them as ham or spam.

  Requests up to 100 messages that carry both the ``label_name`` label and
  ``UNREAD``. Each message's subject and body are joined, classified with
  ``predict``, and the message is then moved: the ``spam2`` label is added
  when the prediction is ``'spam'``, otherwise the ``ham`` label, and the
  ``label_name`` label is removed. ``UNREAD`` is left untouched. One line is
  printed per relabelled message.

  Args:
    model: Model dict as produced by ``train_nb``.
    label_name: Name of the Gmail label whose unread messages are processed.

  Returns:
    None.

  Raises:
    ValueError: If ``label_name``, ``ham`` or ``spam2`` does not exist in the
      mailbox. This is checked before any message is requested or modified.
  """
  service = get_email_service()

  # Resolve every label up front so a missing one fails with a clear message
  # instead of sending None as a label id to the API.
  label_test = _require_label_id(service, label_name)
  label_ham = _require_label_id(service, "ham")
  label_spam = _require_label_id(service, "spam2")

  response = service.users().messages().list(
      userId='me',
      labelIds=[label_test, 'UNREAD'],
      maxResults=100
  ).execute()

  for msg in response.get('messages', []):
    msg_id = msg['id']
    message = service.users().messages().get(userId='me', id=msg_id, format='full').execute()
    payload = message.get('payload', {})
    headers = payload.get('headers', [])

    # Header field names are case-insensitive, so accept "subject", "SUBJECT", ...
    subject = next((h['value'] for h in headers if h['name'].lower() == 'subject'), '')
    body = get_message_body(payload)
    full_text = f"{subject} {body.strip()}"

    prediction = predict(model, full_text)

    add_labels = [label_spam if prediction == 'spam' else label_ham]
    remove_labels = [label_test]

    service.users().messages().modify(
        userId='me',
        id=msg_id,
        body={
            'addLabelIds': add_labels,
            'removeLabelIds': remove_labels
        }
    ).execute()

    print(f"[ZMIANA] Wiadomość '{subject[:40]}...' → {prediction.upper()} (etykieta zmieniona)")


def _require_label_id(service, name):
  """Return the id of the Gmail label ``name`` or raise ValueError if it is absent."""
  label_id = get_label_id(service, name)
  if label_id is None:
    raise ValueError(f"Gmail label {name!r} was not found in this mailbox")
  return label_id

def get_label_id(service, label_name):
    """Look up the id of a Gmail label by name, ignoring letter case.

    Args:
        service: Gmail API service object (as returned by ``get_email_service``).
        label_name: Label name to search for; compared case-insensitively.

    Returns:
        The ``id`` of the first label whose name matches, or ``None`` when the
        mailbox has no such label.
    """
    response = service.users().labels().list(userId='me').execute()
    matching_ids = (
        entry['id']
        for entry in response.get('labels', [])
        if entry['name'].lower() == label_name.lower()
    )
    return next(matching_ids, None)


def get_message_body(payload):
    """Return the decoded text body of a Gmail message payload.

    For a payload with ``parts``, the first ``text/plain`` part that has body
    data is decoded. Direct parts are checked first; if none qualifies, nested
    ``parts`` (for example a multipart container inside another one) are
    searched the same way. For a payload without ``parts``, its own body data
    is decoded whatever its MIME type is.

    Args:
        payload: The ``payload`` dict of a Gmail message fetched with
            ``format='full'``.

    Returns:
        The body decoded from URL-safe base64 as UTF-8 (undecodable bytes are
        dropped), or the placeholder ``"(brak treści)"`` when no body is found.
    """
    parts = payload.get('parts')
    if parts:
        text = _find_plain_text(parts)
    else:
        body_data = payload.get('body', {}).get('data')
        text = _decode_body_data(body_data) if body_data else None
    return "(brak treści)" if text is None else text


def _find_plain_text(parts):
    """Return the first decoded text/plain body in ``parts`` (nested parts last), else None."""
    # Direct children take priority so flat messages resolve exactly as before.
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            data = part.get('body', {}).get('data')
            if data:
                return _decode_body_data(data)
    # Otherwise descend into containers that carry their own sub-parts.
    for part in parts:
        nested = part.get('parts')
        if nested:
            text = _find_plain_text(nested)
            if text is not None:
                return text
    return None


def _decode_body_data(data):
    """Decode URL-safe base64 body data to text, ignoring invalid UTF-8 bytes."""
    return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')



In [None]:
from pprint import pprint

def main():
  """Train the spam/ham classifier, report its accuracy, and relabel Gmail messages.

  Loads the examples from ``spam.json`` and ``ham.json``, splits them into
  training and test parts, trains the model, prints the test accuracy and
  finally classifies the unread messages under the default Gmail label.
  """
  data = load_email_data("spam.json", "ham.json")
  train, test = train_test_split(data)

  model = train_nb(train)

  evaluate_model(model, test)

  fetch_unread_emails_from_label(model)

  # NOTE: an interactive mode was sketched here earlier -- read a message with
  # input(), stop on 'q', otherwise print the upper-cased predict() result.


# Guarded so that importing this module does not start training or touch the
# mailbox; running it as a script or notebook cell still calls main().
if __name__ == "__main__":
  main()

{'class_counts': {'spam': 1447, 'ham': 1313}, 'word_counts': {'spam': {'twoje': 143, 'hasło': 18, 'do': 203, 'banku': 52, 'wygasło': 17, '–': 627, 'zresetuj': 17, 'je': 22, 'teraz': 203, 'zainwestuj': 37, '100': 78, 'zł': 146, 'i': 162, 'zarób': 36, '10': 39, '000': 37, 'w': 160, 'tydzień': 34, 'konto': 111, 'e': 72, 'mail': 39, 'zostało': 45, 'wytypowane': 37, 'nagrody': 40, 'gratulacje': 51, 'zostałeś': 111, 'wybrany': 130, 'jako': 26, 'zwycięzca': 26, 'potwierdź': 80, 'swoje': 92, 'dane': 113, 'aby': 127, 'uniknąć': 35, 'dezaktywacji': 1, 'konta': 68, 'zarejestruj': 42, 'się': 62, 'otrzymaj': 72, 'bonus': 47, 'zamów': 4, 'darmową': 37, 'próbkę': 2, 'zobacz': 5, 'efekty': 1, 'na': 193, 'własne': 1, 'oczy': 1, 'kup': 25, 'kupon': 12, 'bez': 47, 'vat': 24, 'oferta': 96, 'ważna': 24, 'tylko': 121, 'dziś': 37, 'rejestracja': 24, 'webinar': 23, 'o': 51, 'rozwijaniu': 10, 'firmy': 10, 'miejsca': 23, 'ograniczone': 22, 'zamówienie': 3, 'nie': 111, 'dostarczone': 1, 'sprawdź': 99, 'dlaczego'