In [9]:
# ml_spam_detection.ipynb

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the SpamAssassin-derived dataset, already preprocessed into a CSV with
# two columns: `text` (message content) and `label` (1 = spam, 0 = ham).
df = pd.read_csv('spam_dataset.csv')

# pandas parses empty CSV fields as NaN, and TfidfVectorizer rejects NaN
# documents with a ValueError. Drop incomplete rows up front (leaving the raw
# `df` untouched) and report it, rather than failing later inside fit_transform.
df_clean = df.dropna(subset=['text', 'label'])
n_dropped = len(df) - len(df_clean)
if n_dropped:
    print(f'Dropped {n_dropped} rows with missing text/label')

X = df_clean['text']
y = df_clean['label']  # 1=spam, 0=ham

# Train/test split: hold out 20% of the rows; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features. The vocabulary (capped at 3000 terms, English stop words
# removed) is learned from the training split only and then reused as-is on
# the test split, so no information from the test set leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes classifier trained on the TF-IDF matrix.
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Evaluation on the held-out split: confusion matrix (rows = true label,
# columns = predicted label) followed by per-class precision/recall/F1.
y_pred = model.predict(X_test_vec)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Prediction example: classify a single raw e-mail (headers and body).
# The message is wrapped in a one-element list because the vectorizer expects
# an iterable of documents, not a bare string.
text = ['''
From approvals@mindspring.com  Mon Aug 26 21:57:45 2002
Return-Path: <approvals@mindspring.com>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 7154E43F99
	for <zzzz@localhost>; Mon, 26 Aug 2002 16:57:45 -0400 (EDT)
Received: from mail.webnote.net [193.120.211.219]
	by localhost with POP3 (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Mon, 26 Aug 2002 21:57:45 +0100 (IST)
Received: from bmbgglawweb.bmbgglaw ([202.164.173.138])
	by webnote.net (8.9.3/8.9.3) with ESMTP id VAA27944
	for <zzzz@spamassassin.taint.org>; Mon, 26 Aug 2002 21:51:58 +0100
From: approvals@mindspring.com
Received: from smtp-gw-4.msn.com (200.173.221.83 [200.173.221.83]) by bmbgglawweb.bmbgglaw with SMTP (Microsoft Exchange Internet Mail Service Version 5.5.2653.13)
	id RMT9XDW1; Fri, 23 Aug 2002 10:55:23 +0800
Message-ID: <00003ed74393$00003e20$00002f2f@mx02.earthlink.net>
To: <Valued.Recipient@webnote.net>
Subject: MSNBC: Rates Hit 35 year Low 4.75% ...12304
Date: Thu, 22 Aug 2002 20:49:30 -1800
MIME-Version: 1.0
Content-Type: text/plain;
	charset=""Windows-1252""
Content-Transfer-Encoding: 7bit

===================================================================

Now you can have HUNDREDS of lenders compete for your loan!

FACT: Interest Rates are at their lowest point in 40 years!

You're eligible even with less than perfect credit !!

	* Refinancing
	* New Home Loans
	* Debt Consolidation
	* Debt Consultation
	* Auto Loans
	* Credit Cards
	* Student Loans
	* Second Mortgage
	* Home Equity

This Service is 100% FREE without any obligation.

Visit Our Web Site at:  http://marketing-fashion.com/user0201/index.asp?Afft=QM3

====================================================================

To Unsubscribe: http://marketing-fashion.com/light/watch.asp''']

# Reuse the already-fitted vectorizer (transform only, no re-fitting) so the
# sample is projected onto the same vocabulary the model was trained on.
text_vec = vectorizer.transform(text)

# The printed array holds one predicted label per input document.
# Label convention: 1 = spam, 0 = ham.
sample_pred = model.predict(text_vec)
print(sample_pred)

[[1307   82]
 [  36  446]]
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      1389
           1       0.84      0.93      0.88       482

    accuracy                           0.94      1871
   macro avg       0.91      0.93      0.92      1871
weighted avg       0.94      0.94      0.94      1871

[1]
