In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used later in this notebook (a no-op when the
# package is already up to date locally):
#   punkt     - tokenizer models used by word_tokenize
#   stopwords - word lists read via stopwords.words('english')
#   wordnet   - lexical database used by WordNetLemmatizer
# NOTE(review): newer NLTK releases may also need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/linghuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linghuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/linghuang/nltk_data...


True

### Text Cleaning

In [3]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase the text, tokenize it with ``word_tokenize``, keep only
    purely alphabetic tokens, drop English stopwords, and lemmatize the rest
    with ``WordNetLemmatizer``.

    Args:
        text: The input string to clean.

    Returns:
        A single string containing the surviving lemmas joined by spaces
        (an empty string when no token survives the filters).
    """
    # Load the stopword list once and store it as a set. The original
    # re-read the corpus list for every token, which made the filter
    # quadratic-ish and dominated the run time on longer texts.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in english_stopwords
    )

# Quick sanity check of preprocess_text on a one-line sample.
email_text = "This is an example"
cleaned_text = preprocess_text(email_text)
print(cleaned_text)


example


### Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-document toy corpus used to illustrate TF-IDF vectorization.
corpus = ["This is the first email.", "This is the second email."]
vectorizer = TfidfVectorizer()
# fit_transform fits the vectorizer on the corpus and returns the feature
# matrix in one call (one row per document).
X = vectorizer.fit_transform(corpus)
# Convert to a dense array so the values can be printed directly.
print(X.toarray())

[[0.4090901  0.57496187 0.4090901  0.         0.4090901  0.4090901 ]
 [0.4090901  0.         0.4090901  0.57496187 0.4090901  0.4090901 ]]


### Model Selection

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Assume X and y are your features and labels
# NOTE(review): `y` is not defined in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel until labels are supplied.
# Hold out 20% of the samples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Print the per-class report for the held-out predictions.
print(classification_report(y_test, y_pred))

### Model Deployment

In [None]:
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

# Load the trained model
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load 'spam_classifier.pkl' if it comes from a trusted source.
# NOTE(review): this notebook never writes 'spam_classifier.pkl'; the file is
# presumably produced elsewhere — confirm what object it contains.
with open('spam_classifier.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the email text sent in a JSON POST body.

    Expects a JSON object with a string field ``'text'``.

    Returns:
        A JSON response ``{'spam': <bool>}`` on success, or a JSON response
        ``{'error': <message>}`` with HTTP status 400 when the body is not
        valid JSON or does not contain a string ``'text'`` field.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad requests get a clear 400 rather than a server error.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "Request body must be JSON with a string 'text' field"}), 400

    email_text = payload['text']
    # Preprocess and extract features here
    # NOTE(review): the raw string is passed straight to the model. That only
    # works if the pickled object vectorizes text itself (e.g. a pipeline);
    # a bare classifier trained on TF-IDF features would fail — confirm.
    prediction = model.predict([email_text])
    # NOTE(review): bool() assumes numeric 0/1 labels; any non-empty string
    # label would always map to True — confirm the label encoding.
    return jsonify({'spam': bool(prediction[0])})

if __name__ == '__main__':
    # Debug mode is kept off here: it enables Werkzeug's interactive debugger,
    # which allows arbitrary code execution from the browser, and it starts
    # the auto-reloader. Turn it on only for local development.
    app.run(debug=False)