In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by the cells below. Per the recorded
# output, packages that are already present are reported as up-to-date.
nltk.download('punkt')      # tokenizer data — presumably what word_tokenize relies on
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')    # WordNet data — presumably what WordNetLemmatizer relies on

[nltk_data] Downloading package punkt to /Users/linghuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linghuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/linghuang/nltk_data...


True

### Text Cleaning

In [3]:
def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized content words.

    The pipeline is: lowercase -> ``word_tokenize`` -> keep purely alphabetic
    tokens -> drop English stop words -> lemmatize with ``WordNetLemmatizer``
    -> join with single spaces.

    Args:
        text: Raw input string (for example, the body of an email).

    Returns:
        str: The surviving lemmas joined by single spaces. The result is an
        empty string when no token survives the filters.
    """
    # Build the stop-word lookup once per call. Evaluating
    # stopwords.words('english') inside the comprehension re-fetched the list
    # for every token and tested membership against it linearly.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    # isalpha() discards punctuation, numbers and mixed tokens such as "n't".
    content_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return ' '.join(lemmatizer.lemmatize(t) for t in content_tokens)

# Quick sanity check of preprocess_text on a one-sentence sample.
email_text = "This is an example"
cleaned_text = preprocess_text(email_text)
# In the recorded run this printed only "example"; the other words were removed.
print(cleaned_text)


example


In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import spacy

# Load the small English spaCy pipeline; the model package must already be
# installed (see the download cell above).
nlp = spacy.load("en_core_web_sm")

In [6]:
# Tokenize the lower-cased sentence with spaCy and print each token's text.
text = "Mary, don’t slap the green witch"
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [8]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.5.15-cp310-cp310-macosx_11_0_arm64.whl.metadata (40 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.5.15-cp310-cp310-macosx_11_0_arm64.whl (278 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: regex, joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.8.1 regex-2024.5.15


In [9]:
from nltk.tokenize import TreebankWordTokenizer

# Same sentence as the spaCy cell, this time without lower-casing.
text = "Mary, don’t slap the green witch"
tokenizer = TreebankWordTokenizer()
# In the recorded output 'don’t' stays a single token here, whereas the spaCy
# cell split it into 'do' / 'n’t' (possibly due to the curly apostrophe — TODO confirm).
print(tokenizer.tokenize(text))

['Mary', ',', 'don’t', 'slap', 'the', 'green', 'witch']


In [10]:
from nltk.tokenize import TweetTokenizer

# Sample tweet containing a hashtag, a mention and an emoticon run together.
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
# Lower-case the text first, then let the tweet-aware tokenizer split it.
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


### Feature Extraction

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-document toy corpus used to demonstrate TF-IDF weighting.
corpus = ["This is the first email.", "This is the second email."]
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the corpus and transform the corpus in one step.
X = vectorizer.fit_transform(corpus)
# Convert to a dense array for display; the recorded output has one row per
# document and six columns.
print(X.toarray())

[[0.4090901  0.57496187 0.4090901  0.         0.4090901  0.4090901 ]
 [0.4090901  0.         0.4090901  0.57496187 0.4090901  0.4090901 ]]


### Model Selection

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Assumes X (feature matrix) and y (one label per row of X) already exist.
# NOTE(review): `y` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel until labels are supplied. The only
# X defined above is the two-document TF-IDF demo matrix.
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Print the classification report comparing held-out labels with predictions.
print(classification_report(y_test, y_pred))

### Model Deployment

In [None]:
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

# Load the trained model once, when this cell/module runs, so that every
# request reuses the same object.
# The file name previously read 'mocel.pkl', which looks like a typo for
# 'model.pkl'; adjust the path if the model was saved under another name.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load files you created yourself or otherwise fully trust.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict: classify the text sent in the JSON request body.

    Expects a JSON object with a ``'text'`` field. The value is passed to
    ``model.predict`` as a one-element batch, and the first prediction is
    returned as a boolean under the ``'predictions'`` key.

    Returns:
        A JSON response ``{'predictions': <bool>}`` on success, or a JSON
        error message with HTTP status 400 when the body is not a JSON object
        or lacks the ``'text'`` field.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON
    # body, so bad requests get one consistent 400 response below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'text' not in payload:
        return jsonify({'error': "Request body must be a JSON object with a 'text' field"}), 400

    features = payload['text']
    # Preprocess and extract features here
    # NOTE(review): the raw text is passed straight to the model. This only
    # works if the pickled object does its own vectorization (e.g. a
    # pipeline) — confirm against how the model was saved.
    prediction = model.predict([features])
    return jsonify({'predictions': bool(prediction[0])})

if __name__ == '__main__':
    # Keep debug mode off. debug=True exposes the interactive Werkzeug
    # debugger, which allows arbitrary code execution, and starts the
    # auto-reloader, which does not work from inside a notebook kernel.
    app.run(debug=False)