In [7]:
#INSTALLING AND IMPORTING REQUIRED PACKAGES
!pip install scikit-learn pandas

import pandas as pd
import numpy as np
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report



In [1]:
#PREPROCESSING
def preprocess_text(text):
    """Normalize raw text before it is handed to the TF-IDF vectorizer.

    Steps, in order: lowercase, drop ``http...`` URLs, drop digit runs,
    strip ASCII punctuation, collapse whitespace and trim the ends.

    Args:
        text: The raw message. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as "no text".

    Returns:
        The cleaned string; ``""`` when ``text`` is ``None`` or NaN.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the vocabulary. NaN is the only float
    # that is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)   # remove URLs
    text = re.sub(r"\d+", "", text)       # remove digit runs
    # Delete (not space-replace) every character in string.punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [2]:
#Load & Prepare Spam Dataset
def load_spam_dataset(path):
    """Read the spam CSV at ``path`` and return a cleaned two-column frame.

    The file is decoded as latin-1 and must contain the columns ``v1`` and
    ``v2``; they are exposed as ``label`` and ``text`` respectively, with
    ``text`` passed through ``preprocess_text``.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with columns ``label`` and ``text`` (in that order).
    """
    raw = pd.read_csv(path, encoding='latin-1')
    labels = raw['v1']
    cleaned_text = raw['v2'].apply(preprocess_text)
    return pd.DataFrame({'label': labels, 'text': cleaned_text})

In [3]:
#Create Priority Dataset
def load_priority_dataset():
    """Build the hand-written priority dataset.

    Returns:
        A six-row DataFrame with columns ``text`` (already run through
        ``preprocess_text``) and ``priority`` ('High', 'Medium' or 'Low').
    """
    # One (message, priority) pair per row keeps each label next to its text.
    samples = [
        ("Please submit your project report today.", 'High'),
        ("Reminder: team lunch at 1PM", 'Low'),
        ("Urgent: server is down. Call IT immediately", 'High'),
        ("Update your timesheet before Friday", 'Medium'),
        ("Fire drill will be conducted tomorrow", 'Medium'),
        ("Company retreat is next weekend", 'Low'),
    ]
    df = pd.DataFrame(samples, columns=['text', 'priority'])
    df['text'] = df['text'].apply(preprocess_text)
    return df

In [4]:
# TF-IDF + Train Function
def extract_features(train_texts, test_texts):
    """Turn two text collections into TF-IDF feature matrices.

    The vectorizer (configured with ``stop_words='english'``) is fitted on
    ``train_texts`` only; ``test_texts`` is transformed with that fitted
    vocabulary.

    Args:
        train_texts: Texts used to fit the vectorizer.
        test_texts: Texts transformed with the already-fitted vectorizer.

    Returns:
        A tuple ``(X_train, X_test, tfidf)``: the two feature matrices and
        the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    train_matrix = vectorizer.fit_transform(train_texts)
    test_matrix = vectorizer.transform(test_texts)
    return train_matrix, test_matrix, vectorizer

def train_model(X_train, y_train):
    """Fit a fresh ``MultinomialNB`` classifier and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train``.

    Returns:
        The fitted classifier.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    return classifier

In [5]:
#Prediction Function
def predict_email(model, tfidf, email_text, label_encoder=None):
    """Classify a single email and report how confident the model is.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        tfidf: Fitted vectorizer exposing ``transform``.
        email_text: Raw email text; it is cleaned with ``preprocess_text``
            before vectorizing.
        label_encoder: Optional encoder exposing ``inverse_transform``. When
            given, the predicted class is mapped back through it.

    Returns:
        A tuple ``(pred_class, confidence)`` where ``confidence`` is the
        highest class probability returned by ``predict_proba``.
    """
    cleaned = preprocess_text(email_text)
    features = tfidf.transform([cleaned])
    probs = model.predict_proba(features)[0]  # probabilities for the single row
    pred_idx = np.argmax(probs)
    pred_class = model.classes_[pred_idx]
    confidence = probs[pred_idx]
    # Compare against None explicitly: the question is "was an encoder
    # supplied", not "is the encoder object truthy".
    if label_encoder is not None:
        pred_class = label_encoder.inverse_transform([pred_class])[0]
    return pred_class, confidence

In [13]:
#Train Both Models
# 1. Spam/Ham Classifier
# Load the spam data, hold out 20% for evaluation, vectorize, then fit.
spam_df = load_spam_dataset("/content/spam.csv")
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    spam_df['text'],
    spam_df['label'],
    test_size=0.2,
    random_state=42,
)
X_train_s_vec, X_test_s_vec, tfidf_spam = extract_features(X_train_s, X_test_s)
model_spam = train_model(X_train_s_vec, y_train_s)

# 2. Priority Classifier
# Priority labels are integer-encoded first; le_priority is kept so that
# predictions can be mapped back to their original names later.
priority_df = load_priority_dataset()
le_priority = LabelEncoder()
le_priority.fit(priority_df['priority'])
priority_df['priority_encoded'] = le_priority.transform(priority_df['priority'])
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    priority_df['text'],
    priority_df['priority_encoded'],
    test_size=0.2,
    random_state=42,
)
X_train_p_vec, X_test_p_vec, tfidf_priority = extract_features(X_train_p, X_test_p)
model_priority = train_model(X_train_p_vec, y_train_p)

In [14]:
#Evaluate Models

# Predict on both held-out sets first.
y_pred_s = model_spam.predict(X_test_s_vec)
y_pred_p = model_priority.predict(X_test_p_vec)

# The priority model works on encoded integers; map both the ground truth
# and the predictions back to their label names for a readable report.
y_true_p = le_priority.inverse_transform(y_test_p)
y_pred_p_labels = le_priority.inverse_transform(y_pred_p)

# Spam report first, then the priority report.
for title, y_true, y_pred in (
    ("Spam/Ham Report:\n", y_test_s, y_pred_s),
    ("Priority Level Report:\n", y_true_p, y_pred_p_labels),
):
    print(title)
    print(classification_report(y_true, y_pred))

Spam/Ham Report:

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Priority Level Report:

              precision    recall  f1-score   support

        High       0.00      0.00      0.00       1.0
         Low       0.00      0.00      0.00       1.0
      Medium       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Sample message run through both classifiers.
email = "Hi team, please update me on the server issue today. It's very urgent."

# Spam/Ham Prediction (classes are already strings, so no encoder is passed)
spam_result, spam_conf = predict_email(
    model=model_spam,
    tfidf=tfidf_spam,
    email_text=email,
)
print(f"\n[Spam/Ham] → {spam_result.upper()} (Confidence: {spam_conf:.2f})")

# Priority Prediction (encoder maps the integer class back to its name)
priority_result, priority_conf = predict_email(
    model=model_priority,
    tfidf=tfidf_priority,
    email_text=email,
    label_encoder=le_priority,
)
print(f"[Priority] → {priority_result} (Confidence: {priority_conf:.2f})")


[Spam/Ham] → HAM (Confidence: 0.76)
[Priority] → Medium (Confidence: 0.44)
