In [2]:
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pickle
import os

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Function to load or train the model
def load_or_train_model(dataset_path='C:\\Users\\DELL\\OneDrive\\Desktop\\aiml\\spamhamdata.csv'):
    model_file = 'spam_classifier_model.pkl'
    vectorizer_file = 'tfidf_vectorizer.pkl'

    # Check if model and vectorizer exist
    if os.path.exists(model_file) and os.path.exists(vectorizer_file):
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
        with open(vectorizer_file, 'rb') as f:
            vectorizer = pickle.load(f)
        return model, vectorizer

    # Load dataset
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        st.error(f"Dataset file '{dataset_path}' not found. Please upload the dataset.")
        return None, None

    # Assume dataset has 'message' and 'label' columns
    # Adjust column names if your dataset uses different ones
    if 'message' not in df.columns or 'label' not in df.columns:
        st.error("Dataset must contain 'message' and 'label' columns.")
        return None, None

    # Preprocess the text data
    df['processed_text'] = df['message'].apply(preprocess_text)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_text'], df['label'], test_size=0.2, random_state=42
    )

    # Vectorize the text using TF-IDF
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Naive Bayes classifier
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)

    # Evaluate model
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model Accuracy on Test Set: {accuracy:.2f}")

    # Save model and vectorizer
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    with open(vectorizer_file, 'wb') as f:
        pickle.dump(vectorizer, f)

    return model, vectorizer

# Streamlit app
st.title("Spam/Ham Classifier")
st.write("Enter a message to classify it as Spam or Ham")

# Allow user to upload dataset (optional for Streamlit Cloud)
uploaded_file = st.file_uploader("Upload your dataset (CSV)", type=["csv"])
dataset_path = 'spam_dataset.csv'

if uploaded_file is not None:
    # Save uploaded file to disk
    with open(dataset_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())
    st.success("Dataset uploaded successfully!")

# Load or train model
model, vectorizer = load_or_train_model(dataset_path)

# Input text from user
user_input = st.text_area("Enter your message:", height=100)

if st.button("Classify"):
    if user_input:
        if model is None or vectorizer is None:
            st.error("Model could not be loaded or trained. Check dataset.")
        else:
            # Preprocess user input
            processed_input = preprocess_text(user_input)
            # Vectorize input
            input_vec = vectorizer.transform([processed_input])
            # Predict
            prediction = model.predict(input_vec)[0]
            # Display result
            st.write(f"Prediction: **{prediction.upper()}**")
    else:
        st.error("Please enter a message to classify.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-05-31 16:12:08.133 
  command:

    streamlit run C:\Users\DELL\AppData\Roaming\Python\Python313\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-31 16:12:08.203 Session state does not function when running a script without `streamlit run`
