In [2]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Read the spam CSV (decoded as latin1) into a DataFrame and preview its first rows
file_path = "/content/spam.csv"  # Update the path to your file
df = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:

# Encode the string labels as integers. LabelEncoder orders the classes it
# sees, so with the two labels in this dataset: ham -> 0, spam -> 1.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['v1'] = le.fit_transform(df['v1'])

# Load the spaCy English pipeline used by preprocess_text. Only lemmas and the
# lexical stop-word / punctuation flags are read from each token, so the
# dependency parser and the named-entity recognizer are switched off: neither
# is needed for lemmatization, and skipping them makes every nlp(text) call
# cheaper when it is applied to the whole corpus.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



In [7]:
# Function for text preprocessing
def preprocess_text(text):
    """Reduce a message to a space-separated string of lemmas.

    Stop words, punctuation and whitespace-only tokens are dropped; every
    remaining token is replaced by its lemma.

    Args:
        text: The raw message. Missing values (None or float NaN, which is
            what pandas yields for empty CSV cells) are treated as an empty
            message; any other non-string value is converted with str().

    Returns:
        The cleaned message as a single string ('' when nothing is left).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if not text:
        # Nothing to tokenize; skip the pipeline call entirely.
        return ''

    doc = nlp(text)
    kept_lemmas = (
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )
    return ' '.join(kept_lemmas)


In [6]:
# Run every message in the text column through preprocess_text.
# NOTE: this overwrites the raw text in place, so re-running the cell feeds
# already-lemmatized text back through the pipeline.
df['v2'] = df['v2'].map(preprocess_text)


In [8]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['v2'])
y = df['v1']


In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Train the model
# A fixed random_state makes the forest reproducible across kernel restarts;
# 42 matches the seed already used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [14]:

# Show the evaluation metrics computed for the held-out split
print(f'Classification Report:\n{report}')
print(f'Accuracy: {accuracy}')

# Persist the fitted classifier and vectorizer; the relative file names mean
# both files are written to the current working directory.
joblib.dump(value=model, filename='spam_classifier_model.joblib')
joblib.dump(value=tfidf, filename='tfidf_vectorizer.joblib')

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.82      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115

Accuracy: 0.9757847533632287


['tfidf_vectorizer.joblib']

In [12]:
# Function to predict user input
def predict_message(message, model=None, tfidf=None):
    """Classify a single message as 'spam' or 'ham'.

    Args:
        message: Raw message text; it is cleaned with preprocess_text first.
        model: Optional fitted classifier. When omitted, it is loaded from
            'spam_classifier_model.joblib'.
        tfidf: Optional fitted vectorizer. When omitted, it is loaded from
            'tfidf_vectorizer.joblib'.

    Returns:
        'spam' when the classifier predicts label 1, otherwise 'ham'.
    """
    # Passing already-loaded objects avoids re-reading both files on every
    # call. joblib files are pickle-based, so only load artifacts that come
    # from a trusted source.
    if model is None:
        model = joblib.load('spam_classifier_model.joblib')
    if tfidf is None:
        tfidf = joblib.load('tfidf_vectorizer.joblib')

    # Apply the same cleaning that was used on the training text
    cleaned = preprocess_text(message)

    # The vectorizer expects an iterable of documents, hence the one-item list
    features = tfidf.transform([cleaned])

    predicted_label = model.predict(features)[0]

    # Map the numeric label back to its name (1 is spam, anything else ham)
    return 'spam' if predicted_label == 1 else 'ham'


In [13]:

# Classify one sample message end to end and show the resulting label
user_input = "Hello sree... will catch up at 9 pm."
prediction = predict_message(message=user_input)
print(f"The message is: {prediction}")

The message is: ham
