In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the labelled SMS data set; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Optional sanity check: show the first five rows so the column names and
# label encoding can be eyeballed before modelling.
print(df.head(n=5))

                                                 sms  label
0  Go until jurong point, crazy.. Available only ...      0
1                    Ok lar... Joking wif u oni...\n      0
2  Free entry in 2 a wkly comp to win FA Cup fina...      1
3  U dun say so early hor... U c already then say...      0
4  Nah I don't think he goes to usf, he lives aro...      0


In [6]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the reported metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [7]:
# Represent each message as a TF-IDF vector. English stop words are removed and
# terms appearing in more than 85% of the training messages are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, stop_words='english')

# Fit on the training split only, then reuse the fitted vectorizer for the test
# split so both share the same vocabulary and weights.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [8]:
# Create a multinomial Naive Bayes classifier with default settings and fit it
# on the TF-IDF training features.
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)


In [9]:
# Predict a label for every held-out message
y_pred = model.predict(X=X_test_tfidf)

In [10]:
# Score the held-out predictions: a per-class precision/recall/F1 report plus
# the overall accuracy.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)


In [11]:
# Show accuracy as a percentage, then a blank line, a heading and the full
# per-class report (one print call; each piece lands on its own line).
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    "\nClassification Report:",
    report,
    sep="\n",
)


Accuracy: 97.40%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       954
           1       1.00      0.82      0.90       161

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [12]:
# Sample SMS to classify. It is wrapped in a one-element list because the
# vectorizer expects an iterable of documents. The text is kept verbatim.
new_message = [
    "WINNER!! As a valued network customer you have been selected to "
    "receivea £900 prize reward! To claim call 09061701461. "
    "Claim code KL341. Valid 12 hours only."
]

In [13]:

# Vectorise the new message with the already-fitted TF-IDF vectorizer (transform
# only, no refit) so its features line up with what the model was trained on.
new_message_tfidf = tfidf_vectorizer.transform(raw_documents=new_message)

In [14]:
# Ask the trained classifier for the label of the vectorised message
prediction = model.predict(X=new_message_tfidf)

In [15]:
# Report the verdict for the single message: label 1 means spam, anything else
# is reported as non-spam.
print(
    "The message is classified as spam."
    if prediction[0] == 1
    else "The message is classified as non-spam."
)

The message is classified as spam.


In [16]:
# Second sample SMS to classify, again as a one-element list of documents.
# The text is kept verbatim.
new_message = [
    "Hello! How's you and how did saturday go? "
    "I was just texting to see if you'd decided to do anything tomo. "
    "Not that i'm trying to invite myself or anything!"
]


In [17]:
# Vectorise the message with the fitted TF-IDF vectorizer (no refit), then let
# the trained classifier label it.
new_message_tfidf = tfidf_vectorizer.transform(raw_documents=new_message)
prediction = model.predict(X=new_message_tfidf)

In [18]:
# Report the verdict for the single message: any label other than 1 is
# reported as non-spam.
if prediction[0] != 1:
    print("The message is classified as non-spam.")
else:
    print("The message is classified as spam.")

The message is classified as non-spam.
