In [30]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [31]:
import pandas as pd

#url = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
sms_data =pd.read_csv("spam.csv", encoding='ISO-8859-1')

# Display the first few rows
print(sms_data.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [32]:
sms_data=sms_data.iloc[:,[0,1]] # drop the last two na columns which are not useful for our task too
sms_data.head(10)
sms_data.columns = ['label', 'message'] # rename columns 
sms_data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

# Convert labels to binary
sms_data['label'] = sms_data['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing function
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    text = ' '.join(filtered_words)
    return text

# Apply preprocessing to the 'message' column
sms_data['message'] = sms_data['message'].apply(preprocess_text)

# Display the processed data
print(sms_data.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhala\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   label                                            message
0      0  go jurong point crazy available bugis n great ...
1      0                            ok lar joking wif u oni
2      1  free entry 2 wkly comp win fa cup final tkts 2...
3      0                u dun say early hor u c already say
4      0        nah dont think goes usf lives around though


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=3000)  # Adjust max_features as needed

# Fit TF-IDF vectorizer and transform the 'message' column
tfidf_matrix = tfidf_vectorizer.fit_transform(sms_data['message'])



In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, sms_data['label'], test_size=0.2, random_state=42)

# Initialize Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)

# Train the classifier
classifier.fit(X_train, y_train)

# Predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.95

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.66      0.78       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



In [36]:
# Example usage:
new_messages = ["Free Prizes, Gift Cards or Coupons."]
new_messages_tfidf = tfidf_vectorizer.transform(new_messages)
predicted_labels = classifier.predict(new_messages_tfidf)
print(predicted_labels)  # [1](spam) [0](not spam)


[1]


In [37]:
# Example usage:
new_messages = ["Hi my name is Jhalak"]
new_messages_tfidf = tfidf_vectorizer.transform(new_messages)
predicted_labels = classifier.predict(new_messages_tfidf)
print(predicted_labels)  # [1](spam) [0](not spam)

[0]
