LOADING THE DATA FROM SPAM.CSV

In [12]:
import pandas as pd

# Loading the dataset
data_path = 'spam.csv'
df = pd.read_csv(data_path, encoding='latin-1')

In [13]:
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [14]:
# Dropping  unnecessary columns and renaming the columns for clarity
df = df.iloc[:, :2]
df.columns = ['Label', 'Message']

In [15]:
print(df.head())

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


PREPROCESSING THE DATA

In [16]:
import string
import nltk
nltk.download('stopwords')

def preprocess_text(text):
    # Removing punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Converting to lowercase
    text = text.lower()
    # Removing stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Applying the preprocessing to the SMS messages
df['Message'] = df['Message'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CONVERTING TEXT TO VECTORS

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing the TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer()

# Fitting and transforming the data
X_tfidf = tfidf_vect.fit_transform(df['Message'])

# Displaying the shape of the feature matrix
print(X_tfidf.shape)


(5572, 9376)


In [18]:
from sklearn.model_selection import train_test_split

# Preparing the labels
y = df['Label']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


TRAINING DIFFERENT CLASSIFIERS ON THE DATASET

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Training Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)

# Training Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Training SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Evaluate the models
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr))


Naive Bayes Accuracy: 0.9659192825112107
Logistic Regression Accuracy: 0.9426008968609866
SVM Accuracy: 0.967713004484305

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         ham       0.94      1.00      0.97       965
        spam       0.96      0.60      0.74       150

    accuracy                           0.94      1115
   macro avg       0.95      0.80      0.85      1115
weighted avg       0.94      0.94      0.94      1115



SAVING THE MODEL

In [21]:
#As SVM model gave the Highest Accuracy So using that classifier for our model
import joblib
svm_model_path = 'svm_spam_classifier.pkl'
joblib.dump(svm_model, svm_model_path)
svm_model_path

'svm_spam_classifier.pkl'

In [22]:
import joblib

joblib.dump({'model': svm_model, 'vectorizer': tfidf_vect}, 'model_and_vectorizer.pkl')


['model_and_vectorizer.pkl']