# NLP: Text SPAM Classifier

# Import Dataset

In [None]:
# import libraries
import pandas as pd  # handles the csv
from sklearn.base import clone  # makes unfitted copies of estimators
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # import logisticregression model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # handles models output metric report
from sklearn.model_selection import train_test_split

In [3]:
# Read the SMS dataset from the CSV file into a DataFrame,
# then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='sms_data.csv')
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# The preview above shows three unneeded "Unnamed" columns (empty in the
# rows displayed), so keep only the label and the message text.
#
# Rename first and then select by the NEW names: rename() ignores keys that
# are not present, so re-running this cell is harmless instead of raising a
# KeyError on the already-removed 'v1'/'v2' columns.
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Check how many messages fall under each class label
label_counts = df['label'].value_counts()
label_counts

label
ham     4825
spam     747
Name: count, dtype: int64

In [11]:
# There are only two class labels, 'ham' and 'spam'. The models need numeric
# targets, so encode ham -> 0 and spam -> 1.
label_ids = {'ham': 0, 'spam': 1}

# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn every label into NaN (which a plain .map(dict) does on
# its second run, because 0/1 are not keys of the mapping).
encoded = df['label'].map(lambda value: label_ids.get(value, value))

# Fail loudly on anything that is neither a known label nor an encoded id,
# instead of silently carrying NaN/unknown targets into training.
unexpected = encoded[~encoded.isin(list(label_ids.values()))]
if not unexpected.empty:
    raise ValueError(f"Unexpected label values: {unexpected.unique().tolist()}")

df['label'] = encoded.astype('int64')
df['label'].value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [29]:
# Split the data 8:2 -> 80% for training, 20% for testing.
# - test_size=0.2  : fraction of the rows held out for testing
# - stratify       : sample so that both subsets keep the class proportions of df['label']
# - random_state=42: fixed seed for the shuffle, so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],   # features: the raw message text
    df['label'],  # target: the encoded class label
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)


In [30]:
# Create the two vectorizers that are compared in this notebook.
# Both use scikit-learn's prebuilt English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')

# max_df=0.7: terms appearing in more than 70% of the documents are left out of the vocabulary
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

Things to consider:
- CountVectorizer vs Tf-Idf Vectorizer
---------------------------------------
1- CountVectorizer generates vectors where each element represents the raw count of a word in a document.

2- TfidfVectorizer calculates a TF-IDF score for each word, considering how often it appears in a document (term frequency, TF) and how rare it is across all documents (inverse document frequency, IDF).

3- CountVectorizer is good for simple counting tasks and basic analysis.

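4- TF-IDF is often considered better for text classification tasks, but this should be checked empirically: in the results below, the CountVectorizer features give slightly higher accuracy on this dataset.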


In [None]:
# Convert the raw text into document-term matrices: one with raw counts and
# one with TF-IDF weights. Each vectorizer is fitted on the training text
# only and then applied, unchanged, to the test text.
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

x_train_count = count_vectorizer.fit_transform(x_train)
x_test_count = count_vectorizer.transform(x_test)

Why are the train and test sets transformed with different methods?

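- fit_transform: the vectorizer learns its vocabulary (and, for TF-IDF, the IDF weights) from the text and then converts that text into vectors
- transform: only converts text into vectors, using what the vectorizer has already learned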

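So fit_transform is used on the training set, while transform is used on the test set. This way the test set never influences the learned vocabulary or weights, and it is encoded with exactly the same features the model was trained on.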

In [35]:
# Logistic regression on the CountVectorizer features.
# fit() returns the estimator itself, so training is chained onto construction.
logres_countv = LogisticRegression().fit(x_train_count, y_train)

# Predict the labels of the held-out test messages
y_pred_countv = logres_countv.predict(x_test_count)

# Report: overall accuracy, per-class precision/recall/F1, confusion matrix
print("CountVectorizer Accuracy: ", accuracy_score(y_test, y_pred_countv))
print(classification_report(y_test, y_pred_countv))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred_countv))

CountVectorizer Accuracy:  0.9766816143497757
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix: 
 [[966   0]
 [ 26 123]]


In [39]:
# Logistic regression on the TF-IDF features.
# fit() returns the estimator itself, so training is chained onto construction.
logres_tfidf = LogisticRegression().fit(x_train_tfidf, y_train)

# Predict the labels of the held-out test messages
y_pred_tfidf = logres_tfidf.predict(x_test_tfidf)

# Report: overall accuracy, per-class precision/recall/F1, confusion matrix
print("TF-IDF Vectorizer Accuracy: ", accuracy_score(y_test, y_pred_tfidf))
print(classification_report(y_test, y_pred_tfidf))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred_tfidf))


TF-IDF Vectorizer Accuracy:  0.967713004484305
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix: 
[[966   0]
 [ 36 113]]


In [40]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

Models trained on vectors from CountVectorizer:

In [43]:
# Candidate models to compare: logistic regression, naive Bayes, random forest, SVM.
# A fresh LogisticRegression() is used here instead of the already-fitted
# `logres_countv`: every estimator stored in this dict is refitted by code
# that iterates over `models`, and sharing the instance would silently
# overwrite the model trained in the earlier cell.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
}

print(models)

print("Results with CountVectorizer converter: ")
print("-"*70)
for name, model in models.items():
    # train each model on the CountVectorizer training matrix
    model.fit(x_train_count, y_train)
    # predict the labels of the CountVectorizer test matrix
    y_pred = model.predict(x_test_count)
    # print the result for each model
    print(f"\n{name}: ")
    print(f"Accuracy: {accuracy_score(y_test,y_pred):.4f}")
    print("Classification Report: ")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix: ")
    print(confusion_matrix(y_test, y_pred))
    print("-"*70)

    

{'Logistic Regression': LogisticRegression(), 'Naive Bayes': MultinomialNB(), 'Random Forest': RandomForestClassifier(random_state=42), 'SVM': SVC(random_state=42)}
Results with CountVectorizer converter: 
----------------------------------------------------------------------

Logistic Regression: 
Accuracy: 0.9767
Classification Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix: 
[[966   0]
 [ 26 123]]
----------------------------------------------------------------------

Naive Bayes: 
Accuracy: 0.9839
Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.96      0.92      0.94       149


Models trained on vectors using TF-IDF Vectorizer:

In [44]:
# Repeat the model comparison, this time on the TF-IDF features.
print("Result with TF-IDF Vectorizer: ")
print("-"*70)

for name, model in models.items():
    # Fit an unfitted copy with the same hyper-parameters rather than the
    # estimator stored in `models`: refitting in place would overwrite the
    # models that were trained on the CountVectorizer features.
    tfidf_model = clone(model)
    tfidf_model.fit(x_train_tfidf, y_train)
    y_pred = tfidf_model.predict(x_test_tfidf)
    # print the result for each model
    print(f"\n{name}: ")
    print(f"Accuracy: {accuracy_score(y_test,y_pred):.4f}")
    print("Classification Report: ")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix: ")
    print(confusion_matrix(y_test, y_pred))
    print("-"*70)


Result with TF-IDF Vectorizer: 
----------------------------------------------------------------------

Logistic Regression: 
Accuracy: 0.9677
Classification Report: 
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.76      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix: 
[[966   0]
 [ 36 113]]
----------------------------------------------------------------------

Naive Bayes: 
Accuracy: 0.9686
Classification Report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion 