In [13]:
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd
import seaborn as sns


In [3]:
# Load the tab-separated SMS file. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): the default quote handling of read_csv may merge lines that
# contain unbalanced double quotes -- confirm the row count against the
# dataset's documented size.
sms_path = 'data/smsspamcollection/SMSSpamCollection'
column_names = ['labels', 'messages']
df = pd.read_csv(sms_path, sep='\t', names=column_names)
df


Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Relative frequency of each class label (fractions sum to 1).
df['labels'].value_counts(normalize=True)


ham     0.865937
spam    0.134063
Name: labels, dtype: float64

In [5]:
# Stemmer instance reused for every token when the corpus is built.
ps = PorterStemmer()


In [6]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per message.
#
# The stop-word list is materialised once as a set. Calling
# stopwords.words('english') inside the comprehension would re-read the list
# for every single token, and membership tests on a list are linear.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column itself instead of a hard-coded row count, so the
# cell keeps working if the dataset size changes.
for message in df['messages']:
    # Replace everything that is not an ASCII letter with a space, then
    # lower-case and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, stem what remains, and re-join into one string.
    corpus.append(
        ' '.join(ps.stem(token) for token in tokens if token not in stop_words)
    )


In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Both vectorizers are capped at the same vocabulary size so their
# feature matrices have the same number of columns.
vectorizer_options = {'max_features': 4000}
bog = CountVectorizer(**vectorizer_options)
tfidf = TfidfVectorizer(**vectorizer_options)


In [8]:
# Learn each vocabulary from the full corpus and vectorize it, then convert
# the sparse results to dense arrays.
# Note: the vectorizers are fitted on every message, i.e. before the data is
# split into train and test sets.
bog_sparse = bog.fit_transform(corpus)
tfidf_sparse = tfidf.fit_transform(corpus)
x_bog = bog_sparse.toarray()
x_tfidf = tfidf_sparse.toarray()


In [9]:
# Dimensions of the dense bag-of-words matrix.
x_bog.shape


(5572, 4000)

In [34]:
# First row of the TF-IDF matrix (the first entry of corpus).
x_tfidf[0]


array([0., 0., 0., ..., 0., 0., 0.])

In [12]:
# First row of the bag-of-words count matrix (the first entry of corpus).
x_bog[0]


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
# One-hot encode the labels and drop the first dummy column, leaving a
# single indicator column that is converted to a NumPy array.
label_dummies = pd.get_dummies(df['labels'], drop_first=True)
y = label_dummies.to_numpy()
y


array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]], dtype=uint8)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the bag-of-words rows for testing; the seed is fixed so the
# split is repeatable.
split_parts = train_test_split(x_bog, y, test_size=0.3, random_state=7)
x_train, x_test, y_train, y_test = split_parts


In [15]:
# Dimensions of the training feature matrix.
x_train.shape


(3900, 4000)

In [16]:
# Dimensions of the held-out target array.
y_test.shape


(1672, 1)

In [17]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# The estimators are fitted on a flattened (1-D) view of the label column.
train_labels = y_train.ravel()

# Gaussian Naive Bayes; fit() returns the fitted estimator itself.
gbclf = GaussianNB().fit(x_train, train_labels)

# Multinomial Naive Bayes; the fit call is kept as the last expression so
# the cell still displays the fitted estimator.
mbclf = MultinomialNB()
mbclf.fit(x_train, train_labels)


MultinomialNB()

In [18]:
# Training-set accuracy: Gaussian NB first, then Multinomial NB.
for fitted_clf in (gbclf, mbclf):
    print(fitted_clf.score(x_train, y_train))


0.9033333333333333
0.9902564102564102


In [19]:
# Test-set predictions from both Naive Bayes models.
y_mb_pred, y_gb_pred = (clf.predict(x_test) for clf in (mbclf, gbclf))


In [20]:
# Dimensions of the Gaussian NB prediction array.
y_gb_pred.shape


(1672,)

In [21]:
# Test-set accuracy of the Gaussian NB model.
accuracy_score(y_true=y_test, y_pred=y_gb_pred)


0.8672248803827751

In [22]:
# Test-set accuracy of the Multinomial NB model.
accuracy_score(y_true=y_test, y_pred=y_mb_pred)


0.9814593301435407

In [23]:
# Per-class precision / recall / F1 for the Multinomial NB predictions.
mb_report = classification_report(y_test, y_mb_pred)
print(mb_report)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1442
           1       0.92      0.95      0.93       230

    accuracy                           0.98      1672
   macro avg       0.95      0.97      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [24]:
# Per-class precision / recall / F1 for the Gaussian NB predictions.
gb_report = classification_report(y_test, y_gb_pred)
print(gb_report)


              precision    recall  f1-score   support

           0       0.98      0.86      0.92      1442
           1       0.51      0.91      0.65       230

    accuracy                           0.87      1672
   macro avg       0.75      0.89      0.79      1672
weighted avg       0.92      0.87      0.88      1672



In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Seed the decision tree so repeated runs of the notebook give the same tree
# and therefore the same metrics. Without a seed, the tree can differ from
# run to run. The value matches the seed used for the train/test split.
dt = DecisionTreeClassifier(random_state=7)
# Logistic regression with the library's default settings.
lr = LogisticRegression()


In [27]:
# Fit both models on the training split using a flattened (1-D) label array.
flat_train_labels = y_train.ravel()
dt.fit(x_train, flat_train_labels)
# Kept as the last expression so the cell displays the fitted estimator.
lr.fit(x_train, flat_train_labels)


LogisticRegression()

In [28]:
# Test-set predictions from the decision tree and the logistic regression.
y_dt_pred, y_lr_pred = (model.predict(x_test) for model in (dt, lr))


In [29]:
# Classification reports: decision tree first, then logistic regression.
for model_predictions in (y_dt_pred, y_lr_pred):
    print(classification_report(y_test, model_predictions))


              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1442
           1       0.93      0.85      0.89       230

    accuracy                           0.97      1672
   macro avg       0.95      0.92      0.93      1672
weighted avg       0.97      0.97      0.97      1672

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1442
           1       0.99      0.87      0.93       230

    accuracy                           0.98      1672
   macro avg       0.98      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672

