In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')

In [4]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [7]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = df[['length', 'punct']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [11]:
X_train.shape

(3900, 2)

In [12]:
from sklearn.linear_model import LogisticRegression

In [14]:
lm = LogisticRegression(solver = 'lbfgs')

In [15]:
lm.fit(X_train, y_train)

In [16]:
res = lm.predict(X_test)

In [17]:
from sklearn import metrics

In [18]:
metrics.confusion_matrix(y_test, res)

array([[1399,   40],
       [ 232,    1]], dtype=int64)

In [19]:
print(metrics.classification_report(y_test, res))

              precision    recall  f1-score   support

         ham       0.86      0.97      0.91      1439
        spam       0.02      0.00      0.01       233

    accuracy                           0.84      1672
   macro avg       0.44      0.49      0.46      1672
weighted avg       0.74      0.84      0.79      1672



In [20]:
print(metrics.accuracy_score(y_test, res))

0.8373205741626795


In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
mnb = MultinomialNB()

In [24]:
mnb.fit(X_train, y_train)

In [25]:
result = mnb.predict(X_test)

In [26]:
print(metrics.confusion_matrix(y_test, result))

[[1436    3]
 [ 231    2]]


In [27]:
print(metrics.classification_report(y_test, result))

              precision    recall  f1-score   support

         ham       0.86      1.00      0.92      1439
        spam       0.40      0.01      0.02       233

    accuracy                           0.86      1672
   macro avg       0.63      0.50      0.47      1672
weighted avg       0.80      0.86      0.80      1672



In [28]:
print(metrics.accuracy_score(y_test, result))

0.8600478468899522


In [30]:
from sklearn.svm import SVC
svc= SVC()
svc.fit(X_train, y_train)
pred = svc.predict(X_test)
print(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred))
print(metrics.accuracy_score(y_test, pred))

[[1385   54]
 [ 134   99]]
              precision    recall  f1-score   support

         ham       0.91      0.96      0.94      1439
        spam       0.65      0.42      0.51       233

    accuracy                           0.89      1672
   macro avg       0.78      0.69      0.72      1672
weighted avg       0.87      0.89      0.88      1672

0.8875598086124402


In [31]:
X = df['message']
y = df['label']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tf_vec = TfidfVectorizer()

In [34]:
X_train_tf = Tf_vec.fit_transform(X_train)

In [35]:
from sklearn.svm import LinearSVC
SVC = LinearSVC()

In [36]:
SVC.fit(X_train_tf, y_train)

In [37]:
from sklearn.pipeline import Pipeline
Tf_pipe = Pipeline([('Tf_vec', TfidfVectorizer()), ('SVC', LinearSVC())])

In [38]:
Tf_pipe.fit(X_train, y_train)

In [39]:
pred = Tf_pipe.predict(X_test)

In [40]:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[1445    3]
 [  10  214]]
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672

