In [96]:
import pandas as pd
messages = pd.read_csv('SMSSpamCollection.txt', sep = '\t', header = None, names = ['Labels', 'SMS'])

In [97]:
print(messages.head())
print(messages.shape)

  Labels                                                SMS
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)


In [98]:
messages['Labels'].value_counts()

ham     4825
spam     747
Name: Labels, dtype: int64

In [99]:
messages['Labels'].value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: Labels, dtype: float64

In [100]:
messages_random = messages.sample(frac = 1, random_state = 1)
training_test_index = round(len(messages_random) * 0.7)
training_set = messages_random.iloc[: training_test_index, :].reset_index(drop = True)
test_set = messages_random.iloc[training_test_index :, :].reset_index(drop = True)

In [101]:

print("Тренировочная выборка")
print(training_set['Labels'].value_counts(normalize = True))
print()
print("Тестовая выборка")
print(test_set['Labels'].value_counts(normalize = True))


Тренировочная выборка
ham     0.865897
spam    0.134103
Name: Labels, dtype: float64

Тестовая выборка
ham     0.866029
spam    0.133971
Name: Labels, dtype: float64


In [102]:
training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()
training_set.head()

  training_set['SMS'] = training_set['SMS'].str.replace('\W', ' ').str.lower()


Unnamed: 0,Labels,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired
3,ham,havent
4,ham,i forgot 2 ask ü all smth there s a card on ...


In [103]:
vocabulary = []
training_set['SMS'] = training_set['SMS'].str.split()

In [104]:
for text in training_set['SMS']:
    for word in text:
        vocabulary.append(word)        
vocabulary = list(set(vocabulary))

In [105]:
word_counts_per_sms = { word: [0] * len(training_set['SMS']) for word in vocabulary }
for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [106]:
word_counts = pd.DataFrame(word_counts_per_sms)
training_set_clean = pd.concat([training_set, word_counts], axis = 1)
training_set_clean.head()

Unnamed: 0,Labels,SMS,bthere,allday,adoring,boss,decimal,way2sms,08006344447,sacked,...,inlude,parantella,sitter,bag,ijust,comp,jazz,inclusive,shudvetold,psychiatrist
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

input_train, input_test, output_train, output_test = train_test_split(
    training_set_clean['SMS'], training_set_clean['Labels'], test_size = 0.3)

MNB = MultinomialNB()

count_vectorizer = CountVectorizer()
input_train = count_vectorizer.fit_transform(
    input_train.apply(lambda x: ' '.join(x)))
input_test = count_vectorizer.transform(
    input_test.apply(lambda x: ' '.join(x)))

MNB.fit(input_train, output_train)
output_train_predict = MNB.predict(input_train)
output_test_predict = MNB.predict(input_test)

print("Тренировочная выборка")
print(classification_report(output_train, output_train_predict))
print("Тестовая выборка")
print(classification_report(output_test, output_test_predict))


Тренировочная выборка
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      2356
        spam       0.99      0.97      0.98       374

    accuracy                           0.99      2730
   macro avg       0.99      0.98      0.99      2730
weighted avg       0.99      0.99      0.99      2730

Тестовая выборка
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1021
        spam       0.98      0.90      0.94       149

    accuracy                           0.98      1170
   macro avg       0.98      0.95      0.96      1170
weighted avg       0.98      0.98      0.98      1170

