|
| 1 | +# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python |
| 2 | +# https://www.udemy.com/data-science-natural-language-processing-in-python |
| 3 | + |
| 4 | +# Author: http://lazyprogrammer.me |
| 5 | +from __future__ import print_function, division |
| 6 | +from future.utils import iteritems |
| 7 | +from builtins import range |
| 8 | +# Note: you may need to update your version of future |
| 9 | +# sudo pip install -U future |
| 10 | + |
| 11 | +import numpy as np |
| 12 | +import pandas as pd |
| 13 | +import matplotlib.pyplot as plt |
| 14 | +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
| 15 | +from sklearn.model_selection import train_test_split |
| 16 | +from sklearn.naive_bayes import MultinomialNB |
| 17 | +from wordcloud import WordCloud |
| 18 | + |
| 19 | + |
# Load the SMS spam collection and derive binary labels.
# data from:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
# file contains some invalid chars
# depending on which version of pandas you have
# an error may be thrown
df = pd.read_csv('../large_files/spam.csv', encoding='ISO-8859-1')

# drop unnecessary columns
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

# rename columns to something better
df.columns = ['labels', 'data']

# create binary labels: ham -> 0, spam -> 1
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
# Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the underlying NumPy array on old and new pandas alike.
Y = df['b_labels'].values
| 36 | + |
# Feature extraction: raw token counts are used here. The TF-IDF variant
# is kept (commented out) as an alternative to experiment with.
# tfidf = TfidfVectorizer(decode_error='ignore')
# X = tfidf.fit_transform(df['data'])

count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])

# hold out 33% of the samples for evaluation
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

# fit a multinomial Naive Bayes classifier on the training portion
model = MultinomialNB()
model.fit(Xtrain, Ytrain)

# report the classifier's score on both portions
train_score = model.score(Xtrain, Ytrain)
test_score = model.score(Xtest, Ytest)
print("train score:", train_score)
print("test score:", test_score)
| 52 | + |
| 53 | + |
| 54 | + |
# visualize the data
def visualize(label):
    """Show a word cloud built from every message carrying *label*.

    Reads the module-level ``df``: rows whose 'labels' column equals
    ``label`` are selected, their 'data' text is lower-cased and combined
    into one string, and a 600x400 word cloud generated from that string
    is displayed with matplotlib (axes hidden). Returns nothing.
    """
    messages = df[df['labels'] == label]['data']
    # Join once instead of growing a string with += inside a loop, which
    # can be quadratic in the total text length. The combined text differs
    # from the loop version only by lacking a trailing space.
    words = ' '.join(msg.lower() for msg in messages)
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
| 65 | + |
# draw one word cloud per class, spam first
for label_name in ('spam', 'ham'):
    visualize(label_name)
| 68 | + |
| 69 | + |
# see what we're getting wrong: predict on every row of the feature matrix
df['predictions'] = model.predict(X)

# things that should be spam: labelled spam (1) but predicted ham (0)
sneaky_spam = df.loc[
    (df['predictions'] == 0) & (df['b_labels'] == 1),
    'data',
]
for msg in sneaky_spam:
    print(msg)

# things that should not be spam: labelled ham (0) but predicted spam (1)
not_actually_spam = df.loc[
    (df['predictions'] == 1) & (df['b_labels'] == 0),
    'data',
]
for msg in not_actually_spam:
    print(msg)
| 82 | + |
| 83 | + |
0 commit comments