Skip to content

Commit 6d45047

Browse files
committed
add spam2
1 parent a30b32a commit 6d45047

File tree

3 files changed

+92
-0
lines changed

3 files changed

+92
-0
lines changed

nlp_class/lsa.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@
2222

2323
# copy tokenizer from sentiment example
2424
# Load the stopword list, one word per line. The context manager guarantees
# the file handle is closed instead of leaking until garbage collection.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)
25+
26+
# note: an alternative source of stopwords
27+
# from nltk.corpus import stopwords
28+
# stopwords.words('english')
29+
2530
# add more stopwords specific to this problem
2631
stopwords = stopwords.union({
2732
'introduction', 'edition', 'series', 'application',

nlp_class/sentiment.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@
2727
# from http://www.lextek.com/manuals/onix/stopwords1.html
2828
# Load the stopword list, one word per line. The context manager guarantees
# the file handle is closed instead of leaking until garbage collection.
with open('stopwords.txt') as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)
2929

30+
# note: an alternative source of stopwords
31+
# from nltk.corpus import stopwords
32+
# stopwords.words('english')
33+
3034
# load the reviews
3135
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
3236
# Parse the positive reviews; read the file inside a context manager so the
# handle is closed as soon as its contents have been handed to BeautifulSoup.
with open('electronics/positive.review') as review_file:
    positive_reviews = BeautifulSoup(review_file.read())

nlp_class/spam2.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
2+
# https://www.udemy.com/data-science-natural-language-processing-in-python
3+
4+
# Author: http://lazyprogrammer.me
5+
from __future__ import print_function, division
6+
from future.utils import iteritems
7+
from builtins import range
8+
# Note: you may need to update your version of future
9+
# sudo pip install -U future
10+
11+
import numpy as np
12+
import pandas as pd
13+
import matplotlib.pyplot as plt
14+
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
15+
from sklearn.model_selection import train_test_split
16+
from sklearn.naive_bayes import MultinomialNB
17+
from wordcloud import WordCloud
18+
19+
20+
# data from:
# https://www.kaggle.com/uciml/sms-spam-collection-dataset
# The file contains some invalid chars; depending on which version of pandas
# you have an error may be thrown, so the encoding is given explicitly.
df = pd.read_csv('../large_files/spam.csv', encoding='ISO-8859-1')

# drop unnecessary columns
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

# rename columns to something better
df.columns = ['labels', 'data']

# create binary labels: ham -> 0, spam -> 1
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
# Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array on both old and new pandas versions.
Y = df['b_labels'].values

# try multiple ways of calculating features
# tfidf = TfidfVectorizer(decode_error='ignore')
# X = tfidf.fit_transform(df['data'])

# raw token counts as features
count_vectorizer = CountVectorizer(decode_error='ignore')
X = count_vectorizer.fit_transform(df['data'])

# split up the data (33% held out for testing)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

# create the model, train it, print scores
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("train score:", model.score(Xtrain, Ytrain))
print("test score:", model.score(Xtest, Ytest))
52+
53+
54+
55+
# visualize the data
56+
def visualize(label):
    """Show a word cloud built from every message whose label is *label*.

    Selects the rows of the module-level ``df`` whose ``'labels'`` column
    equals ``label`` (e.g. ``'spam'`` or ``'ham'``), lower-cases each message
    in the ``'data'`` column, and renders the combined text as a 600x400
    word cloud with the plot axes hidden.
    """
    messages = df[df['labels'] == label]['data']
    # Join once instead of growing a string with += inside the loop.
    words = ' '.join(msg.lower() for msg in messages)
    wordcloud = WordCloud(width=600, height=400).generate(words)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()
65+
66+
# draw one word cloud per class
visualize('spam')
visualize('ham')


# see what we're getting wrong: predict on every row of X and keep the
# result next to the true labels
df['predictions'] = model.predict(X)

# things that should be spam (true label 1, predicted 0)
sneaky_spam = df.loc[(df['b_labels'] == 1) & (df['predictions'] == 0), 'data']
for msg in sneaky_spam:
    print(msg)

# things that should not be spam (true label 0, predicted 1)
not_actually_spam = df.loc[(df['b_labels'] == 0) & (df['predictions'] == 1), 'data']
for msg in not_actually_spam:
    print(msg)
82+
83+

0 commit comments

Comments
 (0)