add spam2

bob7783 · bob7783 · commit 6d450479f456 · 2018-03-26T00:41:03.000-04:00
diff --git a/nlp_class/lsa.py b/nlp_class/lsa.py
@@ -22,6 +22,11 @@
 
 # copy tokenizer from sentiment example
 stopwords = set(w.rstrip() for w in open('stopwords.txt'))
+
+# note: an alternative source of stopwords
+# from nltk.corpus import stopwords
+# stopwords.words('english')
+
 # add more stopwords specific to this problem
 stopwords = stopwords.union({
     'introduction', 'edition', 'series', 'application',
diff --git a/nlp_class/sentiment.py b/nlp_class/sentiment.py
@@ -27,6 +27,10 @@
 # from http://www.lextek.com/manuals/onix/stopwords1.html
 stopwords = set(w.rstrip() for w in open('stopwords.txt'))
 
+# note: an alternative source of stopwords
+# from nltk.corpus import stopwords
+# stopwords.words('english')
+
 # load the reviews
 # data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
 positive_reviews = BeautifulSoup(open('electronics/positive.review').read())
diff --git a/nlp_class/spam2.py b/nlp_class/spam2.py
@@ -0,0 +1,83 @@
+# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
+# https://www.udemy.com/data-science-natural-language-processing-in-python
+
+# Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from wordcloud import WordCloud
+
+
+# data from:
+# https://www.kaggle.com/uciml/sms-spam-collection-dataset
+# file contains some invalid chars
+# depending on which version of pandas you have
+# an error may be thrown
+df = pd.read_csv('../large_files/spam.csv', encoding='ISO-8859-1')
+
+# drop unnecessary columns
+df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
+
+# rename columns to something better
+df.columns = ['labels', 'data']
+
+# create binary labels
+df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
+Y = df['b_labels'].as_matrix()
+
+# try multiple ways of calculating features
+# tfidf = TfidfVectorizer(decode_error='ignore')
+# X = tfidf.fit_transform(df['data'])
+
+count_vectorizer = CountVectorizer(decode_error='ignore')
+X = count_vectorizer.fit_transform(df['data'])
+
+# split up the data
+Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)
+
+# create the model, train it, print scores
+model = MultinomialNB()
+model.fit(Xtrain, Ytrain)
+print("train score:", model.score(Xtrain, Ytrain))
+print("test score:", model.score(Xtest, Ytest))
+
+
+
+# visualize the data
+def visualize(label):
+  words = ''
+  for msg in df[df['labels'] == label]['data']:
+    msg = msg.lower()
+    words += msg + ' '
+  wordcloud = WordCloud(width=600, height=400).generate(words)
+  plt.imshow(wordcloud)
+  plt.axis('off')
+  plt.show()
+
+visualize('spam')
+visualize('ham')
+
+
+# see what we're getting wrong
+df['predictions'] = model.predict(X)
+
+# things that should be spam
+sneaky_spam = df[(df['predictions'] == 0) & (df['b_labels'] == 1)]['data']
+for msg in sneaky_spam:
+  print(msg)
+
+# things that should not be spam
+not_actually_spam = df[(df['predictions'] == 1) & (df['b_labels'] == 0)]['data']
+for msg in not_actually_spam:
+  print(msg)
+
+