In [1]:
# The following IPython notebook shows how to take a series of text inputs
# (in this case status updates on the Experience Project), pre-process the
# text to remove extraneous words, HTML, and punctuation, and create a
# bag of words that allows the user to further train random forest models.
# In this example, I stop at the point where the most common words in the
# processed statuses have been found.

# Load the whole data file; the first CSV row (header=0) supplies the column names.
import pandas as pd
extext = pd.read_csv("EP_data.csv", header=0)
# Alternative that loads only columns 11 and 12 (gid and content) by position:
#id_text = pd.read_csv("EP_data.csv", header=0, usecols=[11,12])
# Preview the first rows. A bare `extext.shape` in the middle of a cell is
# evaluated but never displayed, so the shape is inspected in its own cell.
print(extext.head())

  gender  num_groups  num_entries  num_fans  num_friends     created  \
0      F         210           54        25            2  2014-12-20   
1      F          25            9         2            0  2014-08-28   
2      U           0        65535      5672            0  2008-04-21   
3      M         153            2         6            5  2014-12-13   
4      F          29            8        83           60  2014-12-27   

            last_login  num_logins   last_wall_activity last_circle_activity  \
0  2015-04-03 12:45:01           6  0000-00-00 00:00:00  0000-00-00 00:00:00   
1  2015-04-15 22:32:36          19  0000-00-00 00:00:00  0000-00-00 00:00:00   
2  2012-11-13 18:03:23           5  2009-03-27 21:22:12  0000-00-00 00:00:00   
3  2015-06-01 20:34:52          15  0000-00-00 00:00:00  2014-12-13 08:37:11   
4  2015-05-14 13:25:09          19  0000-00-00 00:00:00  2015-05-14 13:25:34   

    birthdate     gid                                            content  \
0  1997-03

In [4]:
extext.shape # (rows, columns) of the full dataset: how large is it?

(534305, 16)

In [2]:
# Narrow the frame to just the `gid` and `content` columns
subset = extext.loc[:, ['gid', 'content']]
print(subset.head())

      gid                                            content
0  124249  I just want my mom to hug me and say that she ...
1    5276  I don't know why this is happening right now. ...
2   49025  I accept the wisdom of all religions and disca...
3   58852  HAPPY NEW  YEARS !!!!!!!!!!!!!!!     !!!!!!!!!...
4  970365  Showing now if anyone is interested...message ...


In [3]:
from bs4 import BeautifulSoup
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords # Import the stop word list
# Cache for the stop-word set so it is built once, not once per status.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a set, building it on first use only."""
    if "english" not in _STOPWORD_CACHE:
        # Searching a set is much faster than searching the list NLTK returns.
        _STOPWORD_CACHE["english"] = set(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def status_to_words( raw_status ):
    """Convert one raw status update into a string of lower-case, non-stop words.

    Args:
        raw_status: text of a single status update, possibly containing HTML.
            None or NaN (a missing value) is treated as an empty status.

    Returns:
        A single string: the remaining words joined by single spaces, or ""
        when no words are left.
    """
    # 0. Missing values have no text to clean. NaN is the only value that is
    #    not equal to itself, so the comparison detects it without extra imports.
    if raw_status is None or raw_status != raw_status:
        return ""
    #
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    status_text = BeautifulSoup(raw_status, "html.parser").get_text()
    #
    # 2. Replace every non-letter with a space. Apostrophes are non-letters,
    #    so contractions are split ("I'm" becomes "I" and "m").
    letters_only = re.sub("[^a-zA-Z]", " ", status_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4. Fetch the (cached) stop-word set
    stops = _english_stopwords()
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if w not in stops]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)


# Example of what status_to_words does: clean the status in row 1 and show it
clean_status = status_to_words(subset.loc[1, 'content'])
print(clean_status)

know happening right best friend years much together really hard life trust many people trusted excited told pregnant knew much loved kids wanted baby soon started hanging people cannot stand thieves one girl stole others fake general talked much crap hanging m sure even consider friend anymore really hurt hanging people hasn good friend past year use caring selfish care anything anyone m sure going m end friendship m moving hours away acts like could care less hasn offered help move hasn even mentioned coming see really know


In [5]:
# Count the statuses; this should match the row count reported by extext.shape
num_status = subset["content"].size
print(num_status)

534305


In [6]:
# Run status_to_words on all the statuses (or a subset) for preprocessing
print("Cleaning and parsing the status updates...\n")
#num_status = subset["content"].size #for full set
# Use a subset of all statuses, capped so we never index past the rows that exist
num_status = min(15000, subset["content"].size)
clean_statuses = []
# Select the column once and take the first num_status rows by position,
# instead of re-selecting the column on every iteration.
statuses = subset["content"].iloc[:num_status]
for i, raw_status in enumerate(statuses, 1):
    # Print a progress message after every 1000 statuses
    if i % 1000 == 0:
        print("Status %d of %d\n" % (i, num_status))
    clean_statuses.append(status_to_words(raw_status))

Cleaning and parsing the status updates...

Status 1000 of 15000

Status 2000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 3000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 4000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 5000 of 15000

Status 6000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 7000 of 15000

Status 8000 of 15000


  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 9000 of 15000

Status 10000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 11000 of 15000

Status 12000 of 15000

Status 13000 of 15000

Status 14000 of 15000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 15000 of 15000



In [7]:
# Build the bag of words. Every word that survives preprocessing gets a slot
# in the bag, and each status becomes a vector holding how many times each
# of those words occurs in it. See
# https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer is scikit-learn's bag-of-words tool. Some of the
# preprocessing done above could be delegated to it instead; here those
# hooks are explicitly left unset.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)

# fit_transform() takes a list of strings, learns the vocabulary from it and
# turns each string into a feature vector; toarray() then converts the result
# into a numpy array, which is easy to work with.
train_data_features = vectorizer.fit_transform(clean_statuses).toarray()

Creating the bag of words...



In [8]:
# Dimensions of the feature array: (number of statuses, number of words)
print(train_data_features.shape)

(15000L, 5000L)


In [9]:
# So we see it's 15000 rows (corresponding to the 15000 statuses)
# and 5000 columns (corresponding to the top 5000 words used in the
# statuses). Increase "max_features" in CountVectorizer to increase
# the number of words.

import numpy as np

# Get the column names (the words in the bag). Newer scikit-learn releases
# provide get_feature_names_out() and no longer have get_feature_names(),
# so use whichever one this installation offers.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word (column-wise over all statuses)
dist = np.sum(train_data_features, axis=0)

# For each, print the number of times the vocabulary word appears in the
# training set, followed by the word itself
for tag, count in zip(vocab, dist):
    print("%s %s" % (count, tag))

26 abandoned
24 abide
261 abilities
126 ability
1466 able
43 aboard
56 abortion
90 abroad
212 absent
39 absolute
343 absolutely
258 abuse
72 abused
29 abusing
75 abusive
172 accept
43 acceptable
43 acceptance
64 accepted
44 accepting
69 access
48 accessibility
37 accessible
63 accident
22 accidentally
177 accomplish
175 accomplished
139 accomplishing
204 according
391 account
43 accounted
44 accounting
66 accounts
40 accredited
21 accurately
74 accused
22 acer
36 ache
90 achieve
172 achieved
26 achievements
37 acknowledge
42 acknowledged
506 acquire
171 acquired
243 acquiring
92 acquisition
265 across
195 act
60 acted
76 acting
354 action
27 actionable
116 actions
226 active
39 activities
239 activity
61 acts
62 actual
1550 actually
397 acupuncture
60 acupuncturere
36 acupuncturist
328 ad
94 adam
432 add
386 added
53 addicted
43 addiction
61 adding
199 addition
718 additional
80 additionally
156 address
33 addressed
21 adhere
23 adjustments
263 administration
26 admire
120 admit
42 adm

In [10]:
# Order the words by frequency, most common first

# Materialise the (word, count) pairs: on Python 3 zip() is a one-shot
# iterator, which would leave `testwords` empty after the sort below.
testwords = list(zip(vocab, dist))
# Sort on the count (second element of each pair); the key's parameter name
# is kept distinct from `testwords` so it does not shadow the list.
for tag, count in sorted(testwords, key=lambda pair: pair[1], reverse=True):
    print("%s %s" % (count, tag))

9440 like
6854 search
5643 one
5372 people
5252 would
5219 know
5043 time
5017 get
4899 site
4602 engine
4447 web
4421 really
4335 make
4113 even
4041 want
4033 much
3876 love
3857 good
3794 new
3761 google
3621 website
3560 day
3350 feel
3260 could
3253 see
3131 need
3057 life
3027 ve
2787 long
2693 content
2683 marketing
2681 think
2633 may
2589 also
2557 go
2551 going
2422 way
2418 got
2352 back
2298 best
2290 never
2260 internet
2238 re
2221 social
2217 things
2209 well
2202 use
2183 always
2099 women
2090 company
2060 business
2041 many
2016 optimization
1973 men
1910 keep
1886 year
1882 ll
1873 person
1865 years
1860 look
1848 still
1844 right
1831 help
1781 find
1766 friends
1763 inside
1763 us
1758 older
1730 someone
1690 work
1652 great
1634 every
1630 might
1623 lasik
1593 said
1587 lot
1574 anyone
1566 around
1559 something
1557 ever
1550 actually
1542 two
1515 better
1505 home
1498 take
1492 singapore
1481 say
1477 let
1466 able
1436 within
1415 eye
1395 first
1378 pages
13