In [9]:
# The following iPython notebook shows how to take a series of text inputs
# (in this case status updates on the Experience Project), pre-process the
# text to remove extraneous words, HTML, and punctuation, and create a 
# bag of words that allows the user to further train random forest models.
# In this example, I stop at a point where the most common words in the 
# test set have been found.

#import the whole data file
import pandas as pd       
extext = pd.read_csv("EP_data.csv", header=0)
#id_text = pd.read_csv("EP_data.csv", header=0, usecols=[11,12])
extext.shape
print extext.head()

  gender  num_groups  num_entries  num_fans  num_friends     created  \
0      F         210           54        25            2  2014-12-20   
1      F          25            9         2            0  2014-08-28   
2      U           0        65535      5672            0  2008-04-21   
3      M         153            2         6            5  2014-12-13   
4      F          29            8        83           60  2014-12-27   

            last_login  num_logins   last_wall_activity last_circle_activity  \
0  2015-04-03 12:45:01           6  0000-00-00 00:00:00  0000-00-00 00:00:00   
1  2015-04-15 22:32:36          19  0000-00-00 00:00:00  0000-00-00 00:00:00   
2  2012-11-13 18:03:23           5  2009-03-27 21:22:12  0000-00-00 00:00:00   
3  2015-06-01 20:34:52          15  0000-00-00 00:00:00  2014-12-13 08:37:11   
4  2015-05-14 13:25:09          19  0000-00-00 00:00:00  2015-05-14 13:25:34   

    birthdate     gid                                            content  \
0  1997-03

In [10]:
extext.shape[0] #what size is our dataset?

534305

In [139]:
#I can select only the columns I want
from sklearn.cross_validation import train_test_split

data = extext[['gid','content','gender']]
ntrain = 10000
data_train, data_test = train_test_split(data, train_size=ntrain)

subset= data_train
subset.index = range(subset.shape[0])
print subset.shape, data_test.shape

(10000, 3) (524305, 3)


In [103]:
from bs4 import BeautifulSoup
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords # Import the stop word list
def status_to_words( raw_status ):
    # Function to convert a raw status to a string of words
    # The input is a single string (a raw status update), and 
    # the output is a single string (a preprocessed status update)
    #
    # 1. Remove HTML
    status_text = BeautifulSoup(raw_status).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", status_text)
    # remove http URLs
    letters_only = re.sub(r'^https?:\/\/.*[\r\n]*', '', letters_only)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #Remove 'm' and 'hasn' from the text
    stops = stops.union([u'hasn',u'm',u've',u'll',u're',u'didn',u'us',u'im',u'doesn',u'couldn',u'won',u'isn',u'http',u'www']) 
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))


#Let's see an example of what status_to_words does
clean_status = status_to_words( subset['content'][1] )
print (clean_status)

inbox meh friends


In [104]:
#How many statuses do we have? Should be same as row number of extext.shape above
num_status = subset["content"].size
print num_status

10000


In [105]:
#Now run status_to_words on all the statuses (or a subset) for preprocessing
print "Cleaning and parsing the status updates...\n"
clean_statuses = []
#num_status = subset["content"].size #for full set
num_status = ntrain #for subset of all statuses
for i in xrange( 0, num_status ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print "Status %d of %d\n" % ( i+1, num_status )                                                                    
    clean_statuses.append( status_to_words( subset["content"][i] ))

Cleaning and parsing the status updates...

Status 1000 of 10000

Status 2000 of 10000

Status 3000 of 10000

Status 4000 of 10000

Status 5000 of 10000

Status 6000 of 10000

Status 7000 of 10000

Status 8000 of 10000

Status 9000 of 10000

Status 10000 of 10000



In [106]:
# create the Bag o' words. Each word that appears in the statuses after preprocessing gets a place
# in the bag (list). Each status gets a list denoting the number of times a particular word
# shows up in that status.  Check out 
# https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words

print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool. Note that some of the preprocessing steps above
# can be done directly with this tool; we may want to do that.
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 500) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_statuses)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [108]:
# so we see it's 15000 rows (corresponging to the the 15000 statuses)
# and 5000 columns (corresponding to the top 5000 words used in the
# statuses). Increase "max_features" in CountVectorizer to increase
# the number of words.

import numpy as np

#get the column names (the words in the bag)
vocab = vectorizer.get_feature_names() 

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag

389 able
429 actually
143 afraid
358 age
149 agemeet
321 ago
168 air
365 almost
501 alone
186 along
293 already
927 also
146 although
1266 always
195 amazing
543 another
179 answer
151 anxiety
327 anymore
788 anyone
762 anything
142 anyway
842 around
337 ask
460 asked
172 attention
741 away
217 baby
1467 back
623 bad
147 barb
349 beautiful
151 became
308 become
323 bed
178 behind
475 believe
698 best
623 better
432 big
339 bit
196 black
424 body
184 book
216 boy
253 boyfriend
162 break
151 bring
188 brother
201 business
283 call
301 called
460 came
159 cannot
174 cant
269 car
499 care
270 cause
149 chance
315 change
155 chat
157 check
248 child
291 children
180 choose
147 church
306 close
143 clothes
147 cold
504 com
645 come
250 comes
197 coming
175 company
218 completely
143 connected
157 contact
187 control
1411 could
221 couple
215 course
190 crazy
167 cry
163 cut
319 dad
199 date
368 dating
159 daughter
1466 day
496 days
385 de
176 deal
226 decided
170 deep
171 depression
161 die


In [109]:
#let's order them by frequency

testwords = zip(vocab, dist)
for tag, count in sorted(testwords, key=lambda testwords: testwords[1], reverse=True):
    print count, tag

4029 like
2817 one
2721 know
2692 love
2445 want
2441 get
2368 would
2297 time
2278 people
2078 life
2003 feel
1760 really
1711 even
1642 never
1576 go
1506 think
1467 back
1466 day
1411 could
1376 much
1371 good
1353 make
1299 see
1272 way
1266 always
1238 got
1227 going
1215 someone
1191 things
1100 still
1051 years
1048 said
1029 friends
1022 need
1009 well
1003 say
991 something
971 right
927 also
909 men
901 first
881 help
856 find
855 person
843 many
842 around
826 little
816 new
812 ever
811 take
803 every
801 women
793 long
791 let
788 anyone
788 talk
762 anything
751 home
747 made
745 told
741 away
728 look
719 man
719 tell
707 world
705 work
701 thing
698 best
693 may
682 thought
682 went
680 started
678 god
668 everything
668 friend
649 since
645 come
643 year
638 night
637 looking
636 two
635 school
630 girl
628 family
627 older
624 getting
623 bad
623 better
620 happy
619 last
617 nothing
583 keep
581 wanted
578 old
574 hard
566 lot
560 heart
559 everyone
557 mind
556 some

In [110]:
from sklearn.ensemble import RandomForestClassifier

In [130]:
forest = RandomForestClassifier(n_estimators = 100) 

In [131]:
forest = forest.fit( train_data_features, subset["gender"] )

In [134]:
ntest = 20000
data_test = data_test[:ntest]
data_test.index = range(data_test.shape[0])
data_test.head()

Unnamed: 0,gid,content,gender
0,828259,"go ahead ,I'll be as honest as possible",M
1,1025005,"Anyone ever tried this?, multiple times?, ma...",M
2,968499,I got this game for my birthday last year and ...,M
3,381536,throw me against the wall and choke me,F
4,353615,i have a Fit body so i hope i can find boyfrie...,F


In [135]:
clean_test_statuses = [] 

print "Cleaning and parsing the test statuses...\n"
for i in xrange(0,num_test):
    if( (i+1) % 1000 == 0 ):
        print "Status %d of %d\n" % (i+1, ntest)
    clean_status = status_to_words( data_test["content"][i] )
    clean_test_statuses.append( clean_status )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_statuses)
test_data_features = test_data_features.toarray()
vocab = vectorizer.get_feature_names() 

# Sum up the counts of each vocabulary word
dist = np.sum(test_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print count, tag


Cleaning and parsing the test statuses...

Status 1000 of 20000

Status 2000 of 20000

Status 3000 of 20000

Status 4000 of 20000

Status 5000 of 20000

Status 6000 of 20000

Status 7000 of 20000

Status 8000 of 20000

Status 9000 of 20000

Status 10000 of 20000

354 able
394 actually
153 afraid
328 age
115 agemeet
334 ago
105 air
362 almost
470 alone
186 along
270 already
881 also
134 although
1179 always
219 amazing
539 another
148 answer
112 anxiety
298 anymore
777 anyone
681 anything
162 anyway
789 around
369 ask
374 asked
159 attention
679 away
213 baby
1357 back
576 bad
24 barb
384 beautiful
148 became
262 become
323 bed
177 behind
462 believe
605 best
621 better
405 big
315 bit
270 black
406 body
119 book
161 boy
256 boyfriend
146 break
152 bring
192 brother
167 business
314 call
254 called
479 came
147 cannot
143 cant
220 car
515 care
259 cause
174 chance
322 change
192 chat
136 check
231 child
205 children
147 choose
93 church
292 close
154 clothes
125 cold
351 com
657 come
24

In [136]:
# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"gid":data_test["gid"], "P_gender":result,"R_gender":data_test['gender']} )

# Use pandas to write the comma-separated output file
#output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )

In [137]:
P_correct = round(sum(output['P_gender']==output['R_gender']),2)/ntest
P_correct

0.256

In [140]:
print "ntest=",ntest

ntest= 20000


In [1]:
import random_forest as rf
import pandas as pd

In [7]:
#EP = pd.read_csv("EP_data.csv", header=0).dropna()
target = 'gender'
ntrain = 10000
ntest = 5000
result = rf.rand_forest_predict(EP,'content',target,ntrain,ntest,nfeature=1000,nestimator=50)

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


Cleaning and parsing train statuses...

Status 1000 of 10000

Status 2000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 3000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup


Status 4000 of 10000

Status 5000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 6000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 7000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 8000 of 10000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 9000 of 10000

Status 10000 of 10000

Creating the bag of words...

Fitting to a random forest with  50  parameters...

Cleaning and parsing test statuses...


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 1000 of 5000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 2000 of 5000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 3000 of 5000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 4000 of 5000


  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)



Status 5000 of 5000

Creating the bag of words...



  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [5]:
print "The proportion of correct prediction for ",target, " is ", 
round(sum(result['Prediction']==result['True Value']),2)/ntest

 The proportion of correct prediction for  gender  is 

0.4624


