In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


# Tell iPython to include plots inline in the notebook
%matplotlib inline

data = pd.read_csv("downloaded2.tsv", sep = '\t')
print "Dataset has {} rows, {} columns".format(*data.shape)
print data.head()  # print the first 5 rows

Dataset has 9665 rows, 4 columns
                   id   tweet-id             sentiment  \
0  264183816548130816   15140428              positive   
1  263405084770172928  591166521              negative   
2  262163168678248449   35266263              negative   
3  264249301910310912   18516728              negative   
4  262682041215234048  254373818  objective-OR-neutral   

                                                text  
0  Gas by my house hit $3.39!!!! I'm going to Cha...  
1                                      Not Available  
2                                      Not Available  
3  Iranian general says Israel's Iron Dome can't ...  
4                                      Not Available  


In [2]:
#Let's drop the id-columns, they were used to download the twitter data, with twitter API.
df = data.drop(data.columns[[0,1]], axis=1)
print df.head()

              sentiment                                               text
0              positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1              negative                                      Not Available
2              negative                                      Not Available
3              negative  Iranian general says Israel's Iron Dome can't ...
4  objective-OR-neutral                                      Not Available


In [3]:
#Now let's drop all the rows in which the tweet was no longer available.
df = df[df.text != "Not Available"]
df = df.reset_index(drop=True) #reset the index after dropping the above rows
print df.head()
print "Dataset has {} rows, {} columns".format(*df.shape)
print df[:20]

  sentiment                                               text
0  positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1  negative  Iranian general says Israel's Iron Dome can't ...
2  positive  with J Davlar 11th. Main rivals are team Polan...
3  negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4  negative  They may have a SuperBowl in Dallas, but Dalla...
Dataset has 7549 rows, 2 columns
               sentiment                                               text
0               positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1               negative  Iranian general says Israel's Iron Dome can't ...
2               positive  with J Davlar 11th. Main rivals are team Polan...
3               negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4               negative  They may have a SuperBowl in Dallas, but Dalla...
5                neutral  Im bringing the monster load of candy tomorrow...
6   objective-OR-neutral  Apple software, retail chiefs o

In [4]:
#Next we need to re-write the neutral / objective labels to all be neutral.  The organizers kept this distinction
#for other tasks, but for this task, it's considered the same.
# so, let's re-write all objective ->neutral, and all neutral-OR-objective --> neutral.

#Since we probably will want our labels numeric (some classifiers may not like 3-way text labels),
#we can do that all now.

df = df.apply(lambda x: x.replace(['positive', 'negative', 'neutral', 'objective', 'objective-OR-neutral']
                                  , [1, -1,0,0,0]) ,1)
print df[:20]

    sentiment                                               text
0           1  Gas by my house hit $3.39!!!! I'm going to Cha...
1          -1  Iranian general says Israel's Iron Dome can't ...
2           1  with J Davlar 11th. Main rivals are team Polan...
3          -1  Talking about ACT's &amp;&amp; SAT's, deciding...
4          -1  They may have a SuperBowl in Dallas, but Dalla...
5           0  Im bringing the monster load of candy tomorrow...
6           0  Apple software, retail chiefs out in overhaul:...
7           1  @oluoch @victor_otti @kunjand I just watched i...
8           0  #Livewire Nadal confirmed for Mexican Open in ...
9           1  @MsSheLahY I didnt want to just pop up... but ...
10          0  @Alyoup005 @addicted2haley hmmmm  November is ...
11          0  #Iran US delisting MKO from global terrorists ...
12          1  Good Morning Becky ! Thursday is going to be F...
13          0  Expect light-moderate rains over E. Visayas; C...
14          1  One ticket

In [5]:
#Let's take a look at our class distribution
total_tweets = len(df)
positive_tweets = sum(df.sentiment == 1)
negative_tweets = sum(df.sentiment == -1)
neutral_tweets = sum(df.sentiment == 0)

print "The total number of samples is : {}".format(len(df.sentiment))
print "There are {} positive tweets or {}%".format \
(positive_tweets, positive_tweets/float(total_tweets) )
print "There are {} Negative tweets or {}%".format \
(negative_tweets, negative_tweets / float(total_tweets))
print "There are {} Neutral tweets or {}%".format \
(neutral_tweets, neutral_tweets/ float(total_tweets))

The total number of samples is : 7549
There are 2820 positive tweets or 0.373559411843%
There are 1066 Negative tweets or 0.141210756392%
There are 3663 Neutral tweets or 0.485229831766%


In [6]:
# Let's load the texts into lists and remove RT's and URls
# we will build a custom function for an individual tweet, 
#and then use Pandas Dataframe.apply() to run it on all tweets.

first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
def parse_tweet (text):
    text = text.split()
    return text
    
parsed_tweet = parse_tweet(first_tweet)
print parsed_tweet
#this results in the most basic splitting operation.  However it gets us very close to what we want.
#In the below output the only concern I have is with "!!!!" attached to "$3.39".  This is not really ideal.    

['Gas', 'by', 'my', 'house', 'hit', '$3.39!!!!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat.', ':)']


In [7]:
#Let's enhance the parser to deal with a few more special cases

#compile regex outside of the function, because we will be running this function in a loop.
retweets = re.compile(r'(RT ?@.*:)')   
urls = re.compile(r'(http:.*\b)')
dotdotdot = re.compile(r'(\.\.\.)')
pound_question = re.compile(r'([!\?])')
period_dot = re.compile(r'(\.(?!\d))')

regex_args = (retweets, urls, dotdotdot, pound_question, period_dot)


def parse_tweet (text , retweets, urls, dotdotdot, pound_question, period_dot):
    text = re.sub(retweets, "", text) #removes RT@thisguy: or RT @thisguy:   two common Retweet bits I dont' need
    text = re.sub(urls, "", text) # removes URL's
    text = re.sub(dotdotdot, ' DOTDOTDOT', text) #replace '...' with "DOTDOTDOT' so i preserve the meaning in that token
    text = re.sub(pound_question, r' \1 ', text)  #eyes bleeding? Searches for ! ? and adds white space around them.
    text = re.sub(period_dot, r' \1 ', text) #more blood.  searched for '.' but looks ahead for digits. will not break 3.39
  
    text = text.split()
    return text


############
#Test cases#
############
first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
RT_tweet_1 = "Cool #cdnpoli RT@angelpike: Call the hospital in Iqaluit &amp; press 2 for English. \
Experience an aboriginal language as 1st choice"
RT_tweet_2 = "For how long, i might be in NJ then?RT @FoolishInApril: @blove402 Thursday Night the 13th of Dec."
URL_tweet = "Get ready for our Wednesday Drink Specials Wednesday - 3-8pm Have it your Way Margarita Day \
( Bar Brand Only)... http://t.co/ml806WRT"

test1 = parse_tweet(first_tweet, *regex_args)
test2 = parse_tweet(RT_tweet_1, *regex_args)
test3 = parse_tweet(RT_tweet_2, *regex_args)
test4 = parse_tweet(URL_tweet, *regex_args)
print test1
print test2
print test3
print test4

['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']
['Cool', '#cdnpoli', 'Call', 'the', 'hospital', 'in', 'Iqaluit', '&amp;', 'press', '2', 'for', 'English', '.', 'Experience', 'an', 'aboriginal', 'language', 'as', '1st', 'choice']
['For', 'how', 'long,', 'i', 'might', 'be', 'in', 'NJ', 'then', '?', '@blove402', 'Thursday', 'Night', 'the', '13th', 'of', 'Dec', '.']
['Get', 'ready', 'for', 'our', 'Wednesday', 'Drink', 'Specials', 'Wednesday', '-', '3-8pm', 'Have', 'it', 'your', 'Way', 'Margarita', 'Day', '(', 'Bar', 'Brand', 'Only)', 'DOTDOTDOT']


In [8]:
# ok, now that we have rough parsing, lets parse them all!

df.text = df.text.apply(lambda x: parse_tweet(x,*regex_args))
print df.head()


   sentiment                                               text
0          1  [Gas, by, my, house, hit, $3.39, !, !, !, !, I...
1         -1  [Iranian, general, says, Israel's, Iron, Dome,...
2          1  [with, J, Davlar, 11th, ., Main, rivals, are, ...
3         -1  [Talking, about, ACT's, &amp;&amp;, SAT's,, de...
4         -1  [They, may, have, a, SuperBowl, in, Dallas,, b...


In [13]:
# now that we have all the tweets parsed, we actually want to split into our training / testing sets. 
# This is because n-gram analysis (which comes next), should not be done on the testing data!  
# The n-gram analysis should on be on training data.  

# TODO try to implement n-gram analysis with cross validation, for now I'll use a hold-out testing set
from sklearn import cross_validation

#Let's split up the labels from the training data

X_all = df['text']
y_all = df['sentiment']

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_all, y_all, test_size=0.25)

print "size of training tweets: ", len(X_train)
print "size of testing tweets: ", len(X_test)

size of training tweets:  5661
size of testing tweets:  1888


In [35]:
# tricked you! we need to merge the labels and parsed tweets for doing our n-gram analysis.
# This is because we will build n-gram models for each class, therefore we need to select only
# those tweets that are positive / negative for the two n-gram tables.

# Let's re-merge the labels into the training data order to do n-gram analysis
XyN_gram = pd.concat([X_train, y_train], axis = 1)

print XyN_gram.head()
print XyN_gram.text[:10]

                                                   text  sentiment
6273  [@Cavillafuertee, Cav, !, See, you, tomorrow, ...          1
3822  [Jungle, Island, says, it, will, be, open, as,...          0
6351  [@DAISIA__Vu, wellllll, friday, imma, be, with...          0
307   [@rinithyme, He'll, be, the, Eli, Manning, of,...          1
5817  [Tragedy, struck, and, collided, in, my, sport...         -1
6273    [@Cavillafuertee, Cav, !, See, you, tomorrow, ...
3822    [Jungle, Island, says, it, will, be, open, as,...
6351    [@DAISIA__Vu, wellllll, friday, imma, be, with...
307     [@rinithyme, He'll, be, the, Eli, Manning, of,...
5817    [Tragedy, struck, and, collided, in, my, sport...
5069                          [ha, !, //t, ., co/Xaw29WA]
6700    [Wicket,, its, getting, worse, for, the, Afgha...
2239    [Listen, in, Hour, 2, for, an, expansive, inte...
4743    [Nicki, Miinaj, Talks, to, Robin, Roberts, on,...
6935    [The, Brick$quad, is, the, Yankees, of, 8th, p...
Name: text, dtype:

In [11]:
#Let's start by developing a function that will take a parsed tweet and output grams of any size

uni_gram_map = {}
bi_gram_map = {}
tri_gram_map = {}

def nGram_counter (parsed_tweet, distance_to_cover, gram_map):
    for_loop_range = range(len(parsed_tweet) - distance_to_cover)    
    for i in for_loop_range:
        gram = tuple(parsed_tweet[i:i+distance_to_cover])
        print "this is the gram:",gram
        if gram in gram_map:
            gram_map[gram] += 1
        else:
            gram_map[gram] = 1


            
nGram_counter(test1, 3, tri_gram_map)
nGram_counter(test1, 2, bi_gram_map)
nGram_counter(test1, 1, uni_gram_map)

#Our output should be a dictionaries of all possible tri-grams, bi-grams and unigrams of the tweet
#"Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"

# if a particular gram exists more than once in a tweet, the counter should have incremented.  We see an example of this
# with the token "!!!!" which is parsed into "!!" 3 times. and "!!!" twice. (it overlaps).

print uni_gram_map
print " "
print bi_gram_map
print " "
print tri_gram_map
print " "

this is the gram: ('Gas', 'by', 'my')
this is the gram: ('by', 'my', 'house')
this is the gram: ('my', 'house', 'hit')
this is the gram: ('house', 'hit', '$3.39')
this is the gram: ('hit', '$3.39', '!')
this is the gram: ('$3.39', '!', '!')
this is the gram: ('!', '!', '!')
this is the gram: ('!', '!', '!')
this is the gram: ('!', '!', "I'm")
this is the gram: ('!', "I'm", 'going')
this is the gram: ("I'm", 'going', 'to')
this is the gram: ('going', 'to', 'Chapel')
this is the gram: ('to', 'Chapel', 'Hill')
this is the gram: ('Chapel', 'Hill', 'on')
this is the gram: ('Hill', 'on', 'Sat')
this is the gram: ('on', 'Sat', '.')
this is the gram: ('Gas', 'by')
this is the gram: ('by', 'my')
this is the gram: ('my', 'house')
this is the gram: ('house', 'hit')
this is the gram: ('hit', '$3.39')
this is the gram: ('$3.39', '!')
this is the gram: ('!', '!')
this is the gram: ('!', '!')
this is the gram: ('!', '!')
this is the gram: ('!', "I'm")
this is the gram: ("I'm", 'going')
this is the gr

In [37]:
#let's now apply our n-gram counter to all the tweets of a certain class.
# Let's make the positive n_gram map, on the training data.

def print_tweet(text):
    print "the type on 'text' is ",type(text)
    print text

pos_tweets = XyN_gram[XyN_gram.sentiment == 1][:10]
print pos_tweets
pos_tweets.apply(print_tweet(pos_tweets.text))

#pos_tweets.apply(nGram_counter(pos_tweets.text, 3, tri_gram_map))
#print tri_gram_map


#XyN_gram[XyN_gram.sentiment == 1].apply(nGram_counter(XyN_gram.text, 3, tri_gram_map))
#print tri_gram_map


    
    

                                                   text  sentiment
6273  [@Cavillafuertee, Cav, !, See, you, tomorrow, ...          1
307   [@rinithyme, He'll, be, the, Eli, Manning, of,...          1
2239  [Listen, in, Hour, 2, for, an, expansive, inte...          1
2628  [Well,, I, haven't, done, any, homework, in, t...          1
616   [Celebrity, Juice, is, the, highlight, of, my,...          1
6882  [@tomhanks, Youve, done, Bob's, Great,, The, P...          1
1681  [@lovinjakemore, You, bet, !, And, @chuckonthe...          1
4766  [Nicki, Minaj, flashes, a, nipple, on, Good, M...          1
5078  [@laurenbuckenham, guess, what, im, working, i...          1
943   [@JoeTribe_2012, runner, on, 2nd, no, outs,, d...          1
the type on 'text' is  <class 'pandas.core.series.Series'>
6273    [@Cavillafuertee, Cav, !, See, you, tomorrow, ...
307     [@rinithyme, He'll, be, the, Eli, Manning, of,...
2239    [Listen, in, Hour, 2, for, an, expansive, inte...
2628    [Well,, I, haven't, do

TypeError: ("'NoneType' object is not callable", u'occurred at index text')