In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


# Tell iPython to include plots inline in the notebook
%matplotlib inline

data = pd.read_csv("downloaded2.tsv", sep = '\t')
print "Dataset has {} rows, {} columns".format(*data.shape)
print data.head()  # print the first 5 rows

Dataset has 9665 rows, 4 columns
                   id   tweet-id             sentiment  \
0  264183816548130816   15140428              positive   
1  263405084770172928  591166521              negative   
2  262163168678248449   35266263              negative   
3  264249301910310912   18516728              negative   
4  262682041215234048  254373818  objective-OR-neutral   

                                                text  
0  Gas by my house hit $3.39!!!! I'm going to Cha...  
1                                      Not Available  
2                                      Not Available  
3  Iranian general says Israel's Iron Dome can't ...  
4                                      Not Available  


In [2]:
#Let's drop the id-columns, they were used to download the twitter data, with twitter API.
df = data.drop(data.columns[[0,1]], axis=1)
print df.head()

              sentiment                                               text
0              positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1              negative                                      Not Available
2              negative                                      Not Available
3              negative  Iranian general says Israel's Iron Dome can't ...
4  objective-OR-neutral                                      Not Available


In [3]:
#Now let's drop all the rows in which the tweet was no longer available.
df = df[df.text != "Not Available"]
df = df.reset_index(drop=True) #reset the index after dropping the above rows
print df.head()
print "Dataset has {} rows, {} columns".format(*df.shape)
print df[:20]

  sentiment                                               text
0  positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1  negative  Iranian general says Israel's Iron Dome can't ...
2  positive  with J Davlar 11th. Main rivals are team Polan...
3  negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4  negative  They may have a SuperBowl in Dallas, but Dalla...
Dataset has 7549 rows, 2 columns
               sentiment                                               text
0               positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1               negative  Iranian general says Israel's Iron Dome can't ...
2               positive  with J Davlar 11th. Main rivals are team Polan...
3               negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4               negative  They may have a SuperBowl in Dallas, but Dalla...
5                neutral  Im bringing the monster load of candy tomorrow...
6   objective-OR-neutral  Apple software, retail chiefs o

In [4]:
#Next we need to re-write the neutral / objective labels to all be neutral.  The organizers kept this distinction
#for other tasks, but for this task, it's considered the same.
# so, let's re-write all objective ->neutral, and all neutral-OR-objective --> neutral.

#Since we probably will want our labels numeric (some classifiers may not like 3-way text labels),
#we can do that all now.
df = df.apply(lambda x: x.replace(['positive', 'negative', 'neutral', 'objective', 'objective-OR-neutral']
                                  , [1, -1,0,0,0]) ,1)
print df[:20]

    sentiment                                               text
0           1  Gas by my house hit $3.39!!!! I'm going to Cha...
1          -1  Iranian general says Israel's Iron Dome can't ...
2           1  with J Davlar 11th. Main rivals are team Polan...
3          -1  Talking about ACT's &amp;&amp; SAT's, deciding...
4          -1  They may have a SuperBowl in Dallas, but Dalla...
5           0  Im bringing the monster load of candy tomorrow...
6           0  Apple software, retail chiefs out in overhaul:...
7           1  @oluoch @victor_otti @kunjand I just watched i...
8           0  #Livewire Nadal confirmed for Mexican Open in ...
9           1  @MsSheLahY I didnt want to just pop up... but ...
10          0  @Alyoup005 @addicted2haley hmmmm  November is ...
11          0  #Iran US delisting MKO from global terrorists ...
12          1  Good Morning Becky ! Thursday is going to be F...
13          0  Expect light-moderate rains over E. Visayas; C...
14          1  One ticket

In [5]:
#Let's take a look at our class distribution
total_tweets = len(df)
positive_tweets = sum(df.sentiment == 1)
negative_tweets = sum(df.sentiment == -1)
neutral_tweets = sum(df.sentiment == 0)

print "The total number of samples is : {}".format(len(df.sentiment))
print "There are {} positive tweets or {}%".format \
(positive_tweets, positive_tweets/float(total_tweets) )
print "There are {} Negative tweets or {}%".format \
(negative_tweets, negative_tweets / float(total_tweets))
print "There are {} Neutral tweets or {}%".format \
(neutral_tweets, neutral_tweets/ float(total_tweets))

The total number of samples is : 7549
There are 2820 positive tweets or 0.373559411843%
There are 1066 Negative tweets or 0.141210756392%
There are 3663 Neutral tweets or 0.485229831766%


In [6]:
# Let's load the texts into lists and remove RT's and URls
# we will build a custom function for an individual tweet, 
#and then use Pandas Dataframe.apply() to run it on all tweets.

first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
def parse_tweet (text):
    text = text.split()
    return text
    
parsed_tweet = parse_tweet(first_tweet)
print parsed_tweet
#this results in the most basic splitting operation.  However it gets us very close to what we want.
#In the below output the only concern I have is with "!!!!" attached to "$3.39".  This is not really ideal.    

['Gas', 'by', 'my', 'house', 'hit', '$3.39!!!!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat.', ':)']


In [62]:
#Let's enhance the parser to deal with a few more special cases
def parse_tweet (text):
    
    retweets = re.compile(r'(RT ?@.*:)')
    urls = re.compile(r'(http:.*\b)')
    dotdotdot = re.compile(r'(\.\.\.)')
    pound_question = re.compile(r'([!\?])')
    period_dot = re.compile(r'(\.(?!\d))')
    
      
    text = re.sub(retweets, "", text) #removes RT@thisguy: or RT @thisguy:   two common Retweet bits I dont' need
    text = re.sub(urls, "", text) # removes URL's
    text = re.sub((\.\.\.)', ' DOTDOTDOT', text) #replace '...' with "DOTDOTDOT' so i preserve the meaning in that token
    text = re.sub(r'([!\?])', r' \1 ', text)  #eyes bleeding? Searches for ! ? and adds white space around them.
    text = re.sub(r'(\.(?!\d))', r' \1 ', text) #more blood.  searched for '.' but looks ahead for digits. will not break 3.39
  
    text = text.split()
    return text
 
first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
RT_tweet_1 = "Cool #cdnpoli RT@angelpike: Call the hospital in Iqaluit &amp; press 2 for English. \
Experience an aboriginal language as 1st choice"
RT_tweet_2 = "For how long, i might be in NJ then?RT @FoolishInApril: @blove402 Thursday Night the 13th of Dec."
URL_tweet = "Get ready for our Wednesday Drink Specials Wednesday - 3-8pm Have it your Way Margarita Day \
( Bar Brand Only)... http://t.co/ml806WRT"

test1 = parse_tweet(first_tweet)
test2 = parse_tweet(RT_tweet_1)
test3 = parse_tweet(RT_tweet_2)
test4 = parse_tweet(URL_tweet)
print test1
print test2
print test3
print test4

['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']
['Cool', '#cdnpoli', 'Call', 'the', 'hospital', 'in', 'Iqaluit', '&amp;', 'press', '2', 'for', 'English', '.', 'Experience', 'an', 'aboriginal', 'language', 'as', '1st', 'choice']
['For', 'how', 'long,', 'i', 'might', 'be', 'in', 'NJ', 'then', '?', '@blove402', 'Thursday', 'Night', 'the', '13th', 'of', 'Dec', '.']
['Get', 'ready', 'for', 'our', 'Wednesday', 'Drink', 'Specials', 'Wednesday', '-', '3-8pm', 'Have', 'it', 'your', 'Way', 'Margarita', 'Day', '(', 'Bar', 'Brand', 'Only)', 'DOTDOTDOT']


In [None]:
retweets = re.compile(r'(RT ?@.*:)')
urls = re.compile(r'(http:.*\b)')
dotdotdot = re.compile(r'(\.\.\.)')
pound_question = re.compile(r'([!\?])')
period_dot = re.compile(r'(\.(?!\d))')