In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


# Tell iPython to include plots inline in the notebook
%matplotlib inline

data = pd.read_csv("downloaded2.tsv", sep = '\t')
print "Dataset has {} rows, {} columns".format(*data.shape)
print data.head()  # print the first 5 rows

Dataset has 9665 rows, 4 columns
                   id   tweet-id             sentiment  \
0  264183816548130816   15140428              positive   
1  263405084770172928  591166521              negative   
2  262163168678248449   35266263              negative   
3  264249301910310912   18516728              negative   
4  262682041215234048  254373818  objective-OR-neutral   

                                                text  
0  Gas by my house hit $3.39!!!! I'm going to Cha...  
1                                      Not Available  
2                                      Not Available  
3  Iranian general says Israel's Iron Dome can't ...  
4                                      Not Available  


In [2]:
#Let's drop the id-columns, they were used to download the twitter data, with twitter API.
df = data.drop(data.columns[[0,1]], axis=1)
print df.head()

              sentiment                                               text
0              positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1              negative                                      Not Available
2              negative                                      Not Available
3              negative  Iranian general says Israel's Iron Dome can't ...
4  objective-OR-neutral                                      Not Available


In [3]:
#Now let's drop all the rows in which the tweet was no longer available.
df = df[df.text != "Not Available"]
df = df.reset_index(drop=True) #reset the index after dropping the above rows
print df.head()
print "Dataset has {} rows, {} columns".format(*df.shape)
print df[:20]

  sentiment                                               text
0  positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1  negative  Iranian general says Israel's Iron Dome can't ...
2  positive  with J Davlar 11th. Main rivals are team Polan...
3  negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4  negative  They may have a SuperBowl in Dallas, but Dalla...
Dataset has 7549 rows, 2 columns
               sentiment                                               text
0               positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1               negative  Iranian general says Israel's Iron Dome can't ...
2               positive  with J Davlar 11th. Main rivals are team Polan...
3               negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4               negative  They may have a SuperBowl in Dallas, but Dalla...
5                neutral  Im bringing the monster load of candy tomorrow...
6   objective-OR-neutral  Apple software, retail chiefs o

In [4]:
#Next we need to re-write the neutral / objective labels to all be neutral.  The organizers kept this distinction
#for other tasks, but for this task, it's considered the same.
# so, let's re-write all objective ->neutral, and all neutral-OR-objective --> neutral.

#Since we probably will want our labels numeric (some classifiers may not like 3-way text labels),
#we can do that all now.

df = df.apply(lambda x: x.replace(['positive', 'negative', 'neutral', 'objective', 'objective-OR-neutral']
                                  , [1, 0,0,0,0]) ,1)
print df[:20]

    sentiment                                               text
0           1  Gas by my house hit $3.39!!!! I'm going to Cha...
1           0  Iranian general says Israel's Iron Dome can't ...
2           1  with J Davlar 11th. Main rivals are team Polan...
3           0  Talking about ACT's &amp;&amp; SAT's, deciding...
4           0  They may have a SuperBowl in Dallas, but Dalla...
5           0  Im bringing the monster load of candy tomorrow...
6           0  Apple software, retail chiefs out in overhaul:...
7           1  @oluoch @victor_otti @kunjand I just watched i...
8           0  #Livewire Nadal confirmed for Mexican Open in ...
9           1  @MsSheLahY I didnt want to just pop up... but ...
10          0  @Alyoup005 @addicted2haley hmmmm  November is ...
11          0  #Iran US delisting MKO from global terrorists ...
12          1  Good Morning Becky ! Thursday is going to be F...
13          0  Expect light-moderate rains over E. Visayas; C...
14          1  One ticket

In [5]:
#Let's take a look at our class distribution
total_tweets = len(df)
positive_tweets = sum(df.sentiment == 1)
negative_tweets = sum(df.sentiment == -1)
neutral_tweets = sum(df.sentiment == 0)

print "The total number of samples is : {}".format(len(df.sentiment))
print "There are {} positive tweets or {}%".format \
(positive_tweets, positive_tweets/float(total_tweets) )
print "There are {} Negative tweets or {}%".format \
(negative_tweets, negative_tweets / float(total_tweets))
print "There are {} Neutral tweets or {}%".format \
(neutral_tweets, neutral_tweets/ float(total_tweets))

The total number of samples is : 7549
There are 2820 positive tweets or 0.373559411843%
There are 0 Negative tweets or 0.0%
There are 4729 Neutral tweets or 0.626440588157%


In [6]:
# Let's load the texts into lists and remove RT's and URls
# we will build a custom function for an individual tweet, 
#and then use Pandas Dataframe.apply() to run it on all tweets.

first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
def parse_tweet (text):
    text = text.split()
    return text
    
parsed_tweet = parse_tweet(first_tweet)
print parsed_tweet
#this results in the most basic splitting operation.  However it gets us very close to what we want.
#In the below output the only concern I have is with "!!!!" attached to "$3.39".  This is not really ideal.    

['Gas', 'by', 'my', 'house', 'hit', '$3.39!!!!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat.', ':)']


In [7]:
#Let's enhance the parser to deal with a few more special cases

#compile regex outside of the function, because we will be running this function in a loop.
retweets = re.compile(r'(RT ?@.*:)')   
urls = re.compile(r'(http:.*\b)')
dotdotdot = re.compile(r'(\.\.\.)')
pound_question = re.compile(r'([!\?])')
period_dot = re.compile(r'(\.(?!\d))')

regex_args = (retweets, urls, dotdotdot, pound_question, period_dot)


def parse_tweet (text , retweets, urls, dotdotdot, pound_question, period_dot):
    text = re.sub(retweets, "", text) #removes RT@thisguy: or RT @thisguy:   two common Retweet bits I dont' need
    text = re.sub(urls, "", text) # removes URL's
    text = re.sub(dotdotdot, ' DOTDOTDOT ', text) #replace '...' with "DOTDOTDOT' so i preserve the meaning in that token
    text = re.sub(pound_question, r' \1 ', text)  #eyes bleeding? Searches for ! ? and adds white space around them.
    text = re.sub(period_dot, r' \1 ', text) #more blood.  searched for '.' but looks ahead for digits. will not break 3.39
  
    text = text.split()
    return text


############
#Test cases#
############
first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
RT_tweet_1 = "Cool #cdnpoli RT@angelpike: Call the hospital in Iqaluit &amp; press 2 for English. \
Experience an aboriginal language as 1st choice"
RT_tweet_2 = "For how long, i might be in NJ then?RT @FoolishInApril: @blove402 Thursday Night the 13th of Dec."
URL_tweet = "Get ready for our Wednesday Drink Specials Wednesday - 3-8pm Have it your Way Margarita Day \
( Bar Brand Only)... http://t.co/ml806WRT"

test1 = parse_tweet(first_tweet, *regex_args)
test2 = parse_tweet(RT_tweet_1, *regex_args)
test3 = parse_tweet(RT_tweet_2, *regex_args)
test4 = parse_tweet(URL_tweet, *regex_args)
print test1
print test2
print test3
print test4

['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']
['Cool', '#cdnpoli', 'Call', 'the', 'hospital', 'in', 'Iqaluit', '&amp;', 'press', '2', 'for', 'English', '.', 'Experience', 'an', 'aboriginal', 'language', 'as', '1st', 'choice']
['For', 'how', 'long,', 'i', 'might', 'be', 'in', 'NJ', 'then', '?', '@blove402', 'Thursday', 'Night', 'the', '13th', 'of', 'Dec', '.']
['Get', 'ready', 'for', 'our', 'Wednesday', 'Drink', 'Specials', 'Wednesday', '-', '3-8pm', 'Have', 'it', 'your', 'Way', 'Margarita', 'Day', '(', 'Bar', 'Brand', 'Only)', 'DOTDOTDOT']


In [8]:
# ok, now that we have rough parsing, lets parse them all!
df.text = df.text.apply(lambda x: parse_tweet(x,*regex_args))
print df.head()
print df.shape

   sentiment                                               text
0          1  [Gas, by, my, house, hit, $3.39, !, !, !, !, I...
1          0  [Iranian, general, says, Israel's, Iron, Dome,...
2          1  [with, J, Davlar, 11th, ., Main, rivals, are, ...
3          0  [Talking, about, ACT's, &amp;&amp;, SAT's,, de...
4          0  [They, may, have, a, SuperBowl, in, Dallas,, b...
(7549, 2)


In [9]:
#drop some tweets that got parsed to zero
df = df[df['text'].map(len) >= 1]
df.shape


(7536, 2)

In [10]:
# now that we have all the tweets parsed, we actually want to split into our training / testing sets. 
# This is because n-gram analysis (which comes next), should not be done on the testing data!  
# The n-gram analysis should on be on training data.  

# TODO try to implement n-gram analysis with cross validation, for now I'll use a hold-out testing set
from sklearn import cross_validation

#Let's split up the labels from the training data

X_all = df['text']
y_all = df['sentiment']

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_all, y_all, test_size=0.25, stratify = y_all)

print "size of training tweets: ", len(X_train)
print "size of testing tweets: ", len(X_test)

size of training tweets:  5652
size of testing tweets:  1884


In [11]:
# tricked you! we need to merge the labels and parsed tweets for doing our n-gram analysis.
# This is because we will build n-gram models for each class, therefore we need to select only
# those tweets that are positive / negative for the two n-gram tables.

# Let's re-merge the labels into the training data order to do n-gram analysis
XyN_gram = pd.concat([X_train, y_train], axis = 1)

print XyN_gram.head()
print XyN_gram.text[:10]

                                                   text  sentiment
4265  [The, Exchange, Club, of, Winona, and, the, Ea...          0
4495  [Tracking, Texas, A&M, Football, Players, in, ...          1
7236  [Rattlestick, recommends, IMAGINING, MADOFF, !...          0
5105  [SHATING, RT, @LeighFrancis, Celebrity, Juice,...          1
331   [@fueltv, @danaehite, what, about, Holland, ?,...          0
4265    [The, Exchange, Club, of, Winona, and, the, Ea...
4495    [Tracking, Texas, A&M, Football, Players, in, ...
7236    [Rattlestick, recommends, IMAGINING, MADOFF, !...
5105    [SHATING, RT, @LeighFrancis, Celebrity, Juice,...
331     [@fueltv, @danaehite, what, about, Holland, ?,...
5987    [WILDCAT, WORKOUT, OCTOBER, 31st, in, the, old...
886     [#torchlight2, comes, out, tomorrow, ., Torchl...
5231    [@MikeDavis88, Pls, tell, Dale, Jr, and, the, ...
5227    [If, you, thought, the, London, Riots, were, b...
1352    [Mizzou, moves, back, the, start, time, for, t...
Name: text, dtype:

In [12]:
#Let's start by developing a function that will take a parsed tweet and output grams of any size

uni_gram_map = {}
bi_gram_map = {}
tri_gram_map = {}

def nGram_counter (parsed_tweet, distance_to_cover, gram_map):
    for_loop_range = range(len(parsed_tweet) - distance_to_cover)    
    for i in for_loop_range:
        gram = tuple(parsed_tweet[i:i+distance_to_cover])
        if gram in gram_map:
            gram_map[gram] += 1
        else:
            gram_map[gram] = 1


            
nGram_counter(test1, 3, tri_gram_map)
nGram_counter(test1, 2, bi_gram_map)
nGram_counter(test1, 1, uni_gram_map)

#Our output should be a dictionaries of all possible tri-grams, bi-grams and unigrams of the tweet
#"Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"

# if a particular gram exists more than once in a tweet, the counter should have incremented.  We see an example of this
# with the token "!!!!" which is parsed into "!!" 3 times. and "!!!" twice. (it overlaps).


# check our output.
print uni_gram_map
print " "
print bi_gram_map
print " "
print tri_gram_map
print " "

{('.',): 1, ("I'm",): 1, ('Hill',): 1, ('!',): 4, ('on',): 1, ('my',): 1, ('$3.39',): 1, ('by',): 1, ('Sat',): 1, ('house',): 1, ('hit',): 1, ('Chapel',): 1, ('to',): 1, ('Gas',): 1, ('going',): 1}
 
{('by', 'my'): 1, ('!', '!'): 3, ('!', "I'm"): 1, ('Hill', 'on'): 1, ('going', 'to'): 1, ('my', 'house'): 1, ('Sat', '.'): 1, ('to', 'Chapel'): 1, ('$3.39', '!'): 1, ('house', 'hit'): 1, ('Gas', 'by'): 1, ('hit', '$3.39'): 1, ('Chapel', 'Hill'): 1, ("I'm", 'going'): 1, ('on', 'Sat'): 1}
 
{('my', 'house', 'hit'): 1, ("I'm", 'going', 'to'): 1, ('Gas', 'by', 'my'): 1, ('on', 'Sat', '.'): 1, ('!', '!', "I'm"): 1, ('house', 'hit', '$3.39'): 1, ('!', "I'm", 'going'): 1, ('going', 'to', 'Chapel'): 1, ('!', '!', '!'): 2, ('Chapel', 'Hill', 'on'): 1, ('hit', '$3.39', '!'): 1, ('Hill', 'on', 'Sat'): 1, ('to', 'Chapel', 'Hill'): 1, ('$3.39', '!', '!'): 1, ('by', 'my', 'house'): 1}
 


In [13]:
#let's now apply our n-gram counter to all the tweets of a certain class.
# Let's make the positive n_gram map, on the training data.

#for now I will merge all grams into a single map, maybe harder for stats later, but easier for coding now

#setup n-gram maps.
pos_uni_gram_map ={}
pos_bi_gram_map = {}
pos_tri_gram_map = {}

pos_tweets = XyN_gram[XyN_gram.sentiment == 1]

pos_tweets.apply(lambda x: nGram_counter(x.text, 1, pos_uni_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 2, pos_bi_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 3, pos_tri_gram_map), 1)
print "Total Unigrams for Positive Tweets : {}".format(len(pos_uni_gram_map))
print 
print "Total Bi-grams for Positive Tweets: {}".format(len(pos_bi_gram_map))
print
print "Total Tri-grams for Positive Tweets: {}".format(len(pos_tri_gram_map))
print
print "Most popular Positive Uni-grams : {}" \
.format(sorted(pos_uni_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])
print
print "Most Popular Positive  Bi-gams : {}" \
.format(sorted(pos_bi_gram_map.items(), key = lambda x: x[1], reverse = True)[:30])
print
print "Most popular Positive Tri-grams : {}" \
.format(sorted(pos_tri_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])

Total Unigrams for Positive Tweets : 10232

Total Bi-grams for Positive Tweets: 29266

Total Tri-grams for Positive Tweets: 36046

Most popular Positive Uni-grams : [(('the',), 1512), (('.',), 1502), (('!',), 1464), (('to',), 984), (('on',), 615), (('in',), 592), (('and',), 568), (('I',), 548), (('a',), 546), (('for',), 508), (('of',), 446), (('is',), 386), (('at',), 381), (('you',), 366), (('be',), 351), (('tomorrow',), 334), (('with',), 332), (('it',), 311), (('my',), 250), (('DOTDOTDOT',), 236), (('have',), 182), (('?',), 181), (('that',), 178), (('night',), 170), (('this',), 164), (('will',), 151), (('see',), 146), (('may',), 139), (('tonight',), 138), (('The',), 134)]

Most Popular Positive  Bi-gams : [(('!', '!'), 419), (('in', 'the'), 135), (('for', 'the'), 121), (('of', 'the'), 115), (('on', 'the'), 108), (('at', 'the'), 106), (('to', 'see'), 93), (('to', 'the'), 92), (('going', 'to'), 90), (('.', '.'), 85), (('.', 'I'), 83), (('to', 'be'), 64), (('will', 'be'), 63), (('is', 't

ok, lets use these gram maps to create some features finally.
so what we want to do is :

take each tweet.text and calculate the probability of that tweet existing as a positive tweet.  
we can use this feature to construct our first classifier, for positive tweets.
let's start by defining a function that calculates the probability of a tweet.  
I will need to include smoothing, normalization and worry about over / underflow.
actually the very first step is to transform our maps into maximum likliehood probabilities.

In [14]:
# maximum likliehood probabilities for positive grams.
# we will calculate maximum likliehood with smoothing, will use simple k-smoothing, with k = 1


def calculate_maximum_likliehood (gram_map, k_smoothing = 1, Prior_map = None):
    MLE_estimates = {}
    total_unique_grams = len(gram_map) # this is V for smoothing 
    total_gram_count = sum(gram_map.values())
    
    if Prior_map != None:
        total_prior_gram_count = sum(Prior_map.values()) # also V for smoothing on conditioned grams
    
    #figure out what kind of gram-map we have
    keys = gram_map.keys()
    if len(keys[0]) == 1: # we have unigrams
        for key in keys:
            MLE_estimates[key] = (gram_map[key]+ k_smoothing) / \
            float(total_unique_grams + k_smoothing * total_gram_count)
            # above will give MLE with smoothing = 1
                
    elif len(keys[0]) == 2: # This means we want to condition on previous uni gram
        for key in keys:
            MLE_estimates[key] = (gram_map[key] + k_smoothing) / \
            float(Prior_map[key[0],] + k_smoothing * total_prior_gram_count)
    else: #should be 3 size, so condition on previous bi-gram
        for key in keys:
            MLE_estimates[key] = (gram_map[key] + k_smoothing) / \
            float(Prior_map[key[:2]] + k_smoothing * total_prior_gram_count)
            

    return MLE_estimates

MLE_pos_uni_gram = calculate_maximum_likliehood(pos_uni_gram_map , 1.0)
MLE_pos_bi_gram = calculate_maximum_likliehood(pos_bi_gram_map, 1.0, Prior_map=pos_uni_gram_map)
MLE_pos_tri_gram = calculate_maximum_likliehood(pos_tri_gram_map, 1.0, Prior_map=pos_bi_gram_map)

## sanity checks
print len(MLE_pos_uni_gram) == len(pos_uni_gram_map)
print len(MLE_pos_bi_gram) == len(pos_bi_gram_map)
print len(MLE_pos_tri_gram) == len(pos_tri_gram_map)

# should look reasonble?
print MLE_pos_bi_gram.values()[:10]

True
True
True
[4.5815866034407715e-05, 4.6029919447640965e-05, 4.595377050687009e-05, 4.621819610380607e-05, 4.583161464778404e-05, 4.610632117663332e-05, 0.0003194888178913738, 4.621819610380607e-05, 4.470672389127325e-05, 4.588945230938669e-05]


Ok, now we have MLE for all the training data.  This is n-gram analysis on the positive data.
NExt we need to reparse all the tweets, looking up their values in the MLE_pos gram maps.
Take log probabilities of everything If a gram doesn't exist in the correct place, then we'll use smoothing.

In [15]:
import math

def v_plus_n(grams):
    total_unique_grams = len(grams) # this is V for smoothing 
    total_gram_count = sum(grams.values()) #this is N
    return float(total_unique_grams + total_gram_count)


def positive_probability_calculator (parsed_tweet, gram_size):
    
    if len(parsed_tweet) <1:  #this will catch any empty tweets I missed earlier.
        return "NaN"
    
    # access the gram maps we've calculated before
    global MLE_pos_uni_gram
    global MLE_pos_bi_gram
    global MLE_pos_tri_gram
    
    global pos_uni_gram_map
    global pos_bi_gram_map
    
    uni_VplusN = v_plus_n(pos_uni_gram_map) # will use these values in smoothing
    bi_VplusN = v_plus_n(pos_bi_gram_map)
    tri_VplusN = v_plus_n(pos_tri_gram_map)
        
    # gram_map should correspond to gram_size i.E bi-grams, or tri-grams etc.
    loop_range = range(len(parsed_tweet) - gram_size)
    prob = 0
    
    if gram_size == 1: #unigrams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_uni_gram: #look up the probability value we've already calculated
                prob += math.log(MLE_pos_uni_gram[gram])
            else:  #it's unseen so create a new probability with k-smoothing
                prob += math.log( 1.0 / uni_VplusN )  
    
    if gram_size == 2: #bi-grams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_bi_gram:
                prob += math.log(MLE_pos_bi_gram[gram])  #look up probability we've calculated
            
            else:  #condition the unseen bi-gram on the seen unigram.
                if (gram[0],) in pos_uni_gram_map:
                    prob += math.log( 1.0 / (pos_uni_gram_map[gram[0],] + len(pos_uni_gram_map)))  
                    
                    # so if gram = ('this','cat'), and we have never seen that before.  we are
                    # getting a probability that is: 1 / count('this') + count(unique_single grams)
                    #obviously close to zero.  ....
                else: #then even the first part of this unseen bigram is not the unigram database, just do V+N
                    prob += math.log(1.0 / bi_VplusN)
    
    if gram_size == 3: #tri-grams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_tri_gram:
                prob += math.log(MLE_pos_tri_gram[gram]) # look up prob we've already calculated
            
            else:
                if gram[:2] in pos_bi_gram_map:
                    prob += math.log( 1.0 / (pos_bi_gram_map[gram[:2]] + len(pos_bi_gram_map)))
                else:
                    prob += math.log(1.0 / tri_VplusN)
                             
    probability = math.exp(prob) / len(parsed_tweet) # normalize by the number of grams in the tweet.
    return probability
   

test_tweet = ['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']

print positive_probability_calculator(test_tweet,1)
print positive_probability_calculator(test_tweet,2)
print positive_probability_calculator(test_tweet,3)

5.17598361006e-52
1.24490995544e-63
2.85988359835e-65


now we want to make features using the probability calculator!! time to finally get positive probability features for all our tweets.  both training and testing need them.

In [16]:
X_trainy = pd.DataFrame(X_train)  #have to convert the Series into a dataframe, in order to add columns
X_trainy['POS-uni'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x, 1),1)
X_trainy['POS-bi'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_trainy['POS-tri'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_trainy.head()

                                                   text       POS-uni  \
4265  [The, Exchange, Club, of, Winona, and, the, Ea...  5.352197e-86   
4495  [Tracking, Texas, A&M, Football, Players, in, ...  1.259233e-71   
7236  [Rattlestick, recommends, IMAGINING, MADOFF, !...  3.270087e-67   
5105  [SHATING, RT, @LeighFrancis, Celebrity, Juice,...  2.502038e-54   
331   [@fueltv, @danaehite, what, about, Holland, ?,...  9.403316e-71   

             POS-bi        POS-tri  
4265  2.171768e-102  2.036079e-111  
4495   8.528411e-82   9.987253e-84  
7236   6.694948e-81   2.593735e-86  
5105   5.080316e-64   2.770795e-60  
331    2.317764e-80   4.280052e-84  


In [17]:
X_testy = pd.DataFrame(X_test) #have to convert the Series into a dataframe, in order to add columns
X_testy['POS-uni'] = X_testy.text.apply(lambda x: positive_probability_calculator(x, 1), 1)
X_testy['POS-bi'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_testy['POS-tri'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_testy.head()

                                                   text       POS-uni  \
6817  [Belated, post:, I, actually, really, liked, t...  1.456620e-80   
6782  [Rediff, Live, !, Hundreds, of, Afghans, on, T...  1.691035e-63   
7027  [I, still, remember, those, days, when, Demi, ...  1.648694e-88   
2234  [@Desdemonaous, The, sun, does, not, rise, in,...  1.149490e-84   
7354  [Capt, and, Chloe, were, at, the, Fall, For, H...  2.776724e-77   

             POS-bi        POS-tri  
6817   6.209989e-92   2.255553e-98  
6782   1.473152e-74   1.862555e-81  
7027  9.692981e-107  2.063078e-116  
2234  4.015162e-101  2.891341e-111  
7354   3.048318e-92  3.561305e-101  


Alright, we can now make a classifier with these features.  This classifier will predict positive labels.  Let's try a few classification algorithms

In [18]:
#First let's drop the text tweets, they aren't helpful in actual classification
X_trainy = X_trainy.drop(X_trainy.columns[0], axis =1)
print X_trainy.head()
X_testy = X_testy.drop(X_testy.columns[0], axis =1)
print X_testy.head()


           POS-uni         POS-bi        POS-tri
4265  5.352197e-86  2.171768e-102  2.036079e-111
4495  1.259233e-71   8.528411e-82   9.987253e-84
7236  3.270087e-67   6.694948e-81   2.593735e-86
5105  2.502038e-54   5.080316e-64   2.770795e-60
331   9.403316e-71   2.317764e-80   4.280052e-84
           POS-uni         POS-bi        POS-tri
6817  1.456620e-80   6.209989e-92   2.255553e-98
6782  1.691035e-63   1.473152e-74   1.862555e-81
7027  1.648694e-88  9.692981e-107  2.063078e-116
2234  1.149490e-84  4.015162e-101  2.891341e-111
7354  2.776724e-77   3.048318e-92  3.561305e-101


In [22]:
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn import tree
from sklearn.metrics import confusion_matrix


clf = tree.DecisionTreeClassifier()
def basic(clf):
    clf.fit(X_trainy, y_train)
    y_pred = clf.predict(X_testy)
    F1_score = f1_score(y_test, y_pred)
    conf = confusion_matrix(y_test, y_pred)
    print conf
    print F1_score
    
basic(clf)
print X_testy.shape
print X_trainy.shape

[[1179    0]
 [ 705    0]]
0.0
(1884, 3)
(5652, 3)


In [20]:
clf = svm.SVC()
basic(clf)

[[1179    0]
 [ 705    0]]
0.0


In [21]:
clf = 

SyntaxError: invalid syntax (<ipython-input-21-d5c36b780f5f>, line 1)