In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


# Tell iPython to include plots inline in the notebook
%matplotlib inline

data = pd.read_csv('/Users/Amar/PycharmProjects/MLND_projects/Capstone/data/first-GOP-debate/Sentiment.csv')
print "Dataset has {} rows, {} columns".format(*data.shape)
print data.head()  # print the first 5 rows

Dataset has 13871 rows, 21 columns
    id               candidate  candidate_confidence relevant_yn  \
0    1  No candidate mentioned                     1         yes   
1    2            Scott Walker                     1         yes   
2    3  No candidate mentioned                     1         yes   
3    4  No candidate mentioned                     1         yes   
4    5            Donald Trump                     1         yes   

   relevant_yn_confidence sentiment  sentiment_confidence     subject_matter  \
0                       1   Neutral                0.6578  None of the above   
1                       1  Positive                0.6333  None of the above   
2                       1   Neutral                0.6629  None of the above   
3                       1  Positive                1.0000  None of the above   
4                       1  Positive                0.7045  None of the above   

   subject_matter_confidence candidate_gold             ...              \


In [2]:
#A lot of information we really aren't interested in, so take a look at the columns with their index.
header_index = [(i,z) for z,i in enumerate(data.columns.view())]
print header_index

[(' id', 0), ('candidate', 1), ('candidate_confidence', 2), ('relevant_yn', 3), ('relevant_yn_confidence', 4), ('sentiment', 5), ('sentiment_confidence', 6), ('subject_matter', 7), ('subject_matter_confidence', 8), ('candidate_gold', 9), ('name', 10), ('relevant_yn_gold', 11), ('retweet_count', 12), ('sentiment_gold', 13), ('subject_matter_gold', 14), ('text', 15), ('tweet_coord', 16), ('tweet_created', 17), ('tweet_id', 18), ('tweet_location', 19), ('user_timezone', 20)]


In [3]:
#Now we can drop the ones we won't be using.  Keeping ID-0, sentiment-5, sentiment_confidence-6, text-15.
df = data.drop(data.columns[[0,1,2,3,4,6,7,8,9,10,11,12,13,14,16,17,18,19,20]], axis=1) #probably should keep sentiment confidance
print df.head()

  sentiment                                               text
0   Neutral  RT @NancyLeeGrahn: How did everyone feel about...
1  Positive  RT @ScottWalker: Didn't catch the full #GOPdeb...
2   Neutral  Re-SubmissionT @TJMShow: No mention of Tamir R...
3  Positive  RT @RobGeorge: That Carly Fiorina is trending ...
4  Positive  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...


In [4]:
#Next we need to re-write the neutral / objective labels to all be neutral.  The organizers kept this distinction
#for other tasks, but for this task, it's considered the same.
# so, let's re-write all objective ->neutral, and all neutral-OR-objective --> neutral.

#Since we probably will want our labels numeric (some classifiers may not like 3-way text labels),
#we can do that all now.

df = df.apply(lambda x: x.replace(['Positive', 'Negative', 'Neutral'] # let's just do positive first
                                  , [1, 0,0]) ,1)
print df[:20]

    sentiment                                               text
0           0  RT @NancyLeeGrahn: How did everyone feel about...
1           1  RT @ScottWalker: Didn't catch the full #GOPdeb...
2           0  Re-SubmissionT @TJMShow: No mention of Tamir R...
3           1  RT @RobGeorge: That Carly Fiorina is trending ...
4           1  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
5           1  RT @GregAbbott_TX: @TedCruz: "On my first day ...
6           0  RT @warriorwoman91: I liked her and was happy ...
7           0  Going on #MSNBC Live with @ThomasARoberts arou...
8           0  Deer in the headlights RT @lizzwinstead: Ben C...
9           0  RT @NancyOsborne180: Last night's debate prove...
10          0  @JGreenDC @realDonaldTrump In all fairness #Bi...
11          1  RT @WayneDupreeShow: Just woke up to tweet thi...
12          0  Me reading my family's comments about how grea...
13          0  RT @ArcticFox2016: RT @AllenWestRepub "Dear @J...
14          1  RT @patton

In [5]:
#Let's take a look at our class distribution
total_tweets = len(df)
positive_tweets = sum(df.sentiment == 1)
negative_tweets = sum(df.sentiment == -1)
neutral_tweets = sum(df.sentiment == 0)

print "The total number of samples is : {}".format(len(df.sentiment))
print "There are {} positive tweets or {}%".format \
(positive_tweets, positive_tweets/float(total_tweets) )
print "There are {} Negative tweets or {}%".format \
(negative_tweets, negative_tweets / float(total_tweets))
print "There are {} Neutral tweets or {}%".format \
(neutral_tweets, neutral_tweets/ float(total_tweets))

The total number of samples is : 13871
There are 2236 positive tweets or 0.161199625117%
There are 0 Negative tweets or 0.0%
There are 11635 Neutral tweets or 0.838800374883%


In [6]:
# Let's load the texts into lists and remove RT's and URls
# we will build a custom function for an individual tweet, 
#and then use Pandas Dataframe.apply() to run it on all tweets.

first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
def parse_tweet (text):
    text = text.split()
    return text
    
parsed_tweet = parse_tweet(first_tweet)
print parsed_tweet
#this results in the most basic splitting operation.  However it gets us very close to what we want.
#In the below output the only concern I have is with "!!!!" attached to "$3.39".  This is not really ideal.    

['Gas', 'by', 'my', 'house', 'hit', '$3.39!!!!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat.', ':)']


In [7]:
#stop-word removal
with open('stopwords.txt') as f:
    stop_words = f.read().splitlines()
    stop_words.extend(['I', '\.', 'The','\.\.']) #add upper-case I
print stop_words

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our', 'ours\tourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'the

In [8]:
#Let's enhance the parser to deal with a few more special cases

#compile regex outside of the function, because we will be running this function in a loop.
retweets = re.compile(r'(RT ?@.*?:)')   
urls = re.compile(r'(https?:.*\b)')
dotdotdot = re.compile(r'(\.\.\.)')
pound_question = re.compile(r'([!\?])')
period_dot = re.compile(r'(\.(?!\d))')
stop_word = re.compile(r'\b(?:{})\b'.format('|'.join(stop_words)))

regex_args = (retweets, urls, dotdotdot, pound_question, period_dot, stop_word)

def parse_tweet (text , retweets, urls, dotdotdot, pound_question, period_dot, stop_word):
    text = re.sub(retweets, "", text) #removes RT@thisguy: or RT @thisguy:   two common Retweet bits I dont' need
    text = re.sub(urls, "", text) # removes URL's
    text = re.sub(dotdotdot, ' DOTDOTDOT ', text) #replace '...' with "DOTDOTDOT' so i preserve the meaning in that token
    text = re.sub(pound_question, r' \1 ', text)  #eyes bleeding? Searches for ! ? and adds white space around them.
    text = re.sub(period_dot, r' \1 ', text) #more blood.  searched for '.' but looks ahead for digits. will not break 3.39
    text = re.sub(stop_word, "", text) #removes stop words.
    
    text = text.split()

    return text


############
#Test cases#
############
first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
RT_tweet_1 = "Cool #cdnpoli RT@angelpike: Call the hospital in Iqaluit &amp; press 2 for English. \
Experience an aboriginal language as 1st choice"
RT_tweet_2 = "For how long, i might be in NJ then?RT @FoolishInApril: @blove402 Thursday Night the 13th of Dec."
URL_tweet = "Get ready for our Wednesday Drink Specials Wednesday - 3-8pm Have it your Way Margarita Day \
( Bar Brand Only)... http://t.co/ml806WRT"

test1 = parse_tweet(first_tweet, *regex_args)
test2 = parse_tweet(RT_tweet_1, *regex_args)
test3 = parse_tweet(RT_tweet_2, *regex_args)
test4 = parse_tweet(URL_tweet, *regex_args)
print test1
print test2
print test3
print test4

['Gas', 'house', 'hit', '$339', '!', '!', '!', '!', "'m", 'going', 'Chapel', 'Hill', 'Sat', '.', ':)']
['Cool', '#cdnpoli', 'Call', 'hospital', 'Iqaluit', '&amp;', 'press', '2', 'English', '.', 'Experience', 'aboriginal', 'language', '1st', 'choice']
['For', 'long,', 'might', 'NJ', '?', '@blove402', 'Thursday', 'Night', '13th', 'Dec', '.']
['Get', 'ready', 'Wednesday', 'Drink', 'Specials', 'Wednesday', '-', '3-8pm', 'Have', 'Way', 'Margarita', 'Day', '(', 'Bar', 'Brand', 'Only)', 'DOTDOTDOT']


In [9]:
# ok, now that we have rough parsing, lets parse them all!
df.text = df.text.apply(lambda x: parse_tweet(x,*regex_args))
print df.head()
print df.shape

   sentiment                                               text
0          0  [How, everyone, feel, Climate, Change, questio...
1          1  [Didn't, catch, full, #GOPdebate, last, night,...
2          0  [Re-SubmissionT, @TJMShow:, No, mention, Tamir...
3          1  [That, Carly, Fiorina, trending, --, hours, HE...
4          1  [#GOPDebate, w/, @realDonaldTrump, delivered, ...
(13871, 2)


In [10]:
#drop some tweets that got parsed to zero
df = df[df['text'].map(len) >= 1]
df.shape

(13857, 2)

In [11]:
# now that we have all the tweets parsed, we actually want to split into our training / testing sets. 
# This is because n-gram analysis (which comes next), should not be done on the testing data!  
# The n-gram analysis should on be on training data.  

# TODO try to implement n-gram analysis with cross validation, for now I'll use a hold-out testing set
from sklearn import cross_validation

#Let's split up the labels from the training data

X_all = df['text']
y_all = df['sentiment']

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_all, y_all, test_size=0.25, stratify = y_all)

print "size of training tweets: ", len(X_train)
print "size of testing tweets: ", len(X_test)

size of training tweets:  10392
size of testing tweets:  3465


In [12]:
# tricked you! we need to merge the labels and parsed tweets for doing our n-gram analysis.
# This is because we will build n-gram models for each class, therefore we need to select only
# those tweets that are positive / negative for the two n-gram tables.

# Let's re-merge the labels into the training data order to do n-gram analysis
XyN_gram = pd.concat([X_train, y_train], axis = 1)

print XyN_gram.head()
print XyN_gram.text[:10]

                                                    text  sentiment
11860  [You, never, know, @realDonaldTrump, frontrunn...          1
1865   [My, takeaway, Mike, Huckabee's, going, privat...          0
1354   [Here's, Donald, Trump, channelling, inner, c*...          0
3641   [My, two, cents:, What,, anything,, learn, las...          0
3421   [#FF, Best, source, 's, going, US, Senate:, @S...          1
11860    [You, never, know, @realDonaldTrump, frontrunn...
1865     [My, takeaway, Mike, Huckabee's, going, privat...
1354     [Here's, Donald, Trump, channelling, inner, c*...
3641     [My, two, cents:, What,, anything,, learn, las...
3421     [#FF, Best, source, 's, going, US, Senate:, @S...
12107    [Jebimiah, talking, respect, life, ., GOP, car...
13748    ["God, loves, needs, money", George, Carlin, &...
772      [As, watch, #GOPDebate,, pray, future, nation,...
5031     [Well,, 's, ., And, minds, aliens, watching, o...
13416    [#GOPDebates, Stop, secular, progressive, move...
Na

In [13]:
#Let's start by developing a function that will take a parsed tweet and output grams of any size

uni_gram_map = {}
bi_gram_map = {}
tri_gram_map = {}

def nGram_counter (parsed_tweet, distance_to_cover, gram_map):
    for_loop_range = range(len(parsed_tweet) - distance_to_cover)    
    for i in for_loop_range:
        gram = tuple(parsed_tweet[i:i+distance_to_cover])
        if gram in gram_map:
            gram_map[gram] += 1
        else:
            gram_map[gram] = 1

nGram_counter(test1, 3, tri_gram_map)
nGram_counter(test1, 2, bi_gram_map)
nGram_counter(test1, 1, uni_gram_map)

#Our output should be a dictionaries of all possible tri-grams, bi-grams and unigrams of the tweet
#"Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"

# if a particular gram exists more than once in a tweet, the counter should have incremented.  We see an example of this
# with the token "!!!!" which is parsed into "!!" 3 times. and "!!!" twice. (it overlaps).


# check our output.
print uni_gram_map
print " "
print bi_gram_map
print " "
print tri_gram_map
print " "

{('.',): 1, ("'m",): 1, ('Hill',): 1, ('!',): 4, ('$339',): 1, ('Sat',): 1, ('house',): 1, ('hit',): 1, ('Chapel',): 1, ('Gas',): 1, ('going',): 1}
 
{('going', 'Chapel'): 1, ('!', '!'): 3, ('!', "'m"): 1, ("'m", 'going'): 1, ('Sat', '.'): 1, ('hit', '$339'): 1, ('Hill', 'Sat'): 1, ('$339', '!'): 1, ('house', 'hit'): 1, ('Chapel', 'Hill'): 1, ('Gas', 'house'): 1}
 
{('!', "'m", 'going'): 1, ('Chapel', 'Hill', 'Sat'): 1, ('house', 'hit', '$339'): 1, ('Hill', 'Sat', '.'): 1, ('!', '!', "'m"): 1, ("'m", 'going', 'Chapel'): 1, ('hit', '$339', '!'): 1, ('Gas', 'house', 'hit'): 1, ('!', '!', '!'): 2, ('$339', '!', '!'): 1, ('going', 'Chapel', 'Hill'): 1}
 


In [14]:
#let's now apply our n-gram counter to all the tweets of a certain class.
# Let's make the positive n_gram map, on the training data.

#for now I will merge all grams into a single map, maybe harder for stats later, but easier for coding now

#setup n-gram maps.
pos_uni_gram_map ={}
pos_bi_gram_map = {}
pos_tri_gram_map = {}

pos_tweets = XyN_gram[XyN_gram.sentiment == 1]

pos_tweets.apply(lambda x: nGram_counter(x.text, 1, pos_uni_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 2, pos_bi_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 3, pos_tri_gram_map), 1)
print "Total Unigrams for Positive Tweets : {}".format(len(pos_uni_gram_map))
print 
print "Total Bi-grams for Positive Tweets: {}".format(len(pos_bi_gram_map))
print
print "Total Tri-grams for Positive Tweets: {}".format(len(pos_tri_gram_map))
print
print "Most popular Positive Uni-grams : {}" \
.format(sorted(pos_uni_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])
print
print "Most Popular Positive  Bi-gams : {}" \
.format(sorted(pos_bi_gram_map.items(), key = lambda x: x[1], reverse = True)[:30])
print
print "Most popular Positive Tri-grams : {}" \
.format(sorted(pos_tri_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])

Total Unigrams for Positive Tweets : 4448

Total Bi-grams for Positive Tweets: 10281

Total Tri-grams for Positive Tweets: 10525

Most popular Positive Uni-grams : [(('.',), 1289), (('#GOPDebate',), 639), (('!',), 374), (('Trump',), 266), (('\xf0\x9f\x87\xba\xf0\x9f\x87\xb8',), 212), (('@realDonaldTrump',), 210), (('#GOPDebates',), 206), (('&amp;',), 126), (("'s",), 116), (('?',), 113), (('last',), 112), (('need',), 95), (('Cruz',), 92), (('think',), 91), (('debate',), 88), (('Fox',), 85), (('will',), 84), (('get',), 84), (('night',), 78), (('"',), 75), (('said',), 71), (('Thanks',), 71), (('DOTDOTDOT',), 67), (('like',), 66), (('#TedCruz',), 65), (('Bush',), 65), (('candidates',), 64), (('@megynkelly',), 64), (('next',), 63), (('ratings',), 63)]

Most Popular Positive  Bi-gams : [(('\xf0\x9f\x87\xba\xf0\x9f\x87\xb8', '#GOPDebate'), 186), (('.', '\xf0\x9f\x87\xba\xf0\x9f\x87\xb8'), 113), (('!', '!'), 88), (('.', '#GOPDebate'), 69), (('get', 'rid'), 57), (('Cruz', 'Trump'), 57), (('toge

ok, lets use these gram maps to create some features finally.
so what we want to do is :

take each tweet.text and calculate the probability of that tweet existing as a positive tweet.  
we can use this feature to construct our first classifier, for positive tweets.
let's start by defining a function that calculates the probability of a tweet.  
I will need to include smoothing, normalization and worry about over / underflow.
actually the very first step is to transform our maps into maximum likliehood probabilities.

In [15]:
# maximum likliehood probabilities for positive grams.
# we will calculate maximum likliehood with smoothing, will use simple k-smoothing, with k = 1


def calculate_maximum_likliehood (gram_map, k_smoothing = 1, Prior_map = None):
    MLE_estimates = {}
    total_unique_grams = len(gram_map) # this is V for smoothing 
    total_gram_count = sum(gram_map.values())
    
    if Prior_map != None:
        total_prior_gram_count = sum(Prior_map.values()) # also V for smoothing on conditioned grams
    
    #figure out what kind of gram-map we have
    keys = gram_map.keys()
    if len(keys[0]) == 1: # we have unigrams
        for key in keys:
            MLE_estimates[key] = (gram_map[key]+ k_smoothing) / \
            float(total_unique_grams + k_smoothing * total_gram_count)
            # above will give MLE with smoothing = 1
                
    elif len(keys[0]) == 2: # This means we want to condition on previous uni gram
        for key in keys:
            MLE_estimates[key] = (gram_map[key] + k_smoothing) / \
            float(Prior_map[key[0],] + k_smoothing * total_prior_gram_count)
    else: #should be 3 size, so condition on previous bi-gram
        for key in keys:
            MLE_estimates[key] = (gram_map[key] + k_smoothing) / \
            float(Prior_map[key[:2]] + k_smoothing * total_prior_gram_count)
            

    return MLE_estimates

MLE_pos_uni_gram = calculate_maximum_likliehood(pos_uni_gram_map , 1)
MLE_pos_bi_gram = calculate_maximum_likliehood(pos_bi_gram_map, 1, Prior_map=pos_uni_gram_map)
MLE_pos_tri_gram = calculate_maximum_likliehood(pos_tri_gram_map, 1, Prior_map=pos_bi_gram_map)

## sanity checks
print len(MLE_pos_uni_gram) == len(pos_uni_gram_map)
print len(MLE_pos_bi_gram) == len(pos_bi_gram_map)
print len(MLE_pos_tri_gram) == len(pos_tri_gram_map)

# should look reasonble?
print MLE_pos_bi_gram.values()[:10]

True
True
True
[0.00011041183614883516, 0.00011021105416873313, 0.0001655263738689031, 0.0005517545795630103, 0.00010918818583829229, 0.0001104484205875856, 0.00011034482758620689, 0.00016566348224639682, 0.00010990218705352236, 0.00011024750565018466]


Ok, now we have MLE for all the training data.  This is n-gram analysis on the positive data.
NExt we need to reparse all the tweets, looking up their values in the MLE_pos gram maps.
Take log probabilities of everything If a gram doesn't exist in the correct place, then we'll use smoothing.

In [16]:
import math

def v_plus_n(grams):
    total_unique_grams = len(grams) # this is V for smoothing 
    total_gram_count = sum(grams.values()) #this is N
    return float(total_unique_grams + total_gram_count)


def positive_probability_calculator (parsed_tweet, gram_size):
    
    if len(parsed_tweet) <1:  #this will catch any empty tweets I missed earlier.
        return "NaN"
    
    # access the gram maps we've calculated before
    global MLE_pos_uni_gram
    global MLE_pos_bi_gram
    global MLE_pos_tri_gram
    
    global pos_uni_gram_map
    global pos_bi_gram_map
    
    uni_VplusN = v_plus_n(pos_uni_gram_map) # will use these values in smoothing
    bi_VplusN = v_plus_n(pos_bi_gram_map)
    tri_VplusN = v_plus_n(pos_tri_gram_map)
        
    # gram_map should correspond to gram_size i.E bi-grams, or tri-grams etc.
    loop_range = range(len(parsed_tweet) - gram_size)
    prob = 0
    
    if gram_size == 1: #unigrams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_uni_gram: #look up the probability value we've already calculated
                prob += math.log(MLE_pos_uni_gram[gram])
            else:  #it's unseen so create a new probability with k-smoothing
                #pass # penalize it with nothing
                prob += math.log( 1.0 / uni_VplusN )  
    
    if gram_size == 2: #bi-grams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_bi_gram:
                prob += math.log(MLE_pos_bi_gram[gram])  #look up probability we've calculated
            
            else:  #condition the unseen bi-gram on the seen unigram.
                #pass
                if (gram[0],) in pos_uni_gram_map:
                    prob += math.log( 1.0 / (pos_uni_gram_map[gram[0],] + len(pos_uni_gram_map)))  
                    
                    # so if gram = ('this','cat'), and we have never seen that before.  we are
                    # getting a probability that is: 1 / count('this') + count(unique_single grams)
                    #obviously close to zero.  ....
                else: #then even the first part of this unseen bigram is not the unigram database, just do V+N
                    prob += math.log(1.0 / bi_VplusN)
    
    if gram_size == 3: #tri-grams
        for i in loop_range:
            gram = tuple(parsed_tweet[i:i+gram_size])
            
            if gram in MLE_pos_tri_gram:
                prob += math.log(MLE_pos_tri_gram[gram]) # look up prob we've already calculated
            
            else:
                #pass
                if gram[:2] in pos_bi_gram_map:
                    prob += math.log( 1.0 / (pos_bi_gram_map[gram[:2]] + len(pos_bi_gram_map)))
                else:
                    prob += math.log(1.0 / tri_VplusN)
                             
    probability = math.exp(prob) / len(parsed_tweet) # normalize by the number of grams in the tweet.
    return probability
   

test_tweet = ['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']

print positive_probability_calculator(test_tweet,1)
print positive_probability_calculator(test_tweet,2)
print positive_probability_calculator(test_tweet,3)

2.94144152246e-65
1.24298450991e-68
1.17300230295e-68


now we want to make features using the probability calculator!! time to finally get positive probability features for all our tweets.  both training and testing need them.

In [17]:
X_trainy = pd.DataFrame(X_train)  #have to convert the Series into a dataframe, in order to add columns
X_trainy['POS-uni'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x, 1),1)
X_trainy['POS-bi'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_trainy['POS-tri'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_trainy.head()

                                                    text       POS-uni  \
11860  [You, never, know, @realDonaldTrump, frontrunn...  3.737323e-25   
1865   [My, takeaway, Mike, Huckabee's, going, privat...  7.709332e-38   
1354   [Here's, Donald, Trump, channelling, inner, c*...  2.954993e-23   
3641   [My, two, cents:, What,, anything,, learn, las...  1.163266e-32   
3421   [#FF, Best, source, 's, going, US, Senate:, @S...  1.789417e-50   

             POS-bi       POS-tri  
11860  1.443676e-26  1.610607e-24  
1865   1.015776e-37  5.441312e-37  
1354   1.104814e-20  8.578941e-19  
3641   1.038801e-32  4.649673e-32  
3421   3.484650e-53  7.041520e-49  


In [18]:
X_testy = pd.DataFrame(X_test) #have to convert the Series into a dataframe, in order to add columns
X_testy['POS-uni'] = X_testy.text.apply(lambda x: positive_probability_calculator(x, 1), 1)
X_testy['POS-bi'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_testy['POS-tri'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_testy.head()

                                                    text       POS-uni  \
13099  [Jeb, Bush, reminds, elevator, music, ., You, ...  1.148073e-36   
10099             [#GOPDebates, Dr, Ben, damn, funny, !]  6.737671e-17   
921    [@BernieSanders, @ScottWalker, man, say, one, ...  4.291353e-43   
2514   [@EPAespanol, Who's, real, illegal, alien, #GO...  1.894714e-43   
11569  [@RealBenCarson, others, getting, time, debate...  1.780546e-42   

             POS-bi       POS-tri  
13099  7.646237e-41  1.750629e-42  
10099  1.979284e-16  2.535393e-14  
921    1.396483e-50  5.902548e-54  
2514   2.973624e-45  1.781777e-41  
11569  8.187831e-52  7.026553e-49  


Alright, we can now make a classifier with these features.  This classifier will predict positive labels.  Let's try a few classification algorithms

In [19]:
#First let's drop the text tweets, they aren't helpful in actual classification
X_trainy = X_trainy.drop(X_trainy.columns[0], axis =1)
print X_trainy.head()
X_testy = X_testy.drop(X_testy.columns[0], axis =1)
print X_testy.head()

#should be no reason to scale data, because we've normalized it all, it's all probabilities.



            POS-uni        POS-bi       POS-tri
11860  3.737323e-25  1.443676e-26  1.610607e-24
1865   7.709332e-38  1.015776e-37  5.441312e-37
1354   2.954993e-23  1.104814e-20  8.578941e-19
3641   1.163266e-32  1.038801e-32  4.649673e-32
3421   1.789417e-50  3.484650e-53  7.041520e-49
            POS-uni        POS-bi       POS-tri
13099  1.148073e-36  7.646237e-41  1.750629e-42
10099  6.737671e-17  1.979284e-16  2.535393e-14
921    4.291353e-43  1.396483e-50  5.902548e-54
2514   1.894714e-43  2.973624e-45  1.781777e-41
11569  1.780546e-42  8.187831e-52  7.026553e-49


In [20]:
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn import tree
from sklearn.metrics import confusion_matrix


clf = tree.DecisionTreeClassifier()
def basic(clf):
    clf.fit(X_trainy, y_train)

    x_pred = clf.predict(X_trainy)
    F1_train = f1_score(y_train, x_pred)
    train_conf = confusion_matrix(y_train, x_pred)
    
    print "training F1:", F1_train
    print
    print "training confusion:\n", train_conf
    print
    
    y_pred = clf.predict(X_testy)
    F1_score = f1_score(y_test, y_pred)
    conf = confusion_matrix(y_test, y_pred)

    print "testing F1:", F1_score
    print
    print "confusion for testing\n", conf
    
basic(clf)
print X_testy.shape
print X_trainy.shape

training F1: 0.0360255665311

training confusion:
[[8702   15]
 [1644   31]]

testing F1: 0.00704225352113

confusion for testing
[[2899    7]
 [ 557    2]]
(3465, 3)
(10392, 3)


In [21]:
clf = svm.SVC()
basic(clf)

training F1: 0.0

training confusion:
[[8717    0]
 [1675    0]]

testing F1: 0.0

confusion for testing
[[2906    0]
 [ 559    0]]


  'precision', 'predicted', average, warn_for)


In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
basic(clf)

training F1: 0.0

training confusion:
[[8717    0]
 [1675    0]]

testing F1: 0.0

confusion for testing
[[2906    0]
 [ 559    0]]


In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
basic(gnb)

training F1: 0.27963679166

training confusion:
[[ 161 8556]
 [  12 1663]]

testing F1: 0.279210925645

confusion for testing
[[  63 2843]
 [   7  552]]


In [24]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100)
basic(clf)

training F1: 0.0337405468296

training confusion:
[[8702   15]
 [1646   29]]

testing F1: 0.0070796460177

confusion for testing
[[2902    4]
 [ 557    2]]
