In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re


# Tell iPython to include plots inline in the notebook
%matplotlib inline

data_gop = pd.read_csv('/Users/Amar/PycharmProjects/MLND_projects/Capstone/data/first-GOP-debate/Sentiment.csv')
data_semEval = pd.read_csv("downloaded2.tsv", sep = '\t')
print "GOP Dataset has {} rows, {} columns".format(*data_gop.shape)
print "SEm Eval Dataset has {} rows, {} columns".format(*data_semEval.shape)

GOP Dataset has 13871 rows, 21 columns
SEm Eval Dataset has 9665 rows, 4 columns


In [2]:
#A lot of information we really aren't interested in, so take a look at the columns with their index.
header_index = [(i,z) for z,i in enumerate(data_gop.columns.view())]
print header_index

[(' id', 0), ('candidate', 1), ('candidate_confidence', 2), ('relevant_yn', 3), ('relevant_yn_confidence', 4), ('sentiment', 5), ('sentiment_confidence', 6), ('subject_matter', 7), ('subject_matter_confidence', 8), ('candidate_gold', 9), ('name', 10), ('relevant_yn_gold', 11), ('retweet_count', 12), ('sentiment_gold', 13), ('subject_matter_gold', 14), ('text', 15), ('tweet_coord', 16), ('tweet_created', 17), ('tweet_id', 18), ('tweet_location', 19), ('user_timezone', 20)]


In [3]:
#Now we can drop the ones we won't be using.  Keeping ID-0, sentiment-5, sentiment_confidence-6, text-15.
df_gop = data_gop.drop(data_gop.columns[[0,1,2,3,4,6,7,8,9,10,11,12,13,14,16,17,18,19,20]], axis=1) #probably should keep sentiment confidance
df_semEval = data_semEval.drop(data_semEval.columns[[0,1]], axis=1)

#let's also drop rows that aren't available for semEval
df_semEval = df_semEval[df_semEval.text != "Not Available"]
df_semEval = df_semEval.reset_index(drop=True) #reset the index after dropping the above rows

print df_gop.head()
print df_semEval.head()

  sentiment                                               text
0   Neutral  RT @NancyLeeGrahn: How did everyone feel about...
1  Positive  RT @ScottWalker: Didn't catch the full #GOPdeb...
2   Neutral  Re-SubmissionT @TJMShow: No mention of Tamir R...
3  Positive  RT @RobGeorge: That Carly Fiorina is trending ...
4  Positive  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
  sentiment                                               text
0  positive  Gas by my house hit $3.39!!!! I'm going to Cha...
1  negative  Iranian general says Israel's Iron Dome can't ...
2  positive  with J Davlar 11th. Main rivals are team Polan...
3  negative  Talking about ACT's &amp;&amp; SAT's, deciding...
4  negative  They may have a SuperBowl in Dallas, but Dalla...


In [4]:
#Next we need to re-write the neutral / objective labels to all be neutral.  The organizers kept this distinction
#for other tasks, but for this task, it's considered the same.
# so, let's re-write all objective ->neutral, and all neutral-OR-objective --> neutral.

#Since we probably will want our labels numeric (some classifiers may not like 3-way text labels),
#we can do that all now.

df_gop = df_gop.apply(lambda x: x.replace(['Positive', 'Negative', 'Neutral'] 
                                  , [1, -1,0]) ,1)
print "gop tweets \n"
print df_gop[:20]

df_semEval = df_semEval.apply(lambda x: x.replace(['positive', 'negative', 'neutral', 'objective', 'objective-OR-neutral']
                                  , [1, -1,0,0,0]) ,1)
print "semEval tweets \n"
print df_semEval[:20]

gop tweets 

    sentiment                                               text
0           0  RT @NancyLeeGrahn: How did everyone feel about...
1           1  RT @ScottWalker: Didn't catch the full #GOPdeb...
2           0  Re-SubmissionT @TJMShow: No mention of Tamir R...
3           1  RT @RobGeorge: That Carly Fiorina is trending ...
4           1  RT @DanScavino: #GOPDebate w/ @realDonaldTrump...
5           1  RT @GregAbbott_TX: @TedCruz: "On my first day ...
6          -1  RT @warriorwoman91: I liked her and was happy ...
7           0  Going on #MSNBC Live with @ThomasARoberts arou...
8          -1  Deer in the headlights RT @lizzwinstead: Ben C...
9          -1  RT @NancyOsborne180: Last night's debate prove...
10         -1  @JGreenDC @realDonaldTrump In all fairness #Bi...
11          1  RT @WayneDupreeShow: Just woke up to tweet thi...
12         -1  Me reading my family's comments about how grea...
13          0  RT @ArcticFox2016: RT @AllenWestRepub "Dear @J...
14          

In [5]:
# Both datasets now share the same two columns and the same label encoding, so stack them.
# ignore_index=True renumbers the rows 0..n-1; without it both frames contribute their own
# 0-based labels and the combined index contains duplicates, which makes later label-based
# alignment (e.g. concatenating along axis=1) fragile.
df = pd.concat([df_semEval, df_gop], axis = 0, ignore_index = True)

In [6]:
#Let's take a look at our class distribution
total_tweets = len(df)
positive_tweets = sum(df.sentiment == 1)
negative_tweets = sum(df.sentiment == -1)
neutral_tweets = sum(df.sentiment == 0)

print "The total number of samples is : {}".format(len(df.sentiment))
print "There are {} positive tweets or {}%".format \
(positive_tweets, positive_tweets/float(total_tweets) )
print "There are {} Negative tweets or {}%".format \
(negative_tweets, negative_tweets / float(total_tweets))
print "There are {} Neutral tweets or {}%".format \
(neutral_tweets, neutral_tweets/ float(total_tweets))

The total number of samples is : 21420
There are 5056 positive tweets or 0.2360410831%
There are 9559 Negative tweets or 0.446265172736%
There are 6805 Neutral tweets or 0.317693744164%


In [7]:
# Ok, for now, let's build a positive classifier:
df = df.apply(lambda x: x.replace(-1, 0))
print df.head()

   sentiment                                               text
0          1  Gas by my house hit $3.39!!!! I'm going to Cha...
1          0  Iranian general says Israel's Iron Dome can't ...
2          1  with J Davlar 11th. Main rivals are team Polan...
3          0  Talking about ACT's &amp;&amp; SAT's, deciding...
4          0  They may have a SuperBowl in Dallas, but Dalla...


In [12]:
# ok let's parse the tweets use the ARK tokenizer from CMU.
import twokenize as tw

first_tweet = "Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"
RT_tweet_1 = "Cool #cdnpoli RT@angelpike: Call the hospital in Iqaluit &amp; press 2 for English. \
Experience an aboriginal language as 1st choice"
RT_tweet_2 = "For how long, i might be in NJ then?RT @FoolishInApril: @blove402 Thursday Night the 13th of Dec."
URL_tweet = "Get ready for our Wednesday Drink Specials Wednesday - 3-8pm Have it your Way Margarita Day \
( Bar Brand Only)... http://t.co/ml806WRT"

test_tweets = [first_tweet, RT_tweet_1, RT_tweet_2, URL_tweet]

for tweet in test_tweets:
    print tw.tokenizeRawTweetText(tweet)

['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!!!!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']
['Cool', '#cdnpoli', 'RT', '@angelpike', ':', 'Call', 'the', 'hospital', 'in', 'Iqaluit', '&', 'press', '2', 'for', 'English', '.', 'Experience', 'an', 'aboriginal', 'language', 'as', '1st', 'choice']
['For', 'how', 'long', ',', 'i', 'might', 'be', 'in', 'NJ', 'then', '?', 'RT', '@FoolishInApril', ':', '@blove402', 'Thursday', 'Night', 'the', '13th', 'of', 'Dec', '.']
['Get', 'ready', 'for', 'our', 'Wednesday', 'Drink', 'Specials', 'Wednesday', '-', '3-8pm', 'Have', 'it', 'your', 'Way', 'Margarita', 'Day', '(', 'Bar', 'Brand', 'Only', ')', '...', 'http://t.co/ml806WRT']


In [None]:
# ok, now that we have rough parsing, lets parse them all!
df.text = df.text.apply(lambda x: parse_tweet(x,*regex_args))
print df.head()
print df.shape

In [None]:
# Some tweets are left with nothing after parsing; keep only rows whose parsed text has
# at least one element.
has_content = df['text'].map(len) >= 1
df = df[has_content]
df.shape

In [None]:
# now that we have all the tweets parsed, we actually want to split into our training / testing sets. 
# This is because n-gram analysis (which comes next), should not be done on the testing data!  
# The n-gram analysis should on be on training data.  

# TODO try to implement n-gram analysis with cross validation, for now I'll use a hold-out testing set
from sklearn import cross_validation

#Let's split up the labels from the training data

X_all = df['text']
y_all = df['sentiment']

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X_all, y_all, test_size=0.25, stratify = y_all)

print "size of training tweets: ", len(X_train)
print "size of testing tweets: ", len(X_test)

In [None]:
# tricked you! we need to merge the labels and parsed tweets for doing our n-gram analysis.
# This is because we will build n-gram models for each class, therefore we need to select only
# those tweets that are positive / negative for the two n-gram tables.

# Let's re-merge the labels into the training data order to do n-gram analysis
XyN_gram = pd.concat([X_train, y_train], axis = 1)

print XyN_gram[:10]

In [None]:
#Let's start by developing a function that will take a parsed tweet and output grams of any size

# Three independent, initially empty count tables: one each for 1-, 2- and 3-token grams.
uni_gram_map, bi_gram_map, tri_gram_map = {}, {}, {}

def nGram_counter (parsed_tweet, distance_to_cover, gram_map):
    """Count every contiguous gram of a given length in one tokenised tweet.

    parsed_tweet      -- sequence of tokens.
    distance_to_cover -- gram length (1 = unigrams, 2 = bigrams, ...).
    gram_map          -- dict mapping gram tuples to counts; updated in place.

    Overlapping occurrences are all counted, so four consecutive '!' tokens contribute
    three ('!', '!') bigrams. A tweet shorter than distance_to_cover adds nothing.
    Returns None.
    """
    # A sequence of length L holds L - n + 1 grams of length n. The "+ 1" matters: without it
    # the final gram of every tweet is silently skipped.
    gram_total = len(parsed_tweet) - distance_to_cover + 1
    for start in range(gram_total):
        gram = tuple(parsed_tweet[start:start + distance_to_cover])
        gram_map[gram] = gram_map.get(gram, 0) + 1

nGram_counter(test1, 3, tri_gram_map)
nGram_counter(test1, 2, bi_gram_map)
nGram_counter(test1, 1, uni_gram_map)

#Our output should be a dictionaries of all possible tri-grams, bi-grams and unigrams of the tweet
#"Gas by my house hit $3.39!!!! I'm going to Chapel Hill on Sat. :)"

# if a particular gram exists more than once in a tweet, the counter should have incremented.  We see an example of this
# with the token "!!!!" which is parsed into "!!" 3 times. and "!!!" twice. (it overlaps).


# check our output.
print uni_gram_map
print " "
print bi_gram_map
print " "
print tri_gram_map
print " "

In [None]:
#let's now apply our n-gram counter to all the tweets of a certain class.
# Let's make the positive n_gram map, on the training data.

#for now I will merge all grams into a single map, maybe harder for stats later, but easier for coding now

#setup n-gram maps.
pos_uni_gram_map ={}
pos_bi_gram_map = {}
pos_tri_gram_map = {}

pos_tweets = XyN_gram[XyN_gram.sentiment == 1]

pos_tweets.apply(lambda x: nGram_counter(x.text, 1, pos_uni_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 2, pos_bi_gram_map), 1)
pos_tweets.apply(lambda x: nGram_counter(x.text, 3, pos_tri_gram_map), 1)
print "Total Unigrams for Positive Tweets : {}".format(len(pos_uni_gram_map))
print 
print "Total Bi-grams for Positive Tweets: {}".format(len(pos_bi_gram_map))
print
print "Total Tri-grams for Positive Tweets: {}".format(len(pos_tri_gram_map))
print
print "Most popular Positive Uni-grams : {}" \
.format(sorted(pos_uni_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])
print
print "Most Popular Positive  Bi-gams : {}" \
.format(sorted(pos_bi_gram_map.items(), key = lambda x: x[1], reverse = True)[:30])
print
print "Most popular Positive Tri-grams : {}" \
.format(sorted(pos_tri_gram_map.items(), key=lambda x: x[1], reverse = True)[:30])

OK, let's finally use these gram maps to create some features.
What we want to do is:

Take each tweet.text and calculate the probability of that tweet occurring as a positive tweet.  
We can use this feature to construct our first classifier, for positive tweets.
Let's start by defining a function that calculates the probability of a tweet.  
It will need to include smoothing and normalization, and guard against numeric overflow / underflow.
Actually, the very first step is to transform our count maps into maximum likelihood probabilities.

In [None]:
# Maximum likelihood probabilities for the positive grams.
# We calculate maximum likelihood with smoothing, using simple add-k smoothing with k = 1.


def calculate_maximum_likliehood (gram_map, k_smoothing = 1, Prior_map = None):
    """Convert raw n-gram counts into add-k smoothed probability estimates.

    gram_map    -- dict mapping gram tuples (all of the same length) to their counts.
    k_smoothing -- the k of add-k smoothing (1 gives add-one smoothing).
    Prior_map   -- counts of the grams one token shorter; required when gram_map holds
                   grams of length 2 or more, ignored for unigrams.

    Unigrams:      P(w)          = (count(w) + k) / (N + k * V)
                   with N the total unigram count and V the number of distinct unigrams.
    Longer grams:  P(w | prefix) = (count(prefix + w) + k) / (count(prefix) + k * V)
                   with prefix the gram minus its last token and V the number of distinct
                   entries in Prior_map.

    Returns a new dict with the same keys as gram_map; an empty gram_map gives an empty dict.
    Raises TypeError if Prior_map is missing for grams longer than one token, and KeyError
    if a gram's prefix is absent from Prior_map.
    """
    MLE_estimates = {}
    if not gram_map:
        return MLE_estimates

    # Every key has the same length, so any one of them tells us the gram order.
    gram_size = len(next(iter(gram_map)))

    if gram_size == 1:
        total_gram_count = sum(gram_map.values())   # N
        total_unique_grams = len(gram_map)          # V
        denominator = float(total_gram_count + k_smoothing * total_unique_grams)
        for gram, count in gram_map.items():
            MLE_estimates[gram] = (count + k_smoothing) / denominator
        return MLE_estimates

    if Prior_map is None:
        raise TypeError("Prior_map is required for grams of length %d" % gram_size)

    # V for the conditioned estimate is the number of DISTINCT prior grams, not their total count.
    prior_vocabulary = len(Prior_map)
    for gram, count in gram_map.items():
        prefix = gram[:-1]  # the gram this one is conditioned on
        MLE_estimates[gram] = (count + k_smoothing) / \
            float(Prior_map[prefix] + k_smoothing * prior_vocabulary)

    return MLE_estimates

MLE_pos_uni_gram = calculate_maximum_likliehood(pos_uni_gram_map , 1)
MLE_pos_bi_gram = calculate_maximum_likliehood(pos_bi_gram_map, 1, Prior_map=pos_uni_gram_map)
MLE_pos_tri_gram = calculate_maximum_likliehood(pos_tri_gram_map, 1, Prior_map=pos_bi_gram_map)

## sanity checks
print len(MLE_pos_uni_gram) == len(pos_uni_gram_map)
print len(MLE_pos_bi_gram) == len(pos_bi_gram_map)
print len(MLE_pos_tri_gram) == len(pos_tri_gram_map)

# should look reasonble?
print MLE_pos_bi_gram.values()[:10]

OK, now we have MLE estimates for the training data.  This is the n-gram analysis on the positive data.
Next we need to go back over all the tweets, looking up their grams in the MLE_pos gram maps.
We take log probabilities of everything. If a gram doesn't exist in the corresponding map, we fall back to smoothing.

In [None]:
import math

def v_plus_n(grams):
    """Return V + N for a gram-count table as a float.

    V is the number of distinct grams (the number of keys) and N is the total of all
    counts (the sum of the values).
    """
    return float(len(grams) + sum(grams.values()))


def positive_probability_calculator (parsed_tweet, gram_size):
    """Score a tokenised tweet against the positive-class n-gram tables.

    The log-probability of every gram of length ``gram_size`` in the tweet is summed, and the
    function returns ``exp(sum) / len(parsed_tweet)``.

    parsed_tweet -- sequence of tokens.
    gram_size    -- 1, 2 or 3, selecting the unigram, bi-gram or tri-gram tables. Any other
                    value scores no grams, so the result is 1.0 / len(parsed_tweet).

    Each gram contributes:
      * seen in the MLE table for this order: log of the stored estimate;
      * unseen, but its prefix (the gram minus its last token) is in the count table one order
        below: log(1 / (count(prefix) + number of distinct grams in that lower table));
      * otherwise: log(1 / (V + N)) of this order's count table (see v_plus_n).

    Reads the module-level tables MLE_pos_uni_gram, MLE_pos_bi_gram, MLE_pos_tri_gram,
    pos_uni_gram_map, pos_bi_gram_map and pos_tri_gram_map.

    Returns the string "NaN" for an empty tweet, otherwise a float.
    """
    if len(parsed_tweet) < 1:  # catches any empty tweets that were not filtered out earlier
        # NOTE(review): this is the string "NaN", not float('nan'); kept as-is for compatibility.
        return "NaN"

    # gram_size -> (MLE estimates, raw counts of this order, raw counts of the order below)
    tables = {
        1: (MLE_pos_uni_gram, pos_uni_gram_map, None),
        2: (MLE_pos_bi_gram, pos_bi_gram_map, pos_uni_gram_map),
        3: (MLE_pos_tri_gram, pos_tri_gram_map, pos_bi_gram_map),
    }

    prob = 0.0
    if gram_size in tables:
        mle_map, count_map, prefix_map = tables[gram_size]
        # Only the V+N of the requested order is needed; it is a full pass over that table.
        order_VplusN = v_plus_n(count_map)

        # A tweet of L tokens has L - n + 1 grams of length n; the "+ 1" makes sure the
        # final gram is scored as well.
        for i in range(len(parsed_tweet) - gram_size + 1):
            gram = tuple(parsed_tweet[i:i + gram_size])
            prefix = gram[:-1]

            if gram in mle_map:
                # look up the probability value already calculated
                prob += math.log(mle_map[gram])
            elif prefix_map is not None and prefix in prefix_map:
                # Unseen gram whose prefix was seen: e.g. for ('this', 'cat') this is
                # 1 / (count('this') + number of distinct unigrams).
                prob += math.log(1.0 / (prefix_map[prefix] + len(prefix_map)))
            else:
                # Nothing known about this gram or its prefix: fall back to 1 / (V + N).
                prob += math.log(1.0 / order_VplusN)

    # NOTE(review): exp() of a long sum of log-probabilities can underflow towards 0.0, and a
    # tweet shorter than gram_size scores 1.0 / len(parsed_tweet) because no gram is visited.
    probability = math.exp(prob) / len(parsed_tweet)  # normalize by the number of tokens in the tweet
    return probability
   

test_tweet = ['Gas', 'by', 'my', 'house', 'hit', '$3.39', '!', '!', '!', '!', "I'm", 'going', 'to', 'Chapel', 'Hill', 'on', 'Sat', '.', ':)']

print positive_probability_calculator(test_tweet,1)
print positive_probability_calculator(test_tweet,2)
print positive_probability_calculator(test_tweet,3)

Now we want to make features using the probability calculator! Time to finally get positive-probability features for all our tweets. Both the training and the testing sets need them.

In [None]:
X_trainy = pd.DataFrame(X_train)  #have to convert the Series into a dataframe, in order to add columns
X_trainy['POS-uni'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x, 1),1)
X_trainy['POS-bi'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_trainy['POS-tri'] = X_trainy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_trainy.head()

In [None]:
X_testy = pd.DataFrame(X_test) #have to convert the Series into a dataframe, in order to add columns
X_testy['POS-uni'] = X_testy.text.apply(lambda x: positive_probability_calculator(x, 1), 1)
X_testy['POS-bi'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,2),1)
X_testy['POS-tri'] = X_testy.text.apply(lambda x: positive_probability_calculator(x,3),1)
print X_testy.head()

Alright, we can now build a classifier from these features.  This classifier will predict positive labels.  Let's try a few classification algorithms.

In [None]:
#First let's drop the text tweets, they aren't helpful in actual classification
X_trainy = X_trainy.drop(X_trainy.columns[0], axis =1)
print X_trainy.head()
X_testy = X_testy.drop(X_testy.columns[0], axis =1)
print X_testy.head()

#should be no reason to scale data, because we've normalized it all, it's all probabilities.



In [None]:
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn import tree
from sklearn.metrics import confusion_matrix

def f1_score_wrap (y_actual, y_predict):
    """Return the F1 score of the predicted labels against the actual labels.

    Thin pass-through to f1_score with its default settings.
    """
    score = f1_score(y_actual, y_predict)
    return score


clf = tree.DecisionTreeClassifier()
def basic(clf):
    clf.fit(X_trainy, y_train)

    x_pred = clf.predict(X_trainy)
    F1_train = f1_score_wrap(y_train, x_pred)
    train_conf = confusion_matrix(y_train, x_pred)
    
    print "training F1:", F1_train
    print
    print "training confusion:\n", train_conf
    print
    
    y_pred = clf.predict(X_testy)
    F1_score = f1_score_wrap(y_test, y_pred)
    conf = confusion_matrix(y_test, y_pred)

    print "testing F1:", F1_score
    print
    print "confusion for testing\n", conf
    
basic(clf)
print X_testy.shape
print X_trainy.shape

In [None]:
# Support vector classifier with default parameters.
clf = svm.SVC()
basic(clf=clf)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default parameters.
clf = LogisticRegression()
basic(clf=clf)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default parameters.
gnb = GaussianNB()
basic(clf=gnb)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble built from 100 estimators.
clf = AdaBoostClassifier(n_estimators=100)
basic(clf=clf)