In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
cols = ['polarity','id', 'date', 'query', 'user', 'tweet']

data = pd.read_csv('sentiment.csv',names=cols, encoding='ISO-8859-1')
print('length of data {}'.format(len(data)))

length of data 1600000


In [3]:
data[:5]

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Sample the data into a random set of 1% of our data set

In [4]:
random = data.sample(frac = 0.01)
random

Unnamed: 0,polarity,id,date,query,user,tweet
1465654,4,2064375874,Sun Jun 07 06:05:30 PDT 2009,NO_QUERY,computergrl123,@billyraycyrus what part of ireland are/were y...
143745,0,1881800500,Fri May 22 05:35:21 PDT 2009,NO_QUERY,gsdadventures,@sarahdessen we still have our winter coats on
1215916,4,1989591306,Mon Jun 01 02:05:52 PDT 2009,NO_QUERY,kopigao,Threadless Tees $5 sale!
1481839,4,2067144985,Sun Jun 07 11:49:44 PDT 2009,NO_QUERY,tiger_lily09,gone swimming with sebs
66450,0,1691813413,Sun May 03 18:39:52 PDT 2009,NO_QUERY,Samrose4,@KimKardashian im at work and im gonna miss it
...,...,...,...,...,...,...
785955,0,2324552163,Thu Jun 25 03:27:00 PDT 2009,NO_QUERY,injenious,@WAHMBizbuilder Yes &amp; stand around in the...
339404,0,2014636897,Wed Jun 03 02:11:13 PDT 2009,NO_QUERY,unsaidthingsx,why are planetickets so expensive
418113,0,2061690969,Sat Jun 06 21:24:13 PDT 2009,NO_QUERY,Monica2112,"@timothyh2o btw, i'm getting tire of the jobr..."
1200850,4,1985574369,Sun May 31 17:21:59 PDT 2009,NO_QUERY,shrimponbarbie,Yay for everyone who's down with the blog/trac...


Dropping columns to clean it up

In [5]:
new_data = random.drop(columns = ['id', 'date', 'query', 'user'])
new_data

Unnamed: 0,polarity,tweet
1465654,4,@billyraycyrus what part of ireland are/were y...
143745,0,@sarahdessen we still have our winter coats on
1215916,4,Threadless Tees $5 sale!
1481839,4,gone swimming with sebs
66450,0,@KimKardashian im at work and im gonna miss it
...,...,...
785955,0,@WAHMBizbuilder Yes &amp; stand around in the...
339404,0,why are planetickets so expensive
418113,0,"@timothyh2o btw, i'm getting tire of the jobr..."
1200850,4,Yay for everyone who's down with the blog/trac...


In [6]:
new_data['polarity'] = new_data['polarity'].apply(lambda x: "pos" if x == 4 else "neg")
new_data

Unnamed: 0,polarity,tweet
1465654,pos,@billyraycyrus what part of ireland are/were y...
143745,neg,@sarahdessen we still have our winter coats on
1215916,pos,Threadless Tees $5 sale!
1481839,pos,gone swimming with sebs
66450,neg,@KimKardashian im at work and im gonna miss it
...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...
339404,neg,why are planetickets so expensive
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr..."
1200850,pos,Yay for everyone who's down with the blog/trac...


In [7]:
new_data['splittweet'] = new_data['tweet'].apply(lambda x: x.split())

In [8]:
new_data

Unnamed: 0,polarity,tweet,splittweet
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/..."
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c..."
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]"
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]"
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,..."
...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i..."
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]"
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th..."
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b..."


In [9]:
vocabulary = [w for s in new_data.splittweet for w in s ]
len(vocabulary)

209045

In [10]:
len(set(vocabulary))

42547

Making the train and test sets

In [11]:
test = new_data.sample(frac=0.1)
train = new_data[~new_data.index.isin(test.index)]

In [12]:
from collections import Counter as ctr

p_t_estimate = ctr(train.polarity)
p_t_total = len(train)

In [13]:
p_t_estimate['neg']

7148

In [14]:
def Pt(T):
    return p_t_estimate[T] / p_t_total

In [15]:
Pt('pos'), Pt('neg')

(0.5036111111111111, 0.4963888888888889)

##### Estimate P(W)

In [16]:
words = [w for s in train.splittweet for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

In [17]:
smoother = 0.00001
def Pw(W):
    if W not in p_w_estimate: return smoother
    return p_w_estimate[W] / p_w_total

In [18]:
Pw('hungry')

6.38073867684749e-05

In [19]:
np.sum([Pw(w) for w in set(words)])

1.0

##### Estimate P(W|T)

In [20]:
p_w_t_estimate = {}
p_w_t_totals = {}

for p in set(train.polarity):
    sub_frame = train[train.polarity == p]
    sub_words = [w for s in sub_frame.splittweet for w in s]
    p_w_t_estimate[p] = ctr(sub_words)
    p_w_t_totals[p] = len(sub_words)

In [21]:
def Pwt(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [22]:
Pwt('the','pos')

0.02367064642861051

##### Bayes Therom

In [23]:
def Ptw(T, W):
    return Pwt(W, T) * Pt(T) / Pw(W)

In [24]:
def Pe(E):
    result = {}
    for p in set(train.polarity):
        result[p] = np.prod([Ptw(p, word) for word in E])
    return result

In [25]:
Pe(['the', 'river', 'is', 'long'])

{'pos': 0.04656821592662114, 'neg': 0.07364525102898878}

In [26]:
len(test)

1600

In [27]:
test['result'] = test.splittweet.apply(Pe)
test

Unnamed: 0,polarity,tweet,splittweet,result
1467193,pos,watching the French Open final,"[watching, the, French, Open, final]","{'pos': 0.055650114983389336, 'neg': 0.0071663..."
283148,neg,Stuck in doctor's waiting room. Had to check i...,"[Stuck, in, doctor's, waiting, room., Had, to,...","{'pos': 1.8421075804466763e-08, 'neg': 7.54635..."
1482033,pos,Fisher cats are great,"[Fisher, cats, are, great]","{'pos': 0.07240285006544905, 'neg': 0.02574631..."
1096179,pos,Time for me to go to bed. Love the idea I can ...,"[Time, for, me, to, go, to, bed., Love, the, i...","{'pos': 1.801786330307612e-08, 'neg': 2.681237..."
384967,neg,It's my birthday! Time to go to work,"[It's, my, birthday!, Time, to, go, to, work]","{'pos': 0.0013991281347556215, 'neg': 0.004804..."
...,...,...,...,...
300022,neg,I...won't be able to play the Sims 3 without a...,"[I...won't, be, able, to, play, the, Sims, 3, ...","{'pos': 5.0881616055200474e-05, 'neg': 0.00057..."
649660,neg,Eating papayas,"[Eating, papayas]","{'pos': 0.3914855680140972, 'neg': 0.119819172..."
413431,neg,"@the_sandra Ahh it was just on, unf lol, im s...","[@the_sandra, Ahh, it, was, just, on,, unf, lo...","{'pos': 4.040806044620598e-05, 'neg': 0.000566..."
813635,pos,p.s. @TobyDiva is one of the smart folks - I w...,"[p.s., @TobyDiva, is, one, of, the, smart, fol...","{'pos': 6.568612896701607e-06, 'neg': 5.483243..."


In [28]:
test['top'] = test.result.apply(lambda x:max(x, key=x.get))
test

Unnamed: 0,polarity,tweet,splittweet,result,top
1467193,pos,watching the French Open final,"[watching, the, French, Open, final]","{'pos': 0.055650114983389336, 'neg': 0.0071663...",pos
283148,neg,Stuck in doctor's waiting room. Had to check i...,"[Stuck, in, doctor's, waiting, room., Had, to,...","{'pos': 1.8421075804466763e-08, 'neg': 7.54635...",neg
1482033,pos,Fisher cats are great,"[Fisher, cats, are, great]","{'pos': 0.07240285006544905, 'neg': 0.02574631...",pos
1096179,pos,Time for me to go to bed. Love the idea I can ...,"[Time, for, me, to, go, to, bed., Love, the, i...","{'pos': 1.801786330307612e-08, 'neg': 2.681237...",neg
384967,neg,It's my birthday! Time to go to work,"[It's, my, birthday!, Time, to, go, to, work]","{'pos': 0.0013991281347556215, 'neg': 0.004804...",neg
...,...,...,...,...,...
300022,neg,I...won't be able to play the Sims 3 without a...,"[I...won't, be, able, to, play, the, Sims, 3, ...","{'pos': 5.0881616055200474e-05, 'neg': 0.00057...",neg
649660,neg,Eating papayas,"[Eating, papayas]","{'pos': 0.3914855680140972, 'neg': 0.119819172...",pos
413431,neg,"@the_sandra Ahh it was just on, unf lol, im s...","[@the_sandra, Ahh, it, was, just, on,, unf, lo...","{'pos': 4.040806044620598e-05, 'neg': 0.000566...",neg
813635,pos,p.s. @TobyDiva is one of the smart folks - I w...,"[p.s., @TobyDiva, is, one, of, the, smart, fol...","{'pos': 6.568612896701607e-06, 'neg': 5.483243...",pos


In [29]:
sum(test.polarity == test.top) / len(test)

0.71

## Comparing Results

#### Lowercase values

In [30]:
new_data['lowercase'] = new_data['tweet'].apply(lambda x: [i.lower() for i in x.split()])
new_data

Unnamed: 0,polarity,tweet,splittweet,lowercase
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/..."
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c..."
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]","[threadless, tees, $5, sale!]"
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]","[gone, swimming, with, sebs]"
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,..."
...,...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuilder, yes, &amp;, stand, around, i..."
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]"
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th..."
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyone, who's, down, with, the, b..."


In [31]:
test_lower = new_data.sample(frac=0.1)
train_lower = new_data[~new_data.index.isin(test_lower.index)]

In [32]:
words = [w for s in train_lower.lowercase for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

In [33]:
##Checking to see if it works for row 1207318 NKOTB is capitalized then in lowecase is isn't so its probability is 0
Pw('NKOTB')

1e-05

In [50]:
p_w_t_estimate = {}
p_w_t_totals = {}

for l in set(train_lower.polarity):
    sub_frame = train_lower[train_lower.polarity == l]
    sub_words = [w for s in sub_frame.lowercase for w in s]
    p_w_t_estimate[l] = ctr(sub_words)
    p_w_t_totals[l] = len(sub_words)

In [51]:
def PwtL(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [52]:
##Checking to see if it works for row 720998 MO is capitalized then in lowecase is isn't so its probability is 0
PwtL('NKOTB','neg')

1e-05

In [53]:
def PtwL(T, W):
    return PwtL(W, T) * Pt(T) / Pw(W)

In [54]:
def Pe(E):
    result = {}
    for p in set(train_lower.polarity):
        result[p] = np.prod([PtwL(p, word) for word in E])
    return result

In [55]:
Pe(['the', 'river', 'is', 'long'])

{'pos': 0.05730801481486517, 'neg': 0.13674488406356597}

In [56]:
test_lower['result'] = test_lower.lowercase.apply(Pe)
test_lower['top'] = test_lower.result.apply(lambda x:max(x, key=x.get))
test_lower

Unnamed: 0,polarity,tweet,splittweet,lowercase,result,top
787982,neg,@keelybin it only gets worse when ya get ya o...,"[@keelybin, it, only, gets, worse, when, ya, g...","[@keelybin, it, only, gets, worse, when, ya, g...","{'pos': 0.0007575328169419091, 'neg': 0.001582...",neg
1227707,pos,@ashashbaby or the ones u dont wanna hang out ...,"[@ashashbaby, or, the, ones, u, dont, wanna, h...","[@ashashbaby, or, the, ones, u, dont, wanna, h...","{'pos': 0.00046899492279994077, 'neg': 0.00070...",neg
400119,neg,@FDoTNiTTi hell nowhere. its fckn boring.n I w...,"[@FDoTNiTTi, hell, nowhere., its, fckn, boring...","[@fdotnitti, hell, nowhere., its, fckn, boring...","{'pos': 0.0001002709054411572, 'neg': 0.020197...",neg
1387755,pos,is getting ready for 8.......miles that is! T...,"[is, getting, ready, for, 8.......miles, that,...","[is, getting, ready, for, 8.......miles, that,...","{'pos': 0.007700137282766419, 'neg': 0.0018924...",pos
253611,neg,"@rejectedmoments aww are you okay hun, you see...","[@rejectedmoments, aww, are, you, okay, hun,, ...","[@rejectedmoments, aww, are, you, okay, hun,, ...","{'pos': 0.0039206022071749305, 'neg': 0.000493...",pos
...,...,...,...,...,...,...
280622,neg,is at home drinking coffee and wishing Marshal...,"[is, at, home, drinking, coffee, and, wishing,...","[is, at, home, drinking, coffee, and, wishing,...","{'pos': 0.00010924530893752148, 'neg': 0.00147...",neg
598611,neg,"Don't know why i made twitter, don't even use it","[Don't, know, why, i, made, twitter,, don't, e...","[don't, know, why, i, made, twitter,, don't, e...","{'pos': 0.004098271293386389, 'neg': 0.0185327...",neg
46676,neg,Hasn't felt this rough in ages.,"[Hasn't, felt, this, rough, in, ages.]","[hasn't, felt, this, rough, in, ages.]","{'pos': 0.012159590507974606, 'neg': 0.0948742...",neg
459450,neg,@chips99 omg I WAS NOT expecting the ending!! ...,"[@chips99, omg, I, WAS, NOT, expecting, the, e...","[@chips99, omg, i, was, not, expecting, the, e...","{'pos': 0.0008391647218349451, 'neg': 0.321539...",neg


In [57]:
sum(test_lower.polarity == test_lower.top) / len(test_lower)

0.7275

#### HashTag values

In [58]:
new_data['hashtag'] = new_data['tweet'].apply(lambda x: [i.replace('#', '') for i in x.split()])
new_data

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/..."
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c..."
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]","[threadless, tees, $5, sale!]","[Threadless, Tees, $5, sale!]"
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swimming, with, sebs]"
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...","[@KimKardashian, im, at, work, and, im, gonna,..."
...,...,...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuilder, yes, &amp;, stand, around, i...","[@WAHMBizbuilder, Yes, &amp;, stand, around, i..."
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]"
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th..."
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyone, who's, down, with, the, b...","[Yay, for, everyone, who's, down, with, the, b..."


In [59]:
test_hash = new_data.sample(frac=0.1)
train_hash = new_data[~new_data.index.isin(test_hash.index)]

In [60]:
words = [w for s in train_hash.hashtag for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

In [70]:
p_w_t_estimate = {}
p_w_t_totals = {}

for l in set(train_hash.polarity):
    sub_frame = train_hash[train_hash.polarity == l]
    sub_words = [w for s in sub_frame.hashtag for w in s]
    p_w_t_estimate[l] = ctr(sub_words)
    p_w_t_totals[l] = len(sub_words)

In [71]:
def PwtH(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [73]:
def PtwH(T, W):
    return PwtH(W, T) * Pt(T) / Pw(W)

In [74]:
def Pe(E):
    result = {}
    for p in set(train_hash.polarity):
        result[p] = np.prod([PtwH(p, word) for word in E])
    return result

In [78]:
test_hash['result'] = test_hash.hashtag.apply(Pe)
test_hash['top'] = test_hash.result.apply(lambda x:max(x, key=x.get))
test_hash

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,result,top
1593872,pos,is headed back home to make dinner to when my ...,"[is, headed, back, home, to, make, dinner, to,...","[is, headed, back, home, to, make, dinner, to,...","[is, headed, back, home, to, make, dinner, to,...","{'pos': 1.6195479782266468e-07, 'neg': 2.39576...",pos
561450,neg,Off To The Showers Getting Ready For Work....B...,"[Off, To, The, Showers, Getting, Ready, For, W...","[off, to, the, showers, getting, ready, for, w...","[Off, To, The, Showers, Getting, Ready, For, W...","{'pos': 0.018828337309575085, 'neg': 0.0004353...",pos
1427262,pos,@snshyne What a darling little pup. Just look...,"[@snshyne, What, a, darling, little, pup., Jus...","[@snshyne, what, a, darling, little, pup., jus...","[@snshyne, What, a, darling, little, pup., Jus...","{'pos': 5.138225890819373e-06, 'neg': 2.238416...",pos
632979,neg,"Today was a bad day, and I feel like crap beca...","[Today, was, a, bad, day,, and, I, feel, like,...","[today, was, a, bad, day,, and, i, feel, like,...","[Today, was, a, bad, day,, and, I, feel, like,...","{'pos': 6.275123344894387e-06, 'neg': 0.000447...",neg
1521421,pos,hi @ all.... how you been?.... hope youÂ´re fi...,"[hi, @, all...., how, you, been?...., hope, yo...","[hi, @, all...., how, you, been?...., hope, yo...","[hi, @, all...., how, you, been?...., hope, yo...","{'pos': 0.0009872888577810547, 'neg': 0.000588...",pos
...,...,...,...,...,...,...,...
40010,neg,@awesomepam That /would/ be weird to see. I ha...,"[@awesomepam, That, /would/, be, weird, to, se...","[@awesomepam, that, /would/, be, weird, to, se...","[@awesomepam, That, /would/, be, weird, to, se...","{'pos': 0.000175477979527997, 'neg': 0.0019795...",neg
910819,pos,@BigGuitarStore I'll drink to that #shotdrink...,"[@BigGuitarStore, I'll, drink, to, that, #shot...","[@bigguitarstore, i'll, drink, to, that, #shot...","[@BigGuitarStore, I'll, drink, to, that, shotd...","{'pos': 0.014334913519071906, 'neg': 0.0149747...",neg
461635,neg,please..everything to the way it was,"[please..everything, to, the, way, it, was]","[please..everything, to, the, way, it, was]","[please..everything, to, the, way, it, was]","{'pos': 0.014548416849423046, 'neg': 0.0164588...",neg
742176,neg,@Danielle_Jane14 ugh that's no fun :/,"[@Danielle_Jane14, ugh, that's, no, fun, :/]","[@danielle_jane14, ugh, that's, no, fun, :/]","[@Danielle_Jane14, ugh, that's, no, fun, :/]","{'pos': 0.005809600973056252, 'neg': 0.0150599...",neg


In [79]:
sum(test_hash.polarity == test_hash.top) / len(test_hash)

0.725625

#### Stem values

In [80]:
import nltk
from nltk.stem import PorterStemmer
ps=PorterStemmer()
ps.stem('writing')
#split and step through each word, in line for loops
new_data['stem'] = new_data['lowercase'].apply(lambda x: [ps.stem(i) for i in x])
new_data['join'] = new_data['stem'].apply(lambda x: ' '.join(x))
new_data

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyru, what, part, of, ireland, are/w...",@billyraycyru what part of ireland are/wer you...
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...",@sarahdessen we still have our winter coat on
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]","[threadless, tees, $5, sale!]","[Threadless, Tees, $5, sale!]","[threadless, tee, $5, sale!]",threadless tee $5 sale!
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swim, with, seb]",gone swim with seb
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...","[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...",@kimkardashian im at work and im gonna miss it
...,...,...,...,...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuilder, yes, &amp;, stand, around, i...","[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuild, ye, &amp;, stand, around, in, ...",@wahmbizbuild ye &amp; stand around in the kit...
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[whi, are, planeticket, so, expens]",whi are planeticket so expens
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, get, tire, of, the, j...","@timothyh2o btw, i'm get tire of the jobros. l..."
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyone, who's, down, with, the, b...","[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyon, who', down, with, the, blo...",yay for everyon who' down with the blog/track ...


In [81]:
test_stem = new_data.sample(frac=0.1)
train_stem = new_data[~new_data.index.isin(test_stem.index)]

In [112]:
words = [w for s in train_stem.stem for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

p_w_t_estimate = {}
p_w_t_totals = {}

for l in set(train_stem.polarity):
    sub_frame = train_stem[train_stem.polarity == l]
    sub_words = [w for s in sub_frame.stem for w in s]
    p_w_t_estimate[l] = ctr(sub_words)
    p_w_t_totals[l] = len(sub_words)

In [113]:
def PwtS(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [114]:
def PtwS(T, W):
    return PwtS(W, T) * Pt(T) / Pw(W)

In [115]:
def Pe(E):
    result = {}
    for p in set(train_stem.polarity):
        result[p] = np.prod([PtwS(p, word) for word in E])
    return result

In [116]:
test_stem['result'] = test_stem.stem.apply(Pe)
test_stem['top'] = test_stem.result.apply(lambda x:max(x, key=x.get))
test_stem

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join,result,top
424179,neg,is going to #vote and then drive up to #Bonn ....,"[is, going, to, #vote, and, then, drive, up, t...","[is, going, to, #vote, and, then, drive, up, t...","[is, going, to, vote, and, then, drive, up, to...","[is, go, to, #vote, and, then, drive, up, to, ...",is go to #vote and then drive up to #bonn ... ...,"{'pos': 8.310360091009599e-07, 'neg': 0.000177...",neg
882724,pos,@Bytorsnowdog haha I guess I'll have to. I th...,"[@Bytorsnowdog, haha, I, guess, I'll, have, to...","[@bytorsnowdog, haha, i, guess, i'll, have, to...","[@Bytorsnowdog, haha, I, guess, I'll, have, to...","[@bytorsnowdog, haha, i, guess, i'll, have, to...",@bytorsnowdog haha i guess i'll have to. i thi...,"{'pos': 3.756645294511516e-07, 'neg': 8.892028...",neg
1372708,pos,@jonasbrothers http://twitpic.com/6q1om - ooh ...,"[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbroth, http://twitpic.com/6q1om, -, ooh...",@jonasbroth http://twitpic.com/6q1om - ooh so ...,"{'pos': 0.09017528702834415, 'neg': 0.00575511...",pos
815556,pos,@mpilatow - The consequences of SEO would alwa...,"[@mpilatow, -, The, consequences, of, SEO, wou...","[@mpilatow, -, the, consequences, of, seo, wou...","[@mpilatow, -, The, consequences, of, SEO, wou...","[@mpilatow, -, the, consequ, of, seo, would, a...",@mpilatow - the consequ of seo would alway be ...,"{'pos': 4.790001716863826e-07, 'neg': 2.906082...",neg
770071,neg,@CarlisleRCullen yes but no time for baseball,"[@CarlisleRCullen, yes, but, no, time, for, ba...","[@carlislercullen, yes, but, no, time, for, ba...","[@CarlisleRCullen, yes, but, no, time, for, ba...","[@carlislercullen, ye, but, no, time, for, bas...",@carlislercullen ye but no time for basebal,"{'pos': 0.0051875890176379886, 'neg': 0.008083...",neg
...,...,...,...,...,...,...,...,...,...
47032,neg,"New Dollhouse episode in my line-up, I can't W...","[New, Dollhouse, episode, in, my, line-up,, I,...","[new, dollhouse, episode, in, my, line-up,, i,...","[New, Dollhouse, episode, in, my, line-up,, I,...","[new, dollhous, episod, in, my, line-up,, i, c...","new dollhous episod in my line-up, i can't wai...","{'pos': 1.4600789172049735e-05, 'neg': 2.06209...",neg
905467,pos,@kittyfisher Are they both still talking to yo...,"[@kittyfisher, Are, they, both, still, talking...","[@kittyfisher, are, they, both, still, talking...","[@kittyfisher, Are, they, both, still, talking...","[@kittyfish, are, they, both, still, talk, to,...",@kittyfish are they both still talk to you then?,"{'pos': 0.00229742425534489, 'neg': 0.00124466...",pos
117863,neg,Can't find any super-duper nice Twitter app fo...,"[Can't, find, any, super-duper, nice, Twitter,...","[can't, find, any, super-duper, nice, twitter,...","[Can't, find, any, super-duper, nice, Twitter,...","[can't, find, ani, super-dup, nice, twitter, a...",can't find ani super-dup nice twitter app for ...,"{'pos': 0.0007006699596664662, 'neg': 0.000775...",neg
135263,neg,Ugh tired as fuh driving around pines. Tossed ...,"[Ugh, tired, as, fuh, driving, around, pines.,...","[ugh, tired, as, fuh, driving, around, pines.,...","[Ugh, tired, as, fuh, driving, around, pines.,...","[ugh, tire, as, fuh, drive, around, pines., to...",ugh tire as fuh drive around pines. toss and t...,"{'pos': 2.8672081715784815e-07, 'neg': 3.51485...",neg


In [117]:
sum(test_stem.polarity == test_stem.top) / len(test_stem)

0.71875

## Open Ended

#### Using Numbers

In [107]:
new_data['numbers'] = new_data['splittweet'].apply(lambda x: [i == "NaN" for i in x if not i.isdigit()])
new_data

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join,numbers
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyru, what, part, of, ireland, are/w...",@billyraycyru what part of ireland are/wer you...,"[False, False, False, False, False, False, Fal..."
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...",@sarahdessen we still have our winter coat on,"[False, False, False, False, False, False, Fal..."
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]","[threadless, tees, $5, sale!]","[Threadless, Tees, $5, sale!]","[threadless, tee, $5, sale!]",threadless tee $5 sale!,"[False, False, False, False]"
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swim, with, seb]",gone swim with seb,"[False, False, False, False]"
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...","[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...",@kimkardashian im at work and im gonna miss it,"[False, False, False, False, False, False, Fal..."
...,...,...,...,...,...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuilder, yes, &amp;, stand, around, i...","[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuild, ye, &amp;, stand, around, in, ...",@wahmbizbuild ye &amp; stand around in the kit...,"[False, False, False, False, False, False, Fal..."
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[whi, are, planeticket, so, expens]",whi are planeticket so expens,"[False, False, False, False, False]"
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, get, tire, of, the, j...","@timothyh2o btw, i'm get tire of the jobros. l...","[False, False, False, False, False, False, Fal..."
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyone, who's, down, with, the, b...","[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyon, who', down, with, the, blo...",yay for everyon who' down with the blog/track ...,"[False, False, False, False, False, False, Fal..."


In [108]:
test_num = new_data.sample(frac=0.1)
train_num = new_data[~new_data.index.isin(test_num.index)]

In [118]:
words = [w for s in train_num.numbers for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

p_w_t_estimate = {}
p_w_t_totals = {}

for l in set(train_num.polarity):
    sub_frame = train_num[train_num.polarity == l]
    sub_words = [w for s in sub_frame.numbers for w in s]
    p_w_t_estimate[l] = ctr(sub_words)
    p_w_t_totals[l] = len(sub_words)

In [119]:
def PwtN(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [120]:
def PtwN(T, W):
    return PwtN(W, T) * Pt(T) / Pw(W)

In [121]:
def Pe(E):
    result = {}
    for p in set(train_num.polarity):
        result[p] = np.prod([PtwN(p, word) for word in E])
    return result

In [122]:
test_stem['result'] = test_stem.stem.apply(Pe)
test_stem['top'] = test_stem.result.apply(lambda x:max(x, key=x.get))
test_stem

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join,result,top
424179,neg,is going to #vote and then drive up to #Bonn ....,"[is, going, to, #vote, and, then, drive, up, t...","[is, going, to, #vote, and, then, drive, up, t...","[is, going, to, vote, and, then, drive, up, to...","[is, go, to, #vote, and, then, drive, up, to, ...",is go to #vote and then drive up to #bonn ... ...,"{'pos': 3.399613537192134e-05, 'neg': 2.737352...",pos
882724,pos,@Bytorsnowdog haha I guess I'll have to. I th...,"[@Bytorsnowdog, haha, I, guess, I'll, have, to...","[@bytorsnowdog, haha, i, guess, i'll, have, to...","[@Bytorsnowdog, haha, I, guess, I'll, have, to...","[@bytorsnowdog, haha, i, guess, i'll, have, to...",@bytorsnowdog haha i guess i'll have to. i thi...,"{'pos': 1.101301083667076e-06, 'neg': 8.249754...",pos
1372708,pos,@jonasbrothers http://twitpic.com/6q1om - ooh ...,"[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbrothers, http://twitpic.com/6q1om, -, ...","[@jonasbroth, http://twitpic.com/6q1om, -, ooh...",@jonasbroth http://twitpic.com/6q1om - ooh so ...,"{'pos': 0.016314426811923282, 'neg': 0.0149600...",pos
815556,pos,@mpilatow - The consequences of SEO would alwa...,"[@mpilatow, -, The, consequences, of, SEO, wou...","[@mpilatow, -, the, consequences, of, seo, wou...","[@mpilatow, -, The, consequences, of, SEO, wou...","[@mpilatow, -, the, consequ, of, seo, would, a...",@mpilatow - the consequ of seo would alway be ...,"{'pos': 2.7931655259877207e-07, 'neg': 2.03275...",pos
770071,neg,@CarlisleRCullen yes but no time for baseball,"[@CarlisleRCullen, yes, but, no, time, for, ba...","[@carlislercullen, yes, but, no, time, for, ba...","[@CarlisleRCullen, yes, but, no, time, for, ba...","[@carlislercullen, ye, but, no, time, for, bas...",@carlislercullen ye but no time for basebal,"{'pos': 0.008216126613893586, 'neg': 0.0074259...",pos
...,...,...,...,...,...,...,...,...,...
47032,neg,"New Dollhouse episode in my line-up, I can't W...","[New, Dollhouse, episode, in, my, line-up,, I,...","[new, dollhouse, episode, in, my, line-up,, i,...","[New, Dollhouse, episode, in, my, line-up,, I,...","[new, dollhous, episod, in, my, line-up,, i, c...","new dollhous episod in my line-up, i can't wai...","{'pos': 1.7120831508137052e-05, 'neg': 1.35879...",pos
905467,pos,@kittyfisher Are they both still talking to yo...,"[@kittyfisher, Are, they, both, still, talking...","[@kittyfisher, are, they, both, still, talking...","[@kittyfisher, Are, they, both, still, talking...","[@kittyfish, are, they, both, still, talk, to,...",@kittyfish are they both still talk to you then?,"{'pos': 0.0020838081388845054, 'neg': 0.001829...",pos
117863,neg,Can't find any super-duper nice Twitter app fo...,"[Can't, find, any, super-duper, nice, Twitter,...","[can't, find, any, super-duper, nice, twitter,...","[Can't, find, any, super-duper, nice, Twitter,...","[can't, find, ani, super-dup, nice, twitter, a...",can't find ani super-dup nice twitter app for ...,"{'pos': 0.0010494289321660024, 'neg': 0.000908...",pos
135263,neg,Ugh tired as fuh driving around pines. Tossed ...,"[Ugh, tired, as, fuh, driving, around, pines.,...","[ugh, tired, as, fuh, driving, around, pines.,...","[Ugh, tired, as, fuh, driving, around, pines.,...","[ugh, tire, as, fuh, drive, around, pines., to...",ugh tire as fuh drive around pines. toss and t...,"{'pos': 8.622240978959021e-06, 'neg': 6.744890...",pos


In [123]:
sum(test_stem.polarity == test_stem.top) / len(test_stem)

0.49375

I think a way to improve the classifier would be to test the combination of all the @ and # in a tweet. I think this would better the classifier becasue that is a pretty unique combination of all of those specific characters so I think it woud improve the classifier overall especially when possibly identifying harmful tweets attcking someone by not only tagging them but also creating a harmful hashtag.

In [136]:
new_data['atandtag'] = new_data['splittweet'].apply(lambda x: [i for i in x if i[0] =='@' or i[0] == '#'])
new_data

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join,numbers,atandtag
1465654,pos,@billyraycyrus what part of ireland are/were y...,"[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyrus, what, part, of, ireland, are/...","[@billyraycyru, what, part, of, ireland, are/w...",@billyraycyru what part of ireland are/wer you...,"[False, False, False, False, False, False, Fal...",[@billyraycyrus]
143745,neg,@sarahdessen we still have our winter coats on,"[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...","[@sarahdessen, we, still, have, our, winter, c...",@sarahdessen we still have our winter coat on,"[False, False, False, False, False, False, Fal...",[@sarahdessen]
1215916,pos,Threadless Tees $5 sale!,"[Threadless, Tees, $5, sale!]","[threadless, tees, $5, sale!]","[Threadless, Tees, $5, sale!]","[threadless, tee, $5, sale!]",threadless tee $5 sale!,"[False, False, False, False]",[]
1481839,pos,gone swimming with sebs,"[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swimming, with, sebs]","[gone, swim, with, seb]",gone swim with seb,"[False, False, False, False]",[]
66450,neg,@KimKardashian im at work and im gonna miss it,"[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...","[@KimKardashian, im, at, work, and, im, gonna,...","[@kimkardashian, im, at, work, and, im, gonna,...",@kimkardashian im at work and im gonna miss it,"[False, False, False, False, False, False, Fal...",[@KimKardashian]
...,...,...,...,...,...,...,...,...,...
785955,neg,@WAHMBizbuilder Yes &amp; stand around in the...,"[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuilder, yes, &amp;, stand, around, i...","[@WAHMBizbuilder, Yes, &amp;, stand, around, i...","[@wahmbizbuild, ye, &amp;, stand, around, in, ...",@wahmbizbuild ye &amp; stand around in the kit...,"[False, False, False, False, False, False, Fal...",[@WAHMBizbuilder]
339404,neg,why are planetickets so expensive,"[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[why, are, planetickets, so, expensive]","[whi, are, planeticket, so, expens]",whi are planeticket so expens,"[False, False, False, False, False]",[]
418113,neg,"@timothyh2o btw, i'm getting tire of the jobr...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, getting, tire, of, th...","[@timothyh2o, btw,, i'm, get, tire, of, the, j...","@timothyh2o btw, i'm get tire of the jobros. l...","[False, False, False, False, False, False, Fal...",[@timothyh2o]
1200850,pos,Yay for everyone who's down with the blog/trac...,"[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyone, who's, down, with, the, b...","[Yay, for, everyone, who's, down, with, the, b...","[yay, for, everyon, who', down, with, the, blo...",yay for everyon who' down with the blog/track ...,"[False, False, False, False, False, False, Fal...",[]


In [137]:
test_at = new_data.sample(frac=0.1)
train_at = new_data[~new_data.index.isin(test_at.index)]

In [139]:
words = [w for s in train_at.atandtag for w in s]
p_w_estimate = ctr(words)
p_w_total = len(words)

p_w_t_estimate = {}
p_w_t_totals = {}

for l in set(train_at.polarity):
    sub_frame = train_at[train_at.polarity == l]
    sub_words = [w for s in sub_frame.atandtag for w in s]
    p_w_t_estimate[l] = ctr(sub_words)
    p_w_t_totals[l] = len(sub_words)

In [140]:
def PwtA(W, T):
    if W not in p_w_t_estimate[T]: return smoother
    return p_w_t_estimate[T][W] / p_w_t_totals[T]

In [142]:
def PtwA(T, W):
    return PwtA(W, T) * Pt(T) / Pw(W)

In [143]:
def Pe(E):
    result = {}
    for p in set(train_at.polarity):
        result[p] = np.prod([PtwA(p, word) for word in E])
    return result

In [144]:
test_at['result'] = test_at.atandtag.apply(Pe)
test_at['top'] = test_at.result.apply(lambda x:max(x, key=x.get))
test_at

Unnamed: 0,polarity,tweet,splittweet,lowercase,hashtag,stem,join,numbers,atandtag,result,top
719347,neg,Packing for a trip. As exciting as a trip to t...,"[Packing, for, a, trip., As, exciting, as, a, ...","[packing, for, a, trip., as, exciting, as, a, ...","[Packing, for, a, trip., As, exciting, as, a, ...","[pack, for, a, trip., as, excit, as, a, trip, ...",pack for a trip. as excit as a trip to the den...,"[False, False, False, False, False, False, Fal...",[],"{'pos': 1.0, 'neg': 1.0}",pos
1252432,pos,@farhan Hey thanks for the spoils!,"[@farhan, Hey, thanks, for, the, spoils!]","[@farhan, hey, thanks, for, the, spoils!]","[@farhan, Hey, thanks, for, the, spoils!]","[@farhan, hey, thank, for, the, spoils!]",@farhan hey thank for the spoils!,"[False, False, False, False, False, False]",[@farhan],"{'pos': 0.5036111111111111, 'neg': 0.496388888...",pos
226378,neg,Feelings suck,"[Feelings, suck]","[feelings, suck]","[Feelings, suck]","[feel, suck]",feel suck,"[False, False]",[],"{'pos': 1.0, 'neg': 1.0}",pos
112435,neg,Eating my free hotel breakfast (better than no...,"[Eating, my, free, hotel, breakfast, (better, ...","[eating, my, free, hotel, breakfast, (better, ...","[Eating, my, free, hotel, breakfast, (better, ...","[eat, my, free, hotel, breakfast, (better, tha...",eat my free hotel breakfast (better than nothi...,"[False, False, False, False, False, False, Fal...",[],"{'pos': 1.0, 'neg': 1.0}",pos
392408,neg,i feel so bad for what ive done last night,"[i, feel, so, bad, for, what, ive, done, last,...","[i, feel, so, bad, for, what, ive, done, last,...","[i, feel, so, bad, for, what, ive, done, last,...","[i, feel, so, bad, for, what, ive, done, last,...",i feel so bad for what ive done last night,"[False, False, False, False, False, False, Fal...",[],"{'pos': 1.0, 'neg': 1.0}",pos
...,...,...,...,...,...,...,...,...,...,...,...
50979,neg,@GermanKitty iï¿½m good too. i know what you m...,"[@GermanKitty, iï¿½m, good, too., i, know, wha...","[@germankitty, iï¿½m, good, too., i, know, wha...","[@GermanKitty, iï¿½m, good, too., i, know, wha...","[@germankitti, iï¿½m, good, too., i, know, wha...",@germankitti iï¿½m good too. i know what you m...,"[False, False, False, False, False, False, Fal...",[@GermanKitty],"{'pos': 0.5036111111111111, 'neg': 0.496388888...",pos
247749,neg,@twoname: Aha! I knew the people in the tech b...,"[@twoname:, Aha!, I, knew, the, people, in, th...","[@twoname:, aha!, i, knew, the, people, in, th...","[@twoname:, Aha!, I, knew, the, people, in, th...","[@twoname:, aha!, i, knew, the, peopl, in, the...",@twoname: aha! i knew the peopl in the tech bo...,"[False, False, False, False, False, False, Fal...",[@twoname:],"{'pos': 0.5036111111111111, 'neg': 0.496388888...",pos
760819,neg,Tryin 2 learn history..! It just ain't appenin...,"[Tryin, 2, learn, history..!, It, just, ain't,...","[tryin, 2, learn, history..!, it, just, ain't,...","[Tryin, 2, learn, history..!, It, just, ain't,...","[tryin, 2, learn, history..!, it, just, ain't,...",tryin 2 learn history..! it just ain't appenin...,"[False, False, False, False, False, False, Fal...",[],"{'pos': 1.0, 'neg': 1.0}",pos
1574560,pos,@princeofny lol...thanks! my girl got me!!,"[@princeofny, lol...thanks!, my, girl, got, me!!]","[@princeofny, lol...thanks!, my, girl, got, me!!]","[@princeofny, lol...thanks!, my, girl, got, me!!]","[@princeofni, lol...thanks!, my, girl, got, me!!]",@princeofni lol...thanks! my girl got me!!,"[False, False, False, False, False, False]",[@princeofny],"{'pos': 0.5036111111111111, 'neg': 0.496388888...",pos


In [145]:
sum(test_stem.polarity == test_stem.top) / len(test_stem)

0.49375

I am not really sure if it overall improved my classifier. It obviously lessened in accuracy so I would have to say that it didn't accuratley do it's job.

# Summary

### the baseline (i.e., most common type): (0.5036111111111111, 0.4963888888888889)
### classifier accuracy on just the split text: 0.71
### classifier accuracy on the lower-cased, split text: 0.7275
### classifier accuracy when you lower-case, split the text and remove all hashtags: 0.725625
### classifier accuracy when you lower-case, split the text and stem all of the words: 0.49375
### open ended: 0.49375