## TextBlob

In [38]:
# https://textblob.readthedocs.io/en/dev/

# How to install TextBlob
#     1. pip install -U textblob
#     2. python -m textblob.download_corpora

from textblob import TextBlob

blob = TextBlob(dfT['text'][0]) # run json-to-pandas

# Part-of-speech Tagging
print blob.tags
print

# Noun Phrase Extraction¶
print blob.noun_phrases
print 

# Tokenization
print blob.words

[(u'One', u'CD'), (u'of', u'IN'), (u'China', u'NNP'), (u"'s", u'POS'), (u'first', u'JJ'), (u'female', u'NN'), (u'fighter', u'NN'), (u'pilots', u'NNS'), (u'was', u'VBD'), (u'killed', u'VBN'), (u'in', u'IN'), (u'a', u'DT'), (u'training', u'NN'), (u'accident', u'NN'), (u'according', u'VBG'), (u'to', u'TO'), (u'state-run', u'JJ'), (u'media', u'NNS'), (u'reports\u2026', u'NN'), (u'https', u'NN'), (u'//t.co/DoEZLme8Cq', u'NN')]

[u'china', u'female fighter pilots', u'state-run media reports\u2026 https']

[u'One', u'of', u'China', u"'s", u'first', u'female', u'fighter', u'pilots', u'was', u'killed', u'in', u'a', u'training', u'accident', u'according', u'to', u'state-run', u'media', u'reports\u2026', u'https', u't.co/DoEZLme8Cq']


In [39]:
# The subjectivity is a float within the range [0.0, 1.0] 
# where 0.0 is very objective and 1.0 is very subjective
for sentence in blob.sentences:
    print sentence.sentiment.polarity
print

# Lemmatize each word
for sentence in blob.sentences:    
    for word in sentence.words:
        print "%s---%s" % (word, word.lemmatize('v')) # 'v' for 'verb'

0.0166666666667

One---One
of---of
China---China
's---'s
first---first
female---female
fighter---fighter
pilots---pilot
was---be
killed---kill
in---in
a---a
training---train
accident---accident
according---accord
to---to
state-run---state-run
media---media
reports…---reports…
https---https
t.co/DoEZLme8Cq---t.co/DoEZLme8Cq


## NLTK

http://www.nltk.org/

In [11]:
import nltk
from nltk.corpus import treebank # to draw a parse tree

sentence = dfT['text'][0] # run json-to-pandas

tokens = nltk.word_tokenize(sentence)

tagged = nltk.pos_tag(tokens)

# Identify named entities - Make parse tree?
# You might need to call nltk.download() and down load some packages
entities = nltk.chunk.ne_chunk(tagged)

In [12]:
# Example from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
 
def word_feats(words):
    return dict([(word, True) for word in words])
 
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
 
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
 
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
 
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
 
classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

train on 1500 instances, test on 500 instances
accuracy: 0.728
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


## Supervised Learning w/ manual scripting

In [37]:
# Task 1: Load the texts
import pandas as pd
import glob, os         # for reading all .txt files
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ARDRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier

cutoff = 800

# Read sample texts
sample = pd.read_csv('/Users/Haru/Documents/! College/4. Fall 2016/489Project/sample_data_results.csv') 
# Pos = 1, Neg = -1
sample['Answer.sentiment'] = sample['Answer.sentiment'].map({'Positive':1, 'Negative':-1})

train = pd.DataFrame({'label':sample['Answer.sentiment'][:cutoff], 'texts':sample['Input.content'][:cutoff]})
test  = pd.DataFrame({'label':sample['Answer.sentiment'][cutoff:], 'texts':sample['Input.content'][cutoff:]})

print train.shape
print test.shape

(800, 2)
(72, 2)


In [38]:
# Task 4: feature engineering
for h in range(1,30): # min_df
    for i in range(70,100): # max_df
        for j in range(0,2): # stop_words: k=0 'engl' k=1 none
            for k in range(0,2): # k=0 CountVectorizer (count), k=1 TfidfVectorizer (weighed)
                for l in range(0,16): # l=0 MultinomialNB, l=1 GaussianNB, l=2 BernoulliNB              
                                      # l=3 LogisticRegression, l=4 SGDClassifier, l=5 Ridge
                                      # l=6 ElasticNet, l=7 LassoLars, l=8 SGDRegressor
                                      # l=9 ARDRegression, l=10 GradientBoostingClassifier, l=11 RandomForestRegressor
                                      # l=12 SVC, l=13 LinearSVC, l=14 SVR, l=15 KNeighborsClassifier 

                    if j==0 & k==0:
                        tf_vectorizer = CountVectorizer(max_df=i/100.0, min_df=h/100.0, stop_words='english')
                    elif j==1 & k==0:
                        tf_vectorizer = CountVectorizer(max_df=i/100.0, min_df=h/100.0)
                    elif j==0 & k==1:
                        tf_vectorizer = TfidfVectorizer(max_df=i/100.0, min_df=h/100.0, stop_words='english')
                    elif j==1 & k==1:
                        tf_vectorizer = TfidfVectorizer(max_df=i/100.0, min_df=h/100.0)

                    train_tf_ = tf_vectorizer.fit_transform(train['texts'].values)
                    test_tf_  = tf_vectorizer.transform(test['texts'].values)

                    if l==0:
                        clf = MultinomialNB()
                    elif l==1:
                        clf = GaussianNB()
                    elif l==2:
                        clf = BernoulliNB()
                    elif l==3:
                        clf = LogisticRegression()
                    elif l==4:
                        clf = SGDClassifier()
                    elif l==5:
                        clf = Ridge()
                    elif l==6:
                        clf = ElasticNet()
                    elif l==7:
                        clf = LassoLars()
                    elif l==8:
                        clf = SGDRegressor()
                    elif l==9:
                        continue #skip - takes too long
                        clf = ARDRegression()
                    elif l==10:
                        clf = GradientBoostingClassifier()
                    elif l==11:
                        clf = RandomForestRegressor()
                    elif l==12:
                        clf = SVC()
                    elif l==13:
                        clf = LinearSVC()
                    elif l==14:
                        clf = SVR()
                    elif l==15:
                        clf = KNeighborsClassifier()
                    
                    if l==1 or l==5 or l==7 or l==9 or l==10:
                        clf.fit(train_tf_.toarray(), train['label'])
                        print "%d %d %d %d %d \t %.4f" % (h,i,j,k,l,clf.score(test_tf_.toarray(), test['label']))
                    else:
                        clf.fit(train_tf_, train['label'])
                        print "%d %d %d %d %d \t %.4f" % (h,i,j,k,l,clf.score(test_tf_, test['label']))


1 70 0 0 0 	 0.6389
1 70 0 0 1 	 0.8056
1 70 0 0 2 	 0.5833
1 70 0 0 3 	 0.6389
1 70 0 0 4 	 0.6667
1 70 0 0 5 	 0.0487
1 70 0 0 6 	 -0.0192
1 70 0 0 7 	 -0.0192
1 70 0 0 8 	 0.0822
1 70 0 0 10 	 0.7222
1 70 0 0 11 	 -0.0854
1 70 0 0 12 	 0.5833
1 70 0 0 13 	 0.6389
1 70 0 0 14 	 0.0865
1 70 0 0 15 	 0.4722
1 70 0 1 0 	 0.6389
1 70 0 1 1 	 0.8056
1 70 0 1 2 	 0.5833
1 70 0 1 3 	 0.6389
1 70 0 1 4 	 0.6111
1 70 0 1 5 	 0.0487
1 70 0 1 6 	 -0.0192
1 70 0 1 7 	 -0.0192
1 70 0 1 8 	 0.0799
1 70 0 1 10 	 0.6667
1 70 0 1 11 	 -0.2464
1 70 0 1 12 	 0.5833
1 70 0 1 13 	 0.6389
1 70 0 1 14 	 0.0865
1 70 0 1 15 	 0.4722
1 70 1 0 0 	 0.6389
1 70 1 0 1 	 0.8056
1 70 1 0 2 	 0.5833
1 70 1 0 3 	 0.6389
1 70 1 0 4 	 0.5278
1 70 1 0 5 	 0.0487
1 70 1 0 6 	 -0.0192
1 70 1 0 7 	 -0.0192
1 70 1 0 8 	 0.0809
1 70 1 0 10 	 0.6944
1 70 1 0 11 	 -0.0616
1 70 1 0 12 	 0.5833
1 70 1 0 13 	 0.6389
1 70 1 0 14 	 0.0865
1 70 1 0 15 	 0.4722
1 70 1 1 0 	 0.7222
1 70 1 1 1 	 0.7222
1 70 1 1 2 	 0.7222
1 70 1 1 3 	 

KeyboardInterrupt: 

Out of the 20,000+ possibilities and running 8000+ cases, <br>
It never hit 0.7.. ->> For train:test = 1:1 size

Train:test = 10:1 size. I do get up to 0.8+ <br>
Now we also have up to (more reasonable) 14400 cases (all expected to run)

## Supervised Learning w/ TPOT

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import csv
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
for i in range(0,2): # 'Negative' from sample = -1 or 0
    for j in range(1,10): # min_df
        for k in range(90,100): # max_df
            for l in range(0,2): # l=0 CountVectorizer (count), l=1 TfidfVectorizer (weighed)
                for m in range(5,10): # train:test = m/10 : (1-m/10) so 50:50 to 90:10
                    for n in range(4,6): # generation (# of TPOT iteration)
                        for p in range(3,5): # pop_size 5p 
                            for q in range(9,11): # k-fold number
                                print ""    

                                sample = pd.read_csv('/Users/Haru/Documents/! College/4. Fall 2016/489Project/sample_data_results.csv') 

                                # label ('positive','Negative') ->> (#,#) (e.g. (1,-1) or (1,0))
                                if i==0:
                                    sample['Answer.sentiment'] = sample['Answer.sentiment'].map({'Positive':1, 'Negative':-1})
                                elif i==1:
                                    sample['Answer.sentiment'] = sample['Answer.sentiment'].map({'Positive':1, 'Negative':0})

                                if l==0:
                                    tf_vectorizer = CountVectorizer(min_df=j/100.0, max_df=k/100.0)
                                elif l==1:
                                    tf_vectorizer = TfidfVectorizer(min_df=j/100.0, max_df=k/100.0)

                                sample_input_tf  = tf_vectorizer.fit_transform(sample['Input.content'].values)

                                X_train, X_test, y_train, y_test = train_test_split(sample_input_tf, sample['Answer.sentiment'].values,
                                                                    train_size=m/10.0, test_size=(1-m/10.0))#, random_state=)

                                # Official website example: gen=5, pop_size=20, verbo=2
                                tpot = TPOTClassifier(generations=n, population_size=5*p, num_cv_folds=q,
                                                      verbosity=2)
                                tpot.fit(X_train, y_train)
                                print(tpot.score(X_test, y_test))
                                print "%d %d %d %d %d %d %d %d" % (i,j,k,l,m,n,p,q)




Optimization Progress:  20%|██        | 15/75 [01:27<09:55,  9.92s/pipeline]

Generation 1 - Current best internal CV score: 0.730170602345


Optimization Progress:  39%|███▊      | 29/75 [06:32<13:10, 17.18s/pipeline]

Generation 2 - Current best internal CV score: 0.730170602345


Optimization Progress:  60%|██████    | 45/75 [13:02<10:46, 21.53s/pipeline]

Generation 3 - Current best internal CV score: 0.739191581365


Optimization Progress:  79%|███████▊  | 59/75 [19:13<07:42, 28.88s/pipeline]

Generation 4 - Current best internal CV score: 0.739191581365





Best pipeline: DecisionTreeClassifier(Nystroem(input_matrix, 19, 0.53000000000000003, 24))
0.694951472663
0 1 90 0 5 4 3 9



Optimization Progress:  20%|██        | 15/75 [02:29<17:58, 17.98s/pipeline]

Generation 1 - Current best internal CV score: 0.752644927536


Optimization Progress:  41%|████▏     | 31/75 [06:50<14:50, 20.25s/pipeline]

Generation 2 - Current best internal CV score: 0.752644927536


Optimization Progress:  59%|█████▊    | 44/75 [06:55<01:26,  2.79s/pipeline]

Generation 3 - Current best internal CV score: 0.752644927536


Optimization Progress:  81%|████████▏ | 61/75 [07:00<00:03,  4.00pipeline/s]

Generation 4 - Current best internal CV score: 0.752644927536


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: DecisionTreeClassifier(input_matrix)
0.672018348624
0 1 90 0 5 4 3 10



Optimization Progress:  20%|██        | 20/100 [02:51<16:01, 12.02s/pipeline]

Generation 1 - Current best internal CV score: 0.763276972625


Optimization Progress:  42%|████▏     | 42/100 [08:27<17:21, 17.96s/pipeline]

Generation 2 - Current best internal CV score: 0.763276972625


Optimization Progress:  59%|█████▉    | 59/100 [08:31<00:55,  1.35s/pipeline]

Generation 3 - Current best internal CV score: 0.763276972625


Optimization Progress:  83%|████████▎ | 83/100 [09:39<00:08,  2.00pipeline/s]

Generation 4 - Current best internal CV score: 0.763276972625


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: DecisionTreeClassifier(input_matrix)
0.692401960784
0 1 90 0 5 4 4 9



Optimization Progress:  21%|██        | 21/100 [00:01<00:07, 11.09pipeline/s]

Generation 1 - Current best internal CV score: 0.72766798419


Optimization Progress:  38%|███▊      | 38/100 [00:03<00:05, 10.74pipeline/s]

Generation 2 - Current best internal CV score: 0.72766798419


Optimization Progress:  57%|█████▋    | 57/100 [00:06<00:09,  4.66pipeline/s]

Generation 3 - Current best internal CV score: 0.732213438735


Optimization Progress:  81%|████████  | 81/100 [00:54<00:10,  1.78pipeline/s]

Generation 4 - Current best internal CV score: 0.732213438735


Optimization Progress:   1%|          | 1/90 [00:00<00:09,  9.71pipeline/s]


Best pipeline: LinearSVC(Binarizer(input_matrix, 0.34000000000000002), 0.42999999999999999, 49, True)
0.729801604052
0 1 90 0 5 4 4 10



Optimization Progress:  18%|█▊        | 16/90 [01:02<03:20,  2.71s/pipeline]

Generation 1 - Current best internal CV score: 0.69483226918


Optimization Progress:  37%|███▋      | 33/90 [03:04<04:45,  5.01s/pipeline]

Generation 2 - Current best internal CV score: 0.69483226918


Optimization Progress:  54%|█████▍    | 49/90 [03:05<00:20,  1.97pipeline/s]

Generation 3 - Current best internal CV score: 0.69483226918


Optimization Progress:  69%|██████▉   | 62/90 [03:10<00:12,  2.19pipeline/s]

Generation 4 - Current best internal CV score: 0.69483226918


Optimization Progress:  84%|████████▍ | 76/90 [03:11<00:01,  9.49pipeline/s]

Generation 5 - Current best internal CV score: 0.69483226918


Optimization Progress:   0%|          | 0/90 [00:00<?, ?pipeline/s]


Best pipeline: DecisionTreeClassifier(RBFSampler(input_matrix, 0.55000000000000004))
0.730289757412
0 1 90 0 5 5 3 9



Optimization Progress:  19%|█▉        | 17/90 [01:05<04:29,  3.69s/pipeline]

Generation 1 - Current best internal CV score: 0.728938923395


Optimization Progress:  37%|███▋      | 33/90 [03:41<08:59,  9.47s/pipeline]

Generation 2 - Current best internal CV score: 0.731247412008


Optimization Progress:  51%|█████     | 46/90 [03:42<01:12,  1.65s/pipeline]

Generation 3 - Current best internal CV score: 0.731247412008


Optimization Progress:  64%|██████▍   | 58/90 [03:43<00:08,  3.81pipeline/s]

Generation 4 - Current best internal CV score: 0.731247412008


Optimization Progress:  84%|████████▍ | 76/90 [04:33<00:17,  1.26s/pipeline]

Generation 5 - Current best internal CV score: 0.731247412008


Optimization Progress:   1%|          | 1/120 [00:00<00:12,  9.31pipeline/s]


Best pipeline: DecisionTreeClassifier(input_matrix)
0.695038746631
0 1 90 0 5 5 3 10



Optimization Progress:  17%|█▋        | 20/120 [02:38<13:29,  8.10s/pipeline]

Generation 1 - Current best internal CV score: 0.732443329617


Optimization Progress:  32%|███▎      | 39/120 [04:05<09:01,  6.68s/pipeline]

Generation 2 - Current best internal CV score: 0.732443329617


Optimization Progress:  48%|████▊     | 57/120 [07:54<11:35, 11.03s/pipeline]

Generation 3 - Current best internal CV score: 0.732443329617


Optimization Progress:  64%|██████▍   | 77/120 [13:36<12:31, 17.48s/pipeline]

Generation 4 - Current best internal CV score: 0.732443329617


Optimization Progress:  82%|████████▏ | 98/120 [18:51<10:07, 27.63s/pipeline]

Generation 5 - Current best internal CV score: 0.732443329617





Best pipeline: RandomForestClassifier(input_matrix)


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]

0.761808367072
0 1 90 0 5 5 4 9



Optimization Progress:  18%|█▊        | 21/120 [00:44<03:43,  2.26s/pipeline]

Generation 1 - Current best internal CV score: 0.710939676266


Optimization Progress:  32%|███▏      | 38/120 [04:22<13:57, 10.21s/pipeline]

Generation 2 - Current best internal CV score: 0.710939676266


Optimization Progress:  49%|████▉     | 59/120 [04:30<00:23,  2.62pipeline/s]

Generation 3 - Current best internal CV score: 0.710939676266


Optimization Progress:  64%|██████▍   | 77/120 [04:40<00:16,  2.56pipeline/s]

Generation 4 - Current best internal CV score: 0.710939676266


Optimization Progress:  82%|████████▏ | 98/120 [04:48<00:12,  1.82pipeline/s]

Generation 5 - Current best internal CV score: 0.710939676266


Optimization Progress:   0%|          | 0/75 [00:00<?, ?pipeline/s]


Best pipeline: LinearSVC(input_matrix, 9.0, 38, True)
0.691016324381
0 1 90 0 5 5 4 10



Optimization Progress:  21%|██▏       | 16/75 [01:18<04:24,  4.49s/pipeline]

Generation 1 - Current best internal CV score: 0.712237583205


Optimization Progress:  40%|████      | 30/75 [05:27<18:44, 24.99s/pipeline]

Generation 2 - Current best internal CV score: 0.712237583205


Optimization Progress:  59%|█████▊    | 44/75 [13:02<17:29, 33.84s/pipeline]

Generation 3 - Current best internal CV score: 0.712237583205


Optimization Progress:  77%|███████▋  | 58/75 [16:20<03:48, 13.44s/pipeline]

Generation 4 - Current best internal CV score: 0.712237583205





Best pipeline: ExtraTreesClassifier(Binarizer(input_matrix, 0.27000000000000002), 14, 0.94000000000000006)
0.697577194022
0 1 90 0 6 4 3 9



Optimization Progress:  21%|██▏       | 16/75 [03:25<20:30, 20.85s/pipeline]

Generation 1 - Current best internal CV score: 0.75966951567


Optimization Progress:  39%|███▊      | 29/75 [10:00<26:20, 34.37s/pipeline]

Generation 2 - Current best internal CV score: 0.75966951567


Optimization Progress:  60%|██████    | 45/75 [10:07<00:12,  2.38pipeline/s]

Generation 3 - Current best internal CV score: 0.75966951567


Optimization Progress:  83%|████████▎ | 62/75 [10:09<00:01,  8.96pipeline/s]

Generation 4 - Current best internal CV score: 0.759811965812


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Best pipeline: LogisticRegression(input_matrix, 0.98999999999999999, 5, True)
0.71221449851
0 1 90 0 6 4 3 10



Optimization Progress:  20%|██        | 20/100 [03:30<07:44,  5.81s/pipeline]

Generation 1 - Current best internal CV score: 0.732525126622


Optimization Progress:  37%|███▋      | 37/100 [11:04<33:28, 31.88s/pipeline]

Generation 2 - Current best internal CV score: 0.732525126622


Optimization Progress:  51%|█████     | 51/100 [21:21<44:59, 55.09s/pipeline]

CountVectorizer, Pos=1 Neg=-1, train:test=75:25, size=872 in TPOT = 0.78 <br>
CountVectorizer, Pos=1 Neg=-1, train:test=92:08, size=872 in TPOT = 0.78 <br>
CountVectorizer, Pos=1 Neg =0, train:test=92:08, size=872 in TPOT = 0.86 <br>



## Unsupervised Learning

In [42]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


from sklearn.decomposition import NMF, LatentDirichletAllocation

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   #max_features=n_features,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(train['texts'].values)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                #max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(train['texts'].values)

lda = LatentDirichletAllocation(#n_topics=n_topics, 
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda.fit(tf)

tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

Topic #0:
trump donald presidency nigel farage loathsome creature calls brexit leader obama inside national guru security mind change climate experts increasingly
Topic #1:
trump new york protesters win yorkers database tweets unfair hours info light calling mayor nyc deleted undocumented praise calls denounce
Topic #2:
rt help trump author historian lady future shyness cnnnewsroom detect melania speaker ryan deportation erecting force planning paul cnnpolitics allies
Topic #3:
america like americans hope trump promised watching does reid white fear feel tears innocent celebrate nationalists breath says deep everybody
Topic #4:
trump obamacare pulling going pol rug backing repeal day interview pledge appeared country end needs replace open agreed work democrats
Topic #5:
trump says kelly denies advance saw megyn debate report book question children gets advising rudy cnnsotu government giuliani jobs lead
Topic #6:
president cast ballots million unqualified candidates voters nearly 18 l

## Ensemble & Bagging (Bootstrap AGgregating)

In [104]:
# Nope... http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/

## So text extraction + ..?

In [108]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [48]:
# Task 1: Load the texts
import pandas as pd
import glob, os         # for reading all .txt files
import csv
import numpy as np
from textblob import TextBlob # use kernel Python[Root]

cutoff = 436

# Read sample texts
sample = pd.read_csv('/Users/Haru/Documents/! College/4. Fall 2016/489Project/sample_data_results.csv') 

# Lemmatize
new_sample = pd.DataFrame(columns=("Input.content", "Answer.sentiment"))
i=0
for text in sample['Input.content']:
    blob = TextBlob(text)
    newtexts = ""

    for sentence in blob.sentences:
        newtext = ""
#        print sentence.dict

        for word in sentence.words:
            newtext += " " + word.lemmatize('v') # 'v' for 'verb'

        newtexts += newtext
        new_sample.loc[i] = newtexts
        i += 1

i=0
for answer in sample['Answer.sentiment']: # updating answers
    new_sample['Answer.sentiment'].loc[i] = answer
    i += 1
    
sample = new_sample

In [49]:
train = pd.DataFrame({'label':sample['Answer.sentiment'][:cutoff], 'texts':sample['Input.content'][:cutoff]})
test  = pd.DataFrame({'label':sample['Answer.sentiment'][cutoff:], 'texts':sample['Input.content'][cutoff:]})

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Task 4: feature engineering
for h in range(1,30): # min_df
    for i in range(h+5,100): # max_df
        for j in range(0,2): # stop_words: k=0 'engl' k=1 none
            for k in range(0,2): # k=0 CountVectorizer (count), k=1 TfidfVectorizer (weighed)
                for l in range(0,3): # l=0 MultinomialNB, l=1 GaussianNB, l=2 BernoulliNB              

                    if j==0 & k==0:
                        tf_vectorizer = CountVectorizer(max_df=i/100.0, min_df=h/100.0, stop_words='english')
                    elif j==1 & k==0:
                        tf_vectorizer = CountVectorizer(max_df=i/100.0, min_df=h/100.0)
                    elif j==0 & k==1:
                        tf_vectorizer = TfidfVectorizer(max_df=i/100.0, min_df=h/100.0, stop_words='english')
                    elif j==1 & k==1:
                        tf_vectorizer = TfidfVectorizer(max_df=i/100.0, min_df=h/100.0)

                    train_tf_ = tf_vectorizer.fit_transform(train['texts'].values)
                    test_tf_  = tf_vectorizer.transform(test['texts'].values)

                    if l==0:
                        clf = MultinomialNB()
                    elif l==1:
                        clf = GaussianNB()
                    elif l==2:
                        clf = BernoulliNB()

                    if l==0 | l==2:
                        clf.fit(train_tf_, train['label'])
                        print "%d %d %d %d %d - %.4f" % (h,i,j,k,l,clf.score(test_tf_, test['label']))
                    elif l==1:
                        clf.fit(train_tf_.toarray(), train['label'])
                        print "%d %d %d %d %d - %.4f" % (h,i,j,k,l,clf.score(test_tf_.toarray(), test['label']))

1 6 0 0 1 - 0.7778
1 6 0 0 2 - 0.6389
1 6 0 1 1 - 0.7778
1 6 0 1 2 - 0.6389
1 6 1 0 1 - 0.7778
1 6 1 0 2 - 0.6389
1 6 1 1 1 - 0.7222
1 6 1 1 2 - 0.6944
1 7 0 0 1 - 0.8056
1 7 0 0 2 - 0.6389
1 7 0 1 1 - 0.8056
1 7 0 1 2 - 0.6389
1 7 1 0 1 - 0.8056
1 7 1 0 2 - 0.6389
1 7 1 1 1 - 0.7222
1 7 1 1 2 - 0.6389
1 8 0 0 1 - 0.8056
1 8 0 0 2 - 0.6389
1 8 0 1 1 - 0.8056
1 8 0 1 2 - 0.6389
1 8 1 0 1 - 0.8056
1 8 1 0 2 - 0.6389
1 8 1 1 1 - 0.7222
1 8 1 1 2 - 0.6389
1 9 0 0 1 - 0.8056
1 9 0 0 2 - 0.5833
1 9 0 1 1 - 0.8056
1 9 0 1 2 - 0.5833
1 9 1 0 1 - 0.8056
1 9 1 0 2 - 0.5833
1 9 1 1 1 - 0.7222
1 9 1 1 2 - 0.6389
1 10 0 0 1 - 0.8056
1 10 0 0 2 - 0.6389
1 10 0 1 1 - 0.8056
1 10 0 1 2 - 0.6389
1 10 1 0 1 - 0.8056
1 10 1 0 2 - 0.6389
1 10 1 1 1 - 0.7222
1 10 1 1 2 - 0.6944
1 11 0 0 1 - 0.8056
1 11 0 0 2 - 0.6389
1 11 0 1 1 - 0.8056
1 11 0 1 2 - 0.6389
1 11 1 0 1 - 0.8056
1 11 1 0 2 - 0.6389
1 11 1 1 1 - 0.7222
1 11 1 1 2 - 0.6944
1 12 0 0 1 - 0.8056
1 12 0 0 2 - 0.6389
1 12 0 1 1 - 0.8056
1 12 0 1 2 -

KeyboardInterrupt: 