# Sentiment Analysis

In [3]:
# for mongodb
import pymongo

# normal stuff
import pandas as pd
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('max_colwidth', -1)

from collections import defaultdict, Counter
import string
import re
import codecs

# sklearn
from sklearn.metrics.pairwise import cosine_similarity

# nltk
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument
 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# gensim
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim import similarities
from gensim.models import doc2vec

# spacy
import spacy
# python -m spacy download en ## to download english model for spacy, do in cmd

# textblob
from textblob import TextBlob



In [4]:
# load spacy english model
nlp_spacy = spacy.load('en')

## connect to MongoDB for Dramabeans comments

In [5]:
# "C:\Program Files\MongoDB\Server\3.6\bin\mongod.exe" --dbpath "D:\Documents\Heidi\mongodb\data"

# Connection to Mongo DB
try:
    client = pymongo.MongoClient()
    print "Hooray, we have connected to MongoDB successfully!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e 

Hooray, we have connected to MongoDB successfully!


In [7]:
# connect to the dramabeans database
print client.database_names()
db = client.dramabeans_v4

[u'admin', u'config', u'dramabeans_v2', u'dramabeans_v3', u'dramabeans_v4', u'local', u'raw_dramabeans']


In [9]:
# collection names
print 'No. of shows: {}'.format(len(db.collection_names()))

No. of shows: 386


In [10]:
# for toy dataset

# # put all show names into a list
# shownames = db.collection_names()
# # print shownames

# # pick 3 shows to do initial playing
# # shownames = [u'circle', u'weightlifting fairy kim bok-ju', u'oh hae-young again']

# # randomly pick some shows
# num_shows = 3
# np.random.seed(75)
# show_ids = np.random.choice(range(len(db.collection_names())), size=num_shows, replace=False)
# shownames = [show for i, show in enumerate(shownames) if i in show_ids]

# print shownames

### Sentiment Analysis

Tried out the pre-trained models TextBlob and VADER (NLTK), VADER gave results that looked more promising, so decided to use VADER. 
1. TextBlob (TB)
2. VADER - nltk (V)

General idea, these packages have take into account:
- negation e.g. "not great" (TB & V)
- emoticons e.g. ":)" (TB & V)
- modifier words e.g. "very" (TB & V)
- punctuation & capitalisation e.g. "GREAT!!!" (V)

**Objectives: **
- find out how positive or negative each show is - sentiment analysis of recaps
- find descriptive words on each show - tragic/ lighthearted/ cute... etc.


### VADER

Resources:
- http://www.nltk.org/howto/sentiment.html
- http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html

In [13]:
# some exploration first
vader = SentimentIntensityAnalyzer()

vader_score = []
for comment in comments.Comment:
    vader_score.append(vader.polarity_scores(comment))
    
vader_score = pd.DataFrame(vader_score)
vader_score.columns = ['V_'+col for col in vader_score.columns]
comments = pd.concat([comments, vader_score], axis=1)
comments.sort_values('V_compound', ascending=False)

In [15]:
test_com = comments.Comment[94]
print test_com
print vader.polarity_scores(test_com)
print '------------'

test_com_sent = nltk.tokenize.sent_tokenize(test_com)

for sent in test_com_sent:
    print vader.polarity_scores(sent), sent

# better to go through sentence by sentence, filter out - good, bad, neutral?

I did laugh in places, I thought the end scenes there were cute and reminiscent of the show’s original flavour but I was ultimately unimpressed by the outcome. I am still looking for a Sageuk that really blows me away. I wasn’t expecting this to be that Sageuk, but it frustrates me that I CAN see that Saguek in its bone marrow, just that its only bone marrow, and not an actual skeleton, let alone a fully fleshed drama. Those tiny moments of its potential make me even more disappointed. I really enjoyed Hyun Shik’s acting for example, even if half his lines were flop worthy. I was surprised when I enjoyed the first couple of episodes, expecting it to flop from the start. In retrospect those episodes seem a long way away and yet I can’t say I’m surprised to be disappointed in the end.  (shit I wrote a lot haha props to anyone who reads all that)
{'neg': 0.136, 'neu': 0.704, 'pos': 0.16, 'compound': 0.5008}
------------
{'neg': 0.103, 'neu': 0.699, 'pos': 0.198, 'compound': 0.2144} I did 

### Approach

For each show, 

1. Sentiment Analysis: go through all the comments, bin each sentence into positive, negative and neutral.
2. Qualitative Feature Extraction: extract the adjective-noun phrases from all the comments
3. Quantitative Feature Extraction: for each category (listed below), calculate a category score (number of positive comments / sum of positive & negative comments):
    - acting
    - directing/ writing
    - music
    - relationships (e.g. romance, family, friendships)
    - ending (how satisfying was the ending?)
    - overall entertainment value

Output for each show:
1. lists of positive & negative comments
2. adjective-noun phrases
3. category scores (0-10)

References:
- https://nlp.stanford.edu/courses/cs224n/2007/fp/johnnyw-hengren.pdf
- http://www.statmt.org/OSMOSES/FeatureEx.pdf

### Define all functions

In [40]:
def bin_sentences(comments_list, threshold=0.3):
    '''bin comments into positive & negative by sentence'''
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader = SentimentIntensityAnalyzer()
    
    like_sent, dislike_sent, neutral_sent = [], [], []
    like_sent_score, dislike_sent_score, neutral_sent_score = [], [], []
    
    for comment in comments_list:
        comment_sents = nltk.tokenize.sent_tokenize(comment)
        for sent in comment_sents:
            score = vader.polarity_scores(sent)['compound']
#                     print score, sent
            if score > threshold:
                like_sent.append(sent)
                like_sent_score.append(score)
            elif score < - threshold:
                dislike_sent.append(sent)
                dislike_sent_score.append(score)
            else:
                neutral_sent.append(sent)
                neutral_sent_score.append(score)
    return like_sent, dislike_sent, neutral_sent, like_sent_score, dislike_sent_score, neutral_sent_score

In [41]:
def extract_phrases(text, buf=2):
    '''extract adjective-noun phrases from given text string'''
    
    # process into spaCy
    trial_spacy = nlp_spacy(text)
    
    # initialise holding variables
    phrases, phrases_ind, phrases_pos = [], [], []

    # iterate through each word
    for i, t in enumerate(trial_spacy):
        
        # identify adjectives that are not stop words
        if (t.pos_ == 'ADJ') and (t.is_stop == False):
            
            # initialise variables
            f=0
            found = 0
            start = i
            end = i
            
            # search the space around the adjective for nouns, 
            # expanding the search window by 1 each time
            while (f < buf) and (found == 0):
                # increment f
                f += 1

                # define search space, deal with start & end of sentence
                k= []
                if i-f < 0:
                    k = [0]
                else:
                    k = [i-f]
                if i+f >= len(trial_spacy):
                    k.append(len(trial_spacy)-1)
                else:
                    k.append(i+f)

                # check for nouns
                if (trial_spacy[k[0]].pos_ == 'NOUN') or (trial_spacy[k[0]].pos_ == 'PROPN'):
                    # if there is a hyphen, extend the phrase
                    try:
                        if ('-' in trial_spacy[k[0]-1].text):
                            start = k[0]-2
                            found = 1
                        else:
                            start = k[0]
                            found = 1
                    except:
                        start = k[0]
                        found = 1

                if (trial_spacy[k[1]].pos_ == 'NOUN') or (trial_spacy[k[1]].pos_ == 'PROPN'):
                    # if there is a hyphen or another noun, extend the phrase
                    try:
                        if (trial_spacy[k[1]+1].pos_=='PROPN') or (trial_spacy[k[1]+1].pos_=='NOUN'):
                            end = k[1]+1
                            found=1
                        elif ('-' in trial_spacy[k[1]+1].text) and ((trial_spacy[k[1]+2].pos_=='PROPN') or (trial_spacy[k[1]+2].pos_=='NOUN')):
                            end = k[1]+2
                            found = 1

                        else:
                            end = k[1]
                            found = 1

                    except:
                        end = k[1]
                        found = 1

        
            # if a noun was found, check if the word before the adjective is an adverb
            if (start == i) and (end != i):
                if (t.pos_ == 'ADJ') and (trial_spacy[i-1].pos_ == 'ADV'):
                    if i-1 >=0:
                        start = i-1
                # also check if word before is a hyphen, extend phrase if hyphen is found
                elif (u'-' in trial_spacy[i-1].text):
                    if i-2 >=0:
                        start = i-2
            
            # form list of phrases
            phrase_ind = range(start,end+1)
            phrase = [w.text for w in trial_spacy[start:end+1]]
            phrase_pos = [w.pos_ for w in trial_spacy[start:end+1]]

            if len(phrase_ind) == len(phrase) == len(phrase_pos):

                # exclude phrase if it cuts across punctuation marks like full stop or comma. exclude single words (no context)
                if (('-' in phrase) or ('PUNCT' not in phrase_pos)) and (len(phrase_pos)>1):

                    # if phrase overlaps with other phrases already collected, combine them
                    if phrases != []:
                        last_phrase = phrases[-1]
                        last_phrase_pos = phrases_pos[-1]
                        last_phrase_ind = phrases_ind[-1]
                        
                        if phrase_ind[0]-1 <= last_phrase_ind[-1]:
                            phrases[-1] = [w.text for w in trial_spacy[last_phrase_ind[0]:(phrase_ind[-1]+1)]]
                            phrases_pos[-1] = [w.pos_ for w in trial_spacy[last_phrase_ind[0]:(phrase_ind[-1]+1)]]
                            phrases_ind[-1] = range(last_phrase_ind[0], phrase_ind[-1]+1)

                        else:
                            phrases.append(phrase)
                            phrases_pos.append(phrase_pos)
                            phrases_ind.append(phrase_ind)

                    else: # just append the first phrase
                        phrases.append(phrase)
                        phrases_pos.append(phrase_pos)
                        phrases_ind.append(phrase_ind)
    
    return phrases, phrases_pos, phrases_ind
        

In [42]:
def get_phrases_nouns(sent_grp):
    '''get phrases and nouns from sentence grouping'''

    all_phrases = []
    all_nouns = []
    for text in sent_grp:
        phrases, phrases_pos, phrases_ind = extract_phrases(text)
    #     print phrases
        all_phrases += phrases
        # extract out all the nouns
        for i in range(len(phrases_pos)):
            if 'NOUN' in phrases_pos[i]:
                ind = phrases_pos[i].index('NOUN')
            elif 'PROPN' in phrases_pos[i]:
                ind = phrases_pos[i].index('PROPN')
#             print ind, phrases[i]

            all_nouns.append(phrases[i][ind])
    return all_phrases, all_nouns

In [43]:
def show_related_sent(keyword_list, sents, sent_scores):
    '''given keyword, display related sentences'''
    short_sent, short_sent_scores = [], []
    for i, sent in enumerate(sents):
        for keyword in keyword_list:
            if (keyword in sent.lower().split()) and (len(sent) < 500):
                short_sent.append(sent)
                short_sent_scores.append(sent_scores[i])
    df = pd.DataFrame({'sentence': short_sent, 'sentiment score': short_sent_scores}).drop_duplicates()
    if np.max(sent_scores) < 0:
        df_top = df.sort_values('sentiment score', ascending=True)
    else:
        df_top = df.sort_values('sentiment score', ascending=False)
    return df_top, df.shape[0]

In [44]:
def show_related_phrases(keyword_list, all_phrases):
    '''given keyword, display related phrases'''
    phrases_shortlist = []
    for phrase in all_phrases:
        phrase_str = ' '.join([w for w in phrase])
        for keyword in keyword_list:
            if keyword in phrase:
                phrases_shortlist.append(phrase_str)
    return pd.Series(phrases_shortlist).value_counts()

# d = show_related_phrases('acting', all_phrases)
# d.index.tolist()

In [45]:
terms_dict = {
    'writing_direction' : ['writing', 'pacing', 'storyline', 'plot', 'coherence', 'editing', 'directing', 'direction' 'cinematography'],
    'acting': ['act', 'actor', 'actress', 'acting', 'performance', 'cast'],
    'rs': ['romance', 'couple', 'ship', 'bromance', 'chemistry', 'triangle', 'OTP', 'kiss', 'cute', 'friend', 'family'],
    'ending': ['ending', 'resolution', 'finale'],
    'ent': ['entertaining', 'exciting', 'enjoyable', 'thrilling', 'suspense', 'heartwarming'\
                      'crack', 'addictive', 'comedy', 'comedic', 'charm'],
    'music': ['music', 'OST', 'soundtrack', 'sound'] }

In [46]:
def calc_score(pos_count, neg_count):
    '''calculate the show score (upon 10) based on number of positive and negative comments'''
    # only return score is there are at least 10 comments
    if pos_count + neg_count >=10:
        pos_ratio = pos_count / float(pos_count + neg_count) * 10
    else:
        pos_ratio = np.nan
    return pos_ratio

## Code

In [48]:
# load pickle order of shows from gensim 
show_order = pickle.load(open('tfidf_files/show_mapping.pkl', 'rb'))

for k, show in enumerate(show_order):
    print show

ma boy
falling for innocence
mad dog
to the beautiful you
vampire prosecutor 2
joseon x-files
beautiful gong shim
fashion king
mysterious il-seung
reunited worlds
smile
will it snow for christmas
oh my geum-bi
legend of the blue sea
bad guy
haeundae lovers
i’m not a robot
bad couple
pretty man
chicago typewriter
birth of a beauty
she was pretty
entourage
three dads, one mom
healer
superdaddy yeol
the time i’ve loved you
miss ripley
a love to kill
sword and flower
falsify
queen in-hyun’s man
seven day queen
personal taste
hero
girl who sees smells
punch
secret garden
i need romance 2012
nothing to lose
love rain
playful kiss
goodbye mr. black
tomorrow with you
twenty again
hidden identity
fantastic
three musketeers
prosecutor princess
pasta
manny
night light
who are you—school 2015
temperature of love
oh my venus
trot lovers
the man living in our house
running man
memory
defendant
athena
lucky romance
white christmas
the moon that embraces the sun
sly and single again
yong-pal
individua

In [49]:
# calculate the scores of the various categories, store in matrix
all_cat_scores = {}

# build comment vector for each show comprising the pos and neg comments only
all_pos_phrases = []
comm_vec_phrases = [] # to hold the comment adj-noun phrases
all_sentences = {} # key = show, value = dataframe of sentences


for k, show in enumerate(show_order):
    # compile all the comments from every episode of the show
    comments_list = []
    for ep in db[show].find():
        comments_list += pd.DataFrame(ep['Comments']).Comment.tolist()
    #     print len(comments_list)
    assert comments_list > 0
    
    # bin sentences
    like_sent, dislike_sent, neutral_sent, like_sent_score, dislike_sent_score, neutral_sent_score = bin_sentences(comments_list, threshold=0.3)
    sents_df = pd.DataFrame({'comments': like_sent+dislike_sent, 'sent_scores':like_sent_score+dislike_sent_score})
    
    # extract phrases & nouns
    all_phrases, all_nouns = get_phrases_nouns(like_sent+dislike_sent+neutral_sent)
    pos_phrases = get_phrases_nouns(like_sent)
    
    # get the scores of the various categories
    all_cat_scores[k] = {}
    comm_vec_phrases_show = []
    comm_vec_cat_show = []
    for cat, search_term in terms_dict.items():
        # phrases
        phrases = show_related_phrases(search_term, all_phrases)
        comm_vec_phrases_show += phrases.index.tolist()
        # sentences
        pos_sentiments, pos_count = show_related_sent(search_term, like_sent, like_sent_score)
        neg_sentiments, neg_count = show_related_sent(search_term, dislike_sent, dislike_sent_score)
        comm_vec_cat_show += pos_sentiments.sentence.tolist() + neg_sentiments.sentence.tolist()
        # scores
        all_cat_scores[k][cat] = calc_score(pos_count, neg_count)
        
    # aggregate the comments
    all_pos_phrases.append(pos_phrases)
    comm_vec_phrases.append(comm_vec_phrases_show)
    all_sentences[k] = sents_df
    print k,

# print once completed
print 'Completed'

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

In [22]:
# transpose the category score matrix
all_cat_scores_df = pd.DataFrame(all_cat_scores).T
all_cat_scores_df

Unnamed: 0,acting,ending,ent,music,rs,writing_direction
0,10.000000,,,,9.736842,
1,8.200000,6.428571,9.523810,8.181818,7.785714,6.274510
2,7.333333,6.428571,,,7.185185,5.625000
3,8.117647,8.000000,9.600000,6.923077,9.152542,6.774194
4,7.500000,6.923077,,,8.873239,7.500000
5,9.047619,7.619048,9.000000,,10.000000,6.956522
6,7.058824,8.000000,10.000000,7.692308,8.032129,6.750000
7,8.571429,,,,,6.500000
8,8.333333,,9.000000,,8.205128,9.523810
9,7.638889,7.200000,,7.692308,7.992832,6.969697


In [50]:
# check sentences collected
all_sentences[1].sort_values('sent_scores')

Unnamed: 0,comments,sent_scores
2538,"in joseon era dramas, hands down, we see many worse antagonist characters, lols.. as much as i dislike junhee, but i wonder, what will he do to hurt minho.. to hurt minho means to hurt soonjung.. oh dramaaa",-0.9618
2471,"So theoretically, what pushed Min Joon Gook and Lee Joon Hee to kill is because all their actions in the past years before they killed, the norms and values they were socialized to, gradually shaped them to be more and more violent until they 'snap' and kill.",-0.9562
2917,"Not that I necessarily agree with his particular attitude or his choice of battle plans for combating inequality, but I can understand how growing up poor and witnessing the inequality that exists between the owners of capital and those who work daily — sometimes extremely hard or incessantly for low wages...barely making enough to make ends meet — as an impressionable youth in a capitalistic and class society, can really take its toil and skew the mind toward destructive and irrational behavior.",-0.9548
2712,The whole crazy revenge of Min Ho centered around making his uncle suffer and taking.. destroying the company for which his father was betrayed.,-0.9509
2469,@KDaddictJCW I'm afraid I have to be an academia prick to answer the question of why people kill and why don't they think of anything but killing and harming others.,-0.9481
2412,"I'm getting a kick out of reading how both recappers are so annoyed by Joon H. I'd think that after so many years of watching KD and recapping, they'd be less affected, cos every KD has a villain, n villains do villainy things.",-0.9457
2996,"It has so much violence but it almost never surprises me because the show is about dangerous, violent people, living a risky life, that is what they expect to happen to them.",-0.9430
3138,"The Dong Wook+Soon Jung couple scenes were adorable, but I couldn't enjoy them that much when I was dreading the impending tragedy.",-0.9330
2601,"He's the ultimate villain with jihyun as the devil on his shoulder, whispering sweet evil to invoke his bad side.",-0.9313
2341,"Yes, threaten him, get beaten to a pulp, then tell him you are in front of the police station, and tell him the where-abouts of the disk, that way he can hurry up and sabotage or retrieve it by hurting or killing Min Ho whom you are relying on for the Big reveal, instead of quietly going about his own plan to expose him.",-0.9300


In [30]:
# check adjective-noun phrases extracted
comm_vec_phrases[0]

[u'quite cute high school romance',
 u'cute show',
 u'real kiss',
 u'really cute little drama',
 u'cute hijinks',
 u'drama seems cute',
 u'damn cute in Ma boy',
 u'hilarious ... love - romance',
 u'abc family',
 u'cute loveline',
 u'cute drama',
 u'cute bf',
 u'Their chemistry',
 u'show is cute',
 u'how cute is Kim So',
 u'really cute story',
 u'time cute twist LOL',
 u'cute plot',
 u'cute show in general',
 u'simple ending',
 u'definitive ending',
 u'lovely cast',
 u'light romantic comedy',
 u'decent comedic touch',
 u'light comedy',
 u'requisite comedic bits',
 u'enjoyable show',
 u'fanboys\u2019 plot is juvenile',
 u'emotional storyline',
 u'lazy writing',
 u'ongoing plot',
 u'serial plot',
 u'favorite plot devices',
 u'cute plot',
 u'girl usual storyline',
 u'laxative plot',
 u'kinda storyline',
 u'good directing']

In [28]:
# # pickle
version = 'v4-386'
pickle.dump(all_cat_scores_df, open('sentiment_files/{}_{}.pkl'.format('all_cat_scores_df', version), 'wb'))
pickle.dump(comm_vec_phrases, open('sentiment_files/{}_{}.pkl'.format('comm_vec_phrases', version), 'wb'))
pickle.dump(all_pos_phrases, open('sentiment_files/{}_{}.pkl'.format('all_pos_phrases', version), 'wb'))
pickle.dump(all_sentences, open('sentiment_files/{}_{}.pkl'.format('all_comment_sentences', version), 'wb'))

## What didn't work:

### 1. Calculating the cosine similarity of the categorized scores

I tried to calculate the cosine similarity based on the category scores:

In [23]:
# # fill all na values with -1 before calculating cosine similarity
# all_cat_scores_df1 = all_cat_scores_df.fillna(-1)

In [24]:
# # create cosine similarity matrix
# num_shows = all_cat_scores_df1.shape[0]
# cos_sim = np.zeros((num_shows, num_shows))

# for r in range(num_shows):
#     for c in range(num_shows):
#         x = all_cat_scores_df1.iloc[r,:].values.reshape(1, -1)
#         y = all_cat_scores_df1.iloc[c,:].values.reshape(1, -1)
#         cos_sim[r,c] = cosine_similarity(x,y)

In [25]:
# cos_sim_df = pd.DataFrame(cos_sim)
# cos_sim_df

This does not make sense logically... because I would not want to use category scores similarity to recommend shows...
e.g. if the show you liked had a poor score for acting, it doesn't make sense for me to recommend another show with poor acting.


### 2. Doc2Vec

Same logical flaw as calculating the similarity of category scores. e.g. if both shows have the comment "This show is awful!", we don't want to recommend awful shows because the show you liked was awful

Doc2Vec References:
- https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb
- https://ireneli.eu/2016/07/27/nlp-05-from-word2vec-to-doc2vec-a-simple-example-with-gensim/

In [None]:
# # combine all comments from 1 show into a para
# comm_vec_comb = [' '.join([sent for sent in show]) for show in comm_vec] # comm_vec, comm_vec_cat, comm_vec_phrases

In [None]:
# # tokenize
# tokenizer = RegexpTokenizer(r"[\w']+")
# tokenized = [tokenizer.tokenize(show)
#              for show in comm_vec_comb]

In [None]:
# # filter out stop words
# stop = stopwords.words('english')
# tokenized_c = [[word.lower() for word in show if word.lower() not in stop]
#                for show in tokenized]

In [None]:
# # construct the tagged document
# documents= []
# for i in range(len(tokenized_c)):
#     doc = doc2vec.TaggedDocument(tokenized_c[i], tags=[i])
#     documents.append(doc)

In [None]:
# # build the doc2vec model
# model = gensim.models.Doc2Vec(size=500, window=3, min_count=5, workers=3, iter=50)

# # build a vocab
# model.build_vocab(documents)

In [None]:
# # train the doc2vec model
# model.train(documents, total_examples=model.corpus_count, epochs=model.iter)

In [None]:
# # evaluate the model by inferring a vector from the trained model,
# # then checking if the model says that vector is most similar to itself
# # i.e. it's rank of similarity to itself is 0 (most similar)
# ranks = []
# second_ranks = []
# for doc_id in range(len(documents)):
#     inferred_vector = model.infer_vector(documents[doc_id].words)
#     sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
#     rank = [docid for docid, sim in sims].index(doc_id)
#     ranks.append(rank)
    
#     second_ranks.append(sims[doc_id])

# Counter(ranks) # perfect classification!

In [None]:
# # find most similar drama
# show_ind = 5
# user_phrase_vec = model.infer_vector(documents[show_ind].words)
# sims = model.docvecs.most_similar([user_phrase_vec], topn=10)
# indices = [ind for ind, sim in sims]
# for ind in indices:
#     print shownames[ind]


In [None]:
# Counter([' '.join(phrase) for phrase in all_phrases])

In [None]:
# # for fun, see if can search document corpus via free text?
# query_phrase = 'great writing, fantastic plot'

# user_phrase_vec = model.infer_vector(query_phrase.split())
# sims = model.docvecs.most_similar([user_phrase_vec], topn=10)
# indices = [ind for ind, sim in sims]
# for ind in indices:
#     print shownames[ind]

    
# prosecutor princess
# beautiful gong shim
# falling for innocence
# five fingers
# poseidon
# valid love
# hyde, jekyll, me
# sassy go go
# gu family book
# scent of a woman

### 3. Spacy's noun chunks
Did not perform as well as the adjective-noun pairs

In [3]:
# doc = nlp_spacy(like_sent[5])
# for chunk in doc.noun_chunks:
#     print(chunk.text) #, chunk.root.text, chunk.root.dep_, chunk.root.head.text)
# # really not any better than using adjective-noun phrases

### 4. Sentiment Analysis on Recaps
If we do sentiment analysis on recaps, getting 1 score on 1 recap, almost everything would be neutral, because the positives and negatives cancel each other out. 

Try to break the recaps down into sentences and get scores from that instead.

In [None]:
# # use 1 episode of 1 show to understand it better
# onerecap = db['hwarang'].find_one()['Recap']
# onerecap = nltk.tokenize.sent_tokenize(onerecap)
# recap_sent_score = []
# for sent in onerecap:
#     score = vader.polarity_scores(sent)
#     print score['compound'], sent
#     recap_sent_score.append(score)
# recap_sent_score = pd.DataFrame(recap_sent_score)

In [None]:
# # smoothing
# def smooth(scores, n=5):
#     '''smooth the scores'''
#     av = [scores[0] for i in range(n)]
#     sm = []
#     for s in scores:
#         av.append(s)
#         av.pop(0)
#         sm.append(np.mean(av))
#     return sm
# # smooth(recap_sent_score['compound'])

In [None]:
# # explore if there is a trend in sentiment score from sentence to sentence (aka. as the show progresses)
# recap_sent_score['smoothed'] = smooth(recap_sent_score['compound'], n=10)
# fig = plt.figure(figsize=(15,5))
# ax = fig.gca()
# recap_sent_score.plot(y='smoothed', ax=ax)
# plt.xlabel('Time')
# plt.ylabel('Sentiment Polarity')

In [None]:
# # how if we bin the scores to negative, neutral and positive?
# thres = 0.4
# recap_sent_score['sentiment'] = [-1 if score<-thres else 1 if score>thres else 0 for score in recap_sent_score['compound']]

# fig = plt.figure(figsize=(15,5))
# ax = fig.gca()
# recap_sent_score.plot(y='sentiment', ax=ax)
# plt.xlabel('Time')
# plt.ylabel('Sentiment Polarity')
# # even worse. lol

#### Textblob sentiment analysis on recaps

In [None]:
# # TextBlob, because, why not?
# pol = []
# subj = []
# for sent in onerecap:
#     tb = TextBlob(sent)
#     subj.append(tb.subjectivity)
#     pol.append(tb.polarity)
# sentiment = pd.DataFrame({'polarity': pol, 'subjectivity': subj, ' sentence': onerecap})
# sentiment
# # still terrible. confirmed.