In [2]:
# for mongodb
import pymongo

# normal stuff
import pandas as pd
import numpy as np
import cPickle as pickle
import matplotlib.pyplot as plt
%matplotlib inline

from collections import defaultdict, Counter
import string
import re
import codecs

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# gensim
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim import similarities

# spacy
import spacy
# python -m spacy download en ## to download english model for spacy, do in cmd




In [2]:
# load spacy english model
nlp_spacy = spacy.load('en')

## Connect to Dramabeans database
Connect to the Dramabeans database set up in p02-scrape-dramabeans.ipynb



In [3]:
# "C:\Program Files\MongoDB\Server\3.6\bin\mongod.exe" --dbpath "D:\Documents\Heidi\mongodb\data"

# Connection to Mongo DB
try:
    client = pymongo.MongoClient()
    print "Hooray, we have connected to MongoDB successfully!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e 

Hooray, we have connected to MongoDB successfully!


### clean up database

so that we don't mess up the original scraped data, create a fresh working copy. Connect to this copy, clean up issues. There are a total of 386 shows in the database, and a total of 4,975 episodes worth of data.

In [4]:
# make a new copy to work on
# client.admin.command('copydb', fromdb='dramabeans_v2', todb='dramabeans_v3')

In [4]:
# connect to the dramabeans database
print client.database_names()
db = client.dramabeans_v4

[u'admin', u'config', u'dramabeans_v2', u'dramabeans_v3', u'dramabeans_v4', u'local', u'raw_dramabeans']


In [6]:
# # clean up the replicated drama - same drama, 2 names
# for doc in db[u'who are you—school 2015'].find():
#     db[u'who are you–school 2015'].insert_one(doc)
    
# db.drop_collection(u'who are you–school 2015')

In [5]:
# collection names
print 'No. of shows: {}'.format(len(db.collection_names()))

No. of shows: 386


In [9]:
# total number of documents (aka. total number of episodes of data)
counter = 0
for collection in db.collection_names():
    counter += db[collection].count()
print 'Total number of documents: {}'.format(counter)

Total number of documents: 4975


In [19]:
# total number of comments
num_comments = 0
for collection in db.collection_names():
    for ep in db[collection].find():
        num_comments += ep['Num_Comments']
print 'Total number of comments in database: {}'.format(num_comments)

Total number of comments in database: 645188


In [11]:
# put all show names into a list
shownames = db.collection_names()
# print shownames

# pick 3 shows to do initial playing
# shownames = [u'circle', u'weightlifting fairy kim bok-ju', u'oh hae-young again']

# randomly pick some shows
num_shows = 386
np.random.seed(58)
show_ids = np.random.choice(range(len(db.collection_names())), size=num_shows, replace=False)
shownames = [show for i, show in enumerate(shownames) if i in show_ids]

print shownames

[u'a gentleman\u2019s dignity', u'last', u'green rose', u'joseon x-files', u'drinking solo', u'love & marriage', u'dream', u'manny', u'smile', u'dream high 2', u'you from another star', u'i need romance', u'jang ok-jung, live by love', u'jugglers', u'splish splash love', u'medical top team', u'style', u'i\u2019m not a robot', u'witch amusement', u'ma boy', u'i am legend', u'when a man loves', u'the moon that embraces the sun', u'i miss you', u'school 2013', u'pied piper', u'the lonely shining goblin', u'man who dies to live', u'miss ripley', u'arang and the magistrate', u'beautiful gong shim', u'ad genius lee tae-baek', u'painter of the wind', u'she was pretty', u'five fingers', u'answer me 1994', u'boys before flowers', u'story of a man', u'fashion king', u'history of the salaryman', u'bride of the water god 2017', u'radiant office', u'uncontrollably fond', u'gu family book', u'wanted', u'woman with a suitcase', u'nothing to lose', u'heart to heart', u'hwarang', u'age of youth', u'iri

## build corpus
Build a corpus where each entry is the recaps of all the episodes of that show.

In [12]:
# put all the recaps into a dictionary
recaps = defaultdict(str)
for show in shownames:
    for doc in db[show].find():
        recaps[show] += doc['Recap']

recaps['circle']



## tokenisation
Tokenize the corpus

In [10]:
# tokenize
shows = recaps.keys()

tokenized = [nltk.tokenize.word_tokenize(recap)
             for recap in recaps.values()]

## NER

There is a need to remove the character names, otherwise all the topics identified will just contain names. Use Named Entity Recognition (NER) to form a list of stop words to exclude for each show. Include caps so that we filter out "Young" (Korean name) but not "young".

Since training an NER is a project in itself (have to manually identify NER in string), try some out-of-the-box NERs.

Models:
1. nltk: ne_chunk (nltk's current best NER)
2. spaCy

### NER using NLTK

In [13]:
# tokenize
shows = recaps.keys()

tokenized = [nltk.tokenize.word_tokenize(recap.encode('utf-8').decode(errors='ignore'))
             for recap in recaps.values()]
# print shows
# print len(tokenized)
# print len(tokenized[0]), len(tokenized[1])
tagged = [nltk.pos_tag(recap) for recap in tokenized]

In [None]:
# because the NLTK NER cannot handle special characters, have to remove them
weird_utf =  [u'\u2014', u'\u2019', u'\u201c', u'\u201d', u'\u2665']
tagged_clean = [[(word_tup[0].encode('utf-8').decode(), word_tup[1]) for word_tup in recap
               if word_tup[0] not in weird_utf]
               for recap in tagged ]
tagged_clean[0]
entities = [[nltk.chunk.ne_chunk(recap, binary=False)] for recap in tagged_clean] 
# if binary=False, classify into GPE, PERSON, ORGANIZATION... etc.

# save named entities in list, all_ne_nltk
all_ne_nltk = []
for entity in entities:
    set_names = []
    for sent in entity:
        for chunk in sent:
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                set_names.append([ne for tree in chunk for ne in tree])
    all_ne_nltk.append( set([word.lower() for item in set_names for word in item if word != 'NNP']))
    
all_ne_nltk

[{u'adams',
  u'adobe',
  u'afterward',
  u'alas',
  u'amanda',
  u'arts',
  u'baby',
  u'bernard',
  u'boy',
  u'bynes',
  u'chi',
  u'cut',
  u'daehan',
  u'delete',
  u'dont',
  u'download',
  u'duff',
  u'dumb',
  u'fantastic',
  u'flash',
  u'funny',
  u'gahhh',
  u'grar',
  u'hes',
  u'highs',
  u'hilary',
  u'idiot',
  u'ill',
  u'irene',
  u'irenes',
  u'itll',
  u'jang',
  u'kang',
  u'kim',
  u'ma',
  u'nearby',
  'nnps',
  'nns',
  u'omg',
  u'paparazzo',
  u'pesky',
  u'player',
  u'reporter',
  u'schoolyard',
  u'scroll',
  u'shes',
  u'snerk',
  u'so-hyun',
  u'sorry',
  u'stab',
  u'tackle',
  u'tackles',
  u'thats',
  u'thinkin',
  u'tooniverse',
  u'trio',
  u'twinkle',
  u'uncle',
  u'wait',
  u'which'},
 {u'administration',
  u'adobe',
  u'adorbs',
  u'adult',
  u'afterward',
  u'ahhh',
  u'always',
  u'angry',
  u'anyhow',
  u'are',
  u'aww',
  u'b',
  u'bad',
  u'blearily',
  u'cant',
  u'captain',
  u'ceo',
  u'chairman',
  u'club',
  u'coldplay',
  u'cosmetics',


### NER using spaCy

In [None]:
# set up the list holding all the persons identified
all_persons = []

# loop through each show, identify named entities - persons
for show_num in range(len(recaps)):
    # load each show's recaps into spacy nlp model
    recap = nlp_spacy(recaps.values()[show_num]) 
    persons = {ent.text.replace('.', '').strip() for ent in recap.ents 
               if (ent.label_ in ['PERSON', 'GPE', 'ORG', 'NORP', 'FACILITY', 'PRODUCT', 'WORK_OF_ART']) 
               and not (ent.text.isspace())}
    # make lowercase, don't use words with spaces or with special characters
    persons = {part.lower() for word in persons for part in word.split('-') if part.isalpha()}
    m_persons = list(persons) + [word+'s' for word in persons]
    all_persons.append(m_persons)

# this is the names that we should exclude

#### Combine NLTK & spaCy NER for best results
After trying out both, it seems like they both miss things out - it was best to combine them

In [None]:
# combine both nltk and spacy NER for best results!!
name_stop = [list(set(show + list(all_ne_nltk[i]))) for i, show in enumerate(all_persons)]

### more preprocessing

POS tagging, lemmatization, cleaning, forming bigram tokens

In [None]:
# define function to convert POS tag into a form that Wordnet can recognise
def pos_to_wordnet(tag):
    '''convert from nltk tag to wordnet tag'''

    if tag.startswith('J'):
        wn = 'a'
    elif tag.startswith('R'):
        wn = 'r'
    elif tag.startswith('V'):
        wn = 'v'
    else:
        wn = 'n' # the default
    return wn

In [None]:
# remove stopwords, punctuation, remove words that only have 1 letter, remove names, convert POS tag from NLTK to wordnet format
stop = stopwords.words('english')
stop = list(set(stop + [word.replace("'", '') for word in stop])) + ['thats', 'hed', 'hes'] # remove the apostrophes, append to stop words list
tokenized_clean = [[(word[0].strip().lower(), pos_to_wordnet(word[1])) for word in show 
                    if ((word[0].strip().lower() not in stop) and (word[0].lower() not in list(name_stop[i]))) # use all_ne_nltk OR all_persons
                    if (word[0].isalpha()) and (len(word[0]) >2)] # or (('-' in word) and word !='-')
                    for i, show in enumerate(tagged)]
# print len(tokenized_clean)
# print len(tokenized_clean[0]), len(tokenized_clean[1])
# consider only removing names for that particular show....

In [None]:
# lemmatise - choose lemmatizer instead of stemmer as stemming is too harsh, 
# especially for korean words which we want to keep intact
stemmer = WordNetLemmatizer()
tokenized_clean_lem = [[stemmer.lemmatize(word[0], word[1]) for word in show] 
                        for show in tokenized_clean]
# print len(tokenized_clean_lem[0]), len(tokenized_clean_lem[1])

# after lemmatisation, check for stop words again
tokenized_clean_lem = [[word for word in show if word not in stop]
                       for show in tokenized_clean_lem]

In [None]:
# create bigrams too
bigrms = [list(nltk.bigrams(show)) for show in tokenized_clean_lem]
bigrms = [[' '.join(list(bi)) for bi in show] for show in bigrms]
bigrms_clean_lem = [show + tokenized_clean_lem[i] for i, show in enumerate(bigrms)]

### GENSIM
Explore several methods to do topic modelling using GENSIM. 

In [None]:
# create gensim dictionary & bag of words corpus - FOR BIGRAMS
dictionary = Dictionary(bigrms_clean_lem)
corpus = [dictionary.doc2bow(show) for show in bigrms_clean_lem]

In [None]:
def topwords(bow, dictionary, num_words=10):
    '''displays the top number of words as specified by num_words 
    using the bow (word_id, count/freq)'''
    
    bow_doc = sorted(bow, key=lambda w: w[1], reverse=True)
    for word_id, word_count in bow_doc[:num_words]:
        print dictionary.get(word_id), word_count

#### Method 1: bag-of-words
As expected, themes identified sucked. There are many recurring words in a recap. e.g. "asks", "like", "back", etc.

In [None]:
# # using basic bag-of-words: print 5 most common words
# for i, show in enumerate(shows):
#     print '------', show.upper()
#     topwords(corpus[i], dictionary, num_words=5)

#### Method 2: TF-IDF
TF-IDF = term frequency - inverse document frequency

term frequency = raw count of a term in a document
inverse doc frequency = log of total no of documents in corpus / number of documents where term appears

Because it filters out words that occur frequently over the entire corpus, it addresses the problem with bag-of-words.

- common words are penalised, rare words gain importance

Actually this was pretty good! 

In [None]:
%%time 
# tf-idf
tfidf = TfidfModel(corpus)
for i, show in enumerate(shows):
    print '------', show.upper()
    topwords(tfidf[corpus[i]], dictionary, num_words=10)
    
# getting a lot of names of people... need to remove them - removed via NER
# should tune word lemmatizer
# should tune NER, consider n-grams - e.g. serial & killer should be serial killer

In [None]:
# compare similarity of docs
index = similarities.MatrixSimilarity(tfidf[corpus])

In [None]:
df_sims = pd.DataFrame(list(index))

In [None]:
sims = pd.DataFrame({'shownames': shows, 'similarity':index[tfidf[corpus[0]]]})
sims.sort_values('similarity', ascending=False)
# for i in range(num_shows):
#     print sims[i], shownames[i]

In [None]:
# save as a pickle file
version = 'v8-386-bigram'
to_pickle = ['df_sims', 'corpus', 'tfidf', 'dictionary', 'shows']

for var in to_pickle:
    pickle.dump(eval(var), open('tfidf_files/{}_{}.pkl'.format(var, version), 'wb'))



In [None]:
# save show indexing as pickle file
pickle.dump(sims.shownames, open('tfidf_files/show_mapping.pkl', 'wb'))

### Method 3: LDA

not so great for this application, because:
- not able to predict how many topics ahead of time
- the topics are not named, just grouped
- not able to get features of a particular show

In [None]:
# %%time 
# # LDA - over all shows - doesn't work very well, even when we put in the tf-idf bow
# tfidf_corpus = tfidf[corpus]
# lda = LdaModel(tfidf_corpus, num_topics=num_shows, id2word=dictionary, passes=20)

# # print
# for topic in lda.print_topics():
#     print topic

In [None]:
# %%time 
# # compare with LDA multicore
# ldamulti = LdaMulticore(corpus, num_topics=3, id2word=dictionary, passes=50, workers=3)

# # doesn't seem like it is faster at all... O.O KIV for next time

# # print
# print ldamulti.print_topics()

In general, TF-IDF looks best. Choose to go with it for feature extraction of the theme of a particular show.