In [1]:
# Unicode Handling
from __future__ import unicode_literals
import codecs, json
import pandas as pd
import numpy as np
import gensim

# spacy is used for pre-processing and traditional NLP
import spacy
from spacy.en import English

# Gensim is used for LDA and word2vec
from gensim.models.word2vec import Word2Vec

Importing the stumbleupon dataset from last week

In [2]:
data = pd.read_csv("../../lesson-11/code/data/stumbleupon.tsv", sep='\t', encoding="utf-8")
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))

### CountVectorizer
The CountVectorizer returns a matrix where each row is a document, and the column values indicate whether and how many times the term for that column appears in the document

http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=False, stop_words='english', min_df=3)

docs = cv.fit_transform(data.body.fillna(""))

# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))
# id2word2 = dict((v, k) for k, v in cv.vocabulary_.iteritems())

In [4]:
docs

<7395x27283 sparse matrix of type '<type 'numpy.int64'>'
	with 995549 stored elements in Compressed Sparse Row format>

In [5]:
print(docs)

  (0, 10986)	1
  (0, 24886)	1
  (0, 20514)	1
  (0, 8543)	1
  (0, 16572)	2
  (0, 21110)	1
  (0, 23470)	2
  (0, 20381)	1
  (0, 6124)	2
  (0, 13551)	1
  (0, 8045)	1
  (0, 26998)	1
  (0, 4381)	1
  (0, 24663)	1
  (0, 19378)	1
  (0, 16653)	1
  (0, 24270)	1
  (0, 19491)	1
  (0, 18853)	1
  (0, 9722)	1
  (0, 24735)	1
  (0, 16206)	1
  (0, 4273)	1
  (0, 2881)	1
  (0, 3310)	1
  :	:
  (7394, 1634)	1
  (7394, 15974)	1
  (7394, 22186)	1
  (7394, 13326)	3
  (7394, 2172)	1
  (7394, 14007)	1
  (7394, 489)	3
  (7394, 8541)	4
  (7394, 10729)	3
  (7394, 24063)	8
  (7394, 12539)	4
  (7394, 22135)	6
  (7394, 18271)	3
  (7394, 15746)	1
  (7394, 15972)	2
  (7394, 161)	1
  (7394, 394)	1
  (7394, 3061)	1
  (7394, 13339)	1
  (7394, 23050)	4
  (7394, 5680)	3
  (7394, 19448)	1
  (7394, 15977)	3
  (7394, 18284)	1
  (7394, 11149)	1


In [7]:
docs.todense()

matrix([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
docs.todense().shape

(7395, 27283)

In [9]:
id2word

{0: u'00',
 1: u'000',
 2: u'000000',
 3: u'001',
 4: u'007',
 5: u'00am',
 6: u'00pm',
 7: u'01',
 8: u'01pm',
 9: u'02',
 10: u'0206790666',
 11: u'025',
 12: u'03',
 13: u'04',
 14: u'044',
 15: u'05',
 16: u'06',
 17: u'0674921071',
 18: u'07',
 19: u'075',
 20: u'0782835788',
 21: u'08',
 22: u'09',
 23: u'0g',
 24: u'0http',
 25: u'0px',
 26: u'0s',
 27: u'0sodium',
 28: u'10',
 29: u'100',
 30: u'1000',
 31: u'100000000000000000',
 32: u'1000000000000000000',
 33: u'1000px',
 34: u'1000s',
 35: u'1001',
 36: u'10013',
 37: u'100g',
 38: u'100k',
 39: u'100m',
 40: u'100ml',
 41: u'100px',
 42: u'100th',
 43: u'101',
 44: u'10184',
 45: u'102',
 46: u'1024',
 47: u'103',
 48: u'1034',
 49: u'1036',
 50: u'104',
 51: u'105',
 52: u'10522',
 53: u'10529',
 54: u'106',
 55: u'107',
 56: u'108',
 57: u'1080p',
 58: u'109',
 59: u'1090',
 60: u'10am',
 61: u'10g',
 62: u'10km',
 63: u'10lbs',
 64: u'10m',
 65: u'10mm',
 66: u'10oz',
 67: u'10pm',
 68: u'10px',
 69: u'10th',
 70: u'11'

In [11]:
id2word[3310]

u'bills'

In [12]:
#data.body[0]

### Gensim
https://radimrehurek.com/gensim/

In [13]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus

# First we convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)

num_topics = 15

# Then we fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

In [14]:
num_words_per_topic = 20
for ti, topic in enumerate(lda_model.show_topics(num_topics = num_topics, num_words = num_words_per_topic)):
    print("Topic: {} \n{}\n".format(ti, topic))

Topic: 0 
(0, u'0.012*"chocolate" + 0.012*"cup" + 0.012*"butter" + 0.010*"sugar" + 0.009*"minutes" + 0.009*"recipe" + 0.009*"baking" + 0.008*"add" + 0.007*"cream" + 0.007*"make" + 0.007*"bowl" + 0.006*"oven" + 0.006*"cake" + 0.006*"flour" + 0.006*"just" + 0.006*"dough" + 0.005*"pan" + 0.005*"mixture" + 0.005*"vanilla" + 0.005*"cookies"')

Topic: 1 
(1, u'0.005*"new" + 0.004*"said" + 0.004*"2010" + 0.004*"world" + 0.004*"just" + 0.004*"10" + 0.004*"sports" + 0.004*"2009" + 0.003*"news" + 0.003*"like" + 0.003*"year" + 0.003*"time" + 0.003*"11" + 0.003*"2008" + 0.003*"12" + 0.003*"2011" + 0.003*"2012" + 0.002*"best" + 0.002*"people" + 0.002*"2007"')

Topic: 2 
(2, u'0.007*"cancer" + 0.006*"swimsuit" + 0.006*"skin" + 0.006*"si" + 0.005*"health" + 0.004*"care" + 0.004*"models" + 0.003*"new" + 0.003*"like" + 0.003*"time" + 0.003*"cells" + 0.003*"people" + 0.003*"illustrated" + 0.003*"use" + 0.003*"just" + 0.003*"sports" + 0.002*"blood" + 0.002*"stem" + 0.002*"help" + 0.002*"medical"')

Topic

In [15]:
for num, doc_topics in enumerate(lda_model.get_document_topics(corpus)):
    print("Row: {} Topics: {}".format(num, doc_topics))
    if num > 10:
        break

Row: 0 Topics: [(1, 0.75918729614994973), (6, 0.090882746959243199), (7, 0.035416576153582666), (13, 0.10497649130916581)]
Row: 1 Topics: [(1, 0.85655282617158135), (6, 0.069011691968632369), (13, 0.071285868010988754)]
Row: 2 Topics: [(2, 0.15974527175785408), (3, 0.062022994034052668), (10, 0.088731534627191966), (13, 0.68381543561646585)]
Row: 3 Topics: [(1, 0.050752331827460098), (3, 0.43491831684221927), (13, 0.51110353887082483)]
Row: 4 Topics: [(1, 0.57758381652284752), (3, 0.01110947574868322), (5, 0.31229925072147885), (7, 0.029151224751857586), (8, 0.051952133461489761)]
Row: 5 Topics: [(2, 0.99498207047993514)]
Row: 6 Topics: [(1, 0.40424912810100155), (8, 0.50496314230166306), (10, 0.066475299434717555), (11, 0.016593116943847828)]
Row: 7 Topics: [(6, 0.36664752155644886), (13, 0.62658163296187663)]
Row: 8 Topics: [(1, 0.17702556486937554), (4, 0.08263498153652149), (8, 0.48586621612053965), (14, 0.24410618465890685)]
Row: 9 Topics: [(0, 0.97361797519074067), (4, 0.02337275

In [16]:
data.body[9]

u"More brownies It seems that I can t get through one full week without trying a new brownie combination Just wait until you see the next one I am going to experiment with These are super simple I used my go to brownie recipe and just added a few Oreos You could even do the same with boxed brownies Just make the frosting and add some more cookies I only frosted a few because I just don t love frosting on brownies Mr How Sweet really enjoyed the frosted ones If you waste a good 3 4 of your day on Twitter like I do you will know that last night Mr How Sweet brought me home a dozen hot pink roses I didn t make him dinner my half unpacked mess is strewn in 4 different rooms and I was in holey sweat pants once he got home Oh yes and I was half asleep while he professed his undying love to me And I just continued to lay there Sounds eerily similar to the night he proposed Good thing there were brownies on the counter As my dad says marriage is rarely 50 50 The cookies somewhat melted into th

### Word2Vec 

https://radimrehurek.com/gensim/models/word2vec.html

- size: dimensionality of vectors 
- window: maximum distance between current and predicted word in sentence
- min_count: ignore words with total frequency less than this
- workers: number of worker threads

In [17]:
# Setup the body text
text = data.body.fillna("").map(lambda x: x.split())

from gensim.models import Word2Vec
model = Word2Vec(text, size=100, window=5, min_count=5, workers=4)

In [19]:
text.head()

0    [A, sign, stands, outside, the, International,...
1    [And, that, can, be, carried, on, a, plane, wi...
2    [Apples, The, most, popular, source, of, antio...
3    [There, was, a, period, in, my, life, when, I,...
4    [Jersey, sales, is, a, curious, business, Whet...
Name: body, dtype: object

In [20]:
model.most_similar(positive=['cookie', 'brownie'])

[(u'cupcake', 0.9163855314254761),
 (u'crust', 0.838455080986023),
 (u'pie', 0.8346596956253052),
 (u'cake', 0.8325323462486267),
 (u'cheesecake', 0.8286693096160889),
 (u'cakes', 0.8167980909347534),
 (u'icing', 0.816005289554596),
 (u'granola', 0.8104453682899475),
 (u'candy', 0.8082937002182007),
 (u'buttercream', 0.8036230206489563)]

### Twitter Exercise 

In [21]:
# Loading the tweet data
filename = 'data/captured-tweets.txt'
tweets = []
for tweet in codecs.open(filename, 'r', encoding="utf-8"):
    tweets.append(tweet)
# Setting up spacy
nlp = English()

### Exercise 1a

Write a function that can take a take a sentence parsed by `spacy` and identify if it mentions a company named 'Google'. Remember, `spacy` can find entities and codes them as `ORG` if they are a company. Look at the slides for class 13 if you need a hint:

### Bonus (1b)

Parameterize the company name so that the function works for any company.

In [25]:
def mentions_company(parsed):
    for entity in parsed.ents:
        if entity.text == "Google" and entity.label_ == 'ORG':
            return True
    return False

# 1b

def mentions_company(parsed, company='Google'):
    for entity in parsed.ents:
        if entity.text == company and entity.label_ == 'ORG':
            return True
    return False

### Exercise 1c

Write a function that can take a sentence parsed by `spacy` 
and return the verbs of the sentence (preferably lemmatized)

In [26]:
def get_actions(parsed):
    actions = []
    for el in parsed:
        if el.pos == spacy.parts_of_speech.VERB:
            actions.append(el.text)
    return actions

### Exercise 1d
For each tweet, parse it using spacy and print it out if the tweet has 'release' or 'announce' as a verb. You'll need to use your `mentions_company` and `get_actions` functions.

In [27]:
for tweet in tweets:
    parsed = nlp(tweet)
    if mentions_company(parsed, 'Google'):
        actions = get_actions(parsed)
        if 'release' in actions or 'announce' in actions:
            print(tweet)

### Exercise 1e
Write a function that identifies countries - HINT: the entity label for countries is GPE (or GeoPolitical Entity)



In [28]:
def mentions_country(parsed, country):
    for entity in parsed.ents:
        if entity.text == country and entity.label_ == 'GPE':
            return True
    return False

### Exercise 1f

Re-run (d) to find country tweets that discuss 'Iran' announcing or releasing.


In [29]:
for tweet in tweets:
    parsed = nlp(tweet)
    if mentions_country(parsed, 'Iran'):
        actions = get_actions(parsed)
        if 'release' in actions or 'announce' in actions:
            print(tweet)

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…

GOBE! Iran warns Nigeria to release Shiite leader El-Zakzaky - SEE https://t.co/TRshnC6sVU

GOBE! Iran warns Nigeria to release Shiite leader El-Zakzaky - SEE https://t.co/SlvcQtk3vE

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…

Hhmmm. Iran claiming to have 'warned Nigeria' to release detained Shiite leader.... @afalli

RT @cerenomri: "Literally every US ally in Mideast is on brink of hot war w/ Iran, so we're going to release $100 billion to Iran this mont…



### Exercise 2
Build a word2vec model of the tweets we have collected using gensim.
First take the collection of tweets and tokenize them using spacy.

### Exercise 2a:
* Think about how this should be done. 
* Should you only use upper-case or lower-case? 
* Should you remove punctuations or symbols? 

In [30]:
text_split = [[x.text if x.pos != spacy.parts_of_speech.VERB else x.lemma_ for x in nlp(t)] for t in tweets]

### Exercise 2b:
Build a word2vec model.
Test the window size as well - this is how many surrounding words need to be used to model a word. What do you think is appropriate for Twitter? 

In [31]:
model = Word2Vec(text_split, size=100, window=4, min_count=5, workers=4)

### Exercise 2c:
Test your word2vec model with a few similarity functions. 
* Find words similar to 'Syria'.
* Find words similar to 'war'.
* Find words similar to "Iran".
* Find words similar to 'Verizon'. 



In [32]:
model.most_similar(positive=['Syria'])

[(u'opposition', 0.9989997148513794),
 (u'Russia', 0.997747004032135),
 (u'casualties', 0.9965444803237915),
 (u'Ads', 0.9961587190628052),
 (u'movements', 0.9960920810699463),
 (u'Iran', 0.9959622621536255),
 (u'democractic', 0.9959388971328735),
 (u'/', 0.9955971240997314),
 (u'must', 0.99542236328125),
 (u'cartoon', 0.9953904151916504)]

### Exercise 2d

Adjust the choices in (b) and (c) as necessary.


### Exercise 3

Filter tweets to those that mention 'Iran' or similar entities and 'war' or similar entities.
* Do this using just spacy.
* Do this using word2vec similarity scores.

In [33]:
# Using spacy
for tweet in tweets:
    parsed = nlp(tweet)
    if mentions_country(parsed, 'Iran') or mentions_country(parsed, 'Iraq'): # ... you could add more
        if 'attack' in get_actions(parsed):
            print(tweet)

In [39]:
# Using word2vec similarity scores
for tweet in tweets[:200]:
    parsed = nlp(tweet)
    similarity_to_iran = max([model.similarity('Iran', tok.text) for tok in parsed if tok.text in model.wv.vocab])
    similarity_to_war = max([model.similarity('war', tok.text) for tok in parsed if tok.text in model.wv.vocab])
    if similarity_to_iran > 0.9 and similarity_to_war > 0.9:
        print(similarity_to_iran, similarity_to_war, tweet)
        #print tweet

(0.9899881542169926, 0.99755790073547113, u'I made a(n) Small Tourmaline in Paradise Island! https://t.co/cAoW1b6DRc #Gameinsight #Androidgames #Android\n')
(0.98579101640126132, 0.99724397695883105, u'RT @PURELOVEBEAST: -\u0e40\u0e0a\u0e47\u0e04\u0e23\u0e32\u0e22\u0e25\u0e30\u0e40\u0e2d\u0e35\u0e22\u0e14- 27th BIRTHDAY SPECIAL GOODS - 3D YOSEOP USB\n')
(0.98556919747726068, 0.99451391608419959, u'https://t.co/EOfBdVQUfO\n')
(0.98556919747726068, 0.99478210221187491, u'@ViGiGu google it :) simple\n')
(0.99144206136722079, 0.99878837839975498, u'nerd ass girl  https://t.co/T7kDirxPEL\n')
(0.99144206136722079, 0.99901895247679173, u'LeadCorp Media @leadcorpmedia_  https://t.co/vRJG9Xnzw8\n')
(0.98873754646251555, 0.99895179866077188, u'RT mackdrama1017: ChieMoney use google and learn how to!\n')
(0.98556919747726068, 0.99932113639084963, u'@ShaffieWeru Morning bro here is my new video i need yr help im talented bt i lack a manager bro https://t.co/NTs3QM5YU5\n')
(0.99139136428588204, 0.9

(0.99144206136722079, 0.99908779294393923, u'RT @TheMoneyGenie: Why People Un-Follow You On Social Media Platforms  quick cash https://t.co/AlwvHy0mkz https://t.co/d3xZ63Xj8j\n')
(0.9907624376623354, 0.99918085946428237, u"@Mrkenneyy I saw ur tweet no need for Google lol it's a very large dog\n")
(0.98868037827833433, 0.99856631730883882, u'New Google Chat App Could Destroy Facebook Messenger https://t.co/71Cu1vKcOI #google\n')
(0.98834851516985611, 0.99697354752185652, u'RT @lexicaaan: Me af \U0001f602\U0001f602\U0001f602 https://t.co/fUw8UrVIV0\n')
(0.98973055475056348, 0.99732074484448641, u'#RADIO #90s #juice Now Playing #Breathe (Radio Edit) #MIDGE URE #APPS Apple https://t.co/gsMf7U2vzO Google https://t.co/idypaT7UqA\n')
(0.98556919747726068, 0.99661213686896288, u'@sebast_lj @McFaul @Inna4848 \'cause google translates it as "return to California"\n')
(0.98561294076224293, 0.99774009380547257, u'My Google Play account makes me realize I was a dysfunctional teen with deep emotiona