In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [196]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
from utils.common.db_utils import read_all_results

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = SnowballStemmer('english',ignore_stopwords=True)

In [4]:
config = dict(parameters={},DEFAULT={})
config["parameters"]["input_db"] = "tier-1"
config["DEFAULT"]["tier-1"] = "/Users/jklinger/Nesta/nesta_dataflow/db_config/tier-1.cnf"
config["parameters"]["input_table"] = "course_descriptions"
results = read_all_results(config,"input_db","input_table")
doc_set = [r[1] for r in results]

  result = self._query(query)


In [379]:
def generate_corpus(doc_set,extra_stops=[]):
    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for doc in doc_set:
        if doc is None:
            continue
        # clean and tokenize document string
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        
        # remove stop words from tokens
        tokens = [t for t in tokens if t not in en_stop]

        # stem tokens
        tokens = [p_stemmer.stem(s) for s in tokens]
        before = len(tokens)
        
        tokens = [t for t in tokens if t not in extra_stops]
        after = len(tokens)
        
        diff = before - after
        if diff > 0:
            print(diff)
        
        # add tokens to list
        texts.append(tokens)

    print("Got",len(texts))

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    return texts,corpus,dictionary

In [211]:
# generate LDA model
texts,corpus,dictionary = generate_corpus(doc_set)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)

In [219]:
ldamodel.print_topics(num_topics=30)

[(0,
  '0.030*"water" + 0.024*"english" + 0.022*"environment" + 0.009*"anim" + 0.009*"univers" + 0.009*"state" + 0.008*"world" + 0.008*"market" + 0.008*"teach" + 0.007*"mani"'),
 (1,
  '0.021*"polit" + 0.016*"special" + 0.013*"safeti" + 0.013*"control" + 0.010*"need" + 0.010*"occup" + 0.009*"teach" + 0.009*"intern" + 0.009*"student" + 0.009*"petroleum"'),
 (2,
  '0.042*"tourism" + 0.032*"geographi" + 0.030*"logist" + 0.030*"place" + 0.020*"suppli" + 0.014*"chain" + 0.012*"point" + 0.010*"origin" + 0.010*"tourist" + 0.010*"concern"'),
 (3,
  '0.023*"hr" + 0.022*"employe" + 0.020*"technolog" + 0.018*"learn" + 0.014*"train" + 0.012*"resourc" + 0.012*"field" + 0.012*"profession" + 0.012*"benefit" + 0.010*"teach"'),
 (4,
  '0.026*"process" + 0.016*"translat" + 0.012*"specif" + 0.011*"activ" + 0.011*"creat" + 0.011*"peopl" + 0.011*"capit" + 0.010*"collect" + 0.010*"refer" + 0.010*"associ"'),
 (5,
  '0.020*"multimedia" + 0.018*"network" + 0.013*"devic" + 0.011*"content" + 0.011*"ict" + 0.010*

# Try word2vec to identify words which aren't useful

In [1]:
model = gensim.models.Word2Vec(texts, size=500, window=5, min_count=5, workers=4, sample=1e-3)
model.wv.init_sims()
wv = model.wv.syn0
word_mapping = {model.wv.index2word[i]:v for i,v in enumerate(wv)}

NameError: name 'gensim' is not defined

In [294]:
import numpy as np
from collections import Counter

class WordPotential:
    def __init__(self):
        self.topics = []
        self.weights = []
        self.distances = []
    def append(self,distance,weight,topic):
        self.topics.append(topic)
        self.distances.append(distance)
        self.weights.append(weight)
    def forces(self):
        for d,w in self.items():
            yield w*d
    def energies(self):
        for d,w in self.items():
            yield 0.5*w*d*d
    def items(self):
        return zip(self.distances,self.weights)
    
topic_positions = []
all_forces = []
word_potentials = {}
x = []
for itopic,topic in enumerate(ldamodel.get_topics()):
    weighted_positions = []
    for idx,weight in enumerate(topic):
        word = dictionary.get(idx)
        if word in word_mapping:
            pos = word_mapping[word]
            weighted_positions.append(weight * pos)
        elif weight > 0.01:
            print(word,weight)
    topic_pos = np.average(weighted_positions,axis=0)
    #print("-------------")
    #print(topic_pos)
    topic_positions.append(topic_pos)
    
    # Plot get collection of weight*distance for this topic
    forces = []
    for idx,weight in enumerate(topic):
        word = dictionary.get(idx)
        if not word in word_mapping:
            continue
        pos = word_mapping[word]
        distance = np.linalg.norm(topic_pos - pos)
        x.append(distance)
        #print(distance)
        #print(pos)        
        #if np.abs(distance - 1.) > 1e-5:
        #    print(type(distance),distance,distance-1.0)
        #if weight > 1e-4:
        force = distance*weight
        forces.append(force)
        if word not in word_potentials:
            word_potentials[word] = WordPotential()
        word_potentials[word].append(distance,weight,itopic)
    
    #break 
    all_forces.append(forces)


In [295]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import mpld3


word_mean_potential = [max(wp.energies()) for word,wp in word_potentials.items()]
word_min_distance = [min(wp.distances) for word,wp in word_potentials.items()]
n = [counter[word] for word,wp in word_potentials.items()]
#n = [c if c < 100 else 100 for c in n]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(word_mean_potential,word_min_distance,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Word's maximum energy",fontsize=14)
ax.set_ylabel("Closest distance to a topic",fontsize=14)

#ax.set_ylim(0.,0.0004)
#ax.set_xlim(0.0,0.05)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)

#word_energy_total,np.arange(0,0.12,0.001))

In [347]:
word_mean_potential = [np.std(list(wp.energies())) for word,wp in word_potentials.items()]
word_max_potential = [np.sum(list(wp.energies())) for word,wp in word_potentials.items()]
n = [counter[word] for word,wp in word_potentials.items()]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(word_mean_potential,word_max_potential,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Word's median energy (Genericity)",fontsize=14)
ax.set_ylabel("Word's maximum force (Ambiguity) ",fontsize=14)
#ax.set_ylim(0.,1.5)
#ax.set_xlim(0.0,0.018)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)


In [360]:
sum_e = [np.std(list(wp.energies())/np.mean(list(wp.energies()))) for word,wp in word_potentials.items()]
med_e = [np.sqrt(np.sum(np.square(wp.weights),axis=0)) for word,wp in word_potentials.items()]

#sum_e = [x for _,x in sorted(zip(n,sum_e))]
#med_e = [x for _,x in sorted(zip(n,med_e))]

fig,ax = plt.subplots(figsize=(12,6))
scatter = ax.scatter(sum_e,med_e,c=n,cmap="gist_rainbow",vmax=250)
ax.set_xlabel("Std/Mean word energy (Specificity)",fontsize=14)
ax.set_ylabel("Sqrt sum of word weight^2 (Topic prevalance) ",fontsize=14)
#ax.set_ylim(0.,0.002)
#ax.set_xlim(0.0,0.018)

fig.colorbar(scatter)

labels = [word for word,wp in word_potentials.items()]
#print(labels[0:10])
tooltip = mpld3.plugins.PointLabelTooltip(scatter,labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)


In [380]:
extra_stops = []
for word,wp in word_potentials.items():
    specificity = np.std(list(wp.energies())/np.mean(list(wp.energies())))
    if specificity < 1.5: # and max_force < 0.35:
        extra_stops.append(word)
print(extra_stops)        
text,corpus,dictionary = generate_corpus(doc_set,extra_stops=extra_stops)

['establish', 'call', 'activ', 'includ', 'known', 'often', 'use', 'sever', 'relat', 'analysi', 'may', 'present', 'common', 'profession', 'general', 'various', 'unit', 'major', 'plan', 'direct', 'within', 'provid', 'also', 'requir', 'one', 'well', 'human', 'work', 'world', 'peopl', 'concern', 'develop', 'two', 'deal', 'larg', 'refer', 'scienc', 'part', 'industri', 'associ', 'describ', 'howev', 'studi', 'area', 'number', 'base', 'involv', 'produc', 'level', 'exampl', 'make', 'natur', 'consid', 'non', 'applic', 'differ', 'find', 'practic', 'theori', 'first', 'purpos', 'certain', 'increas', 'creat', 'techniqu', 'whole', 'defin', 'specif', 'person', 'case', 'generat', 'import']
25
25
21
11
22
20
36
20
20
23
16
8
30
6
9
11
8
4
3
26
19
15
17
42
3
18
6
7
7
7
18
25
26
5
46
3
13
10
10
36
8
8
6
8
10
6
1
6
6
28
22
22
30
9
17
3
54
5
10
11
22
4
9
22
21
21
6
6
26
12
47
8
10
1
16
20
16
5
5
13
7
13
11
28
17
9
32
23
9
23
23
17
12
18
12
15
4
1
7
5
15
15
54
28
3
6
13
3
14
27
19
19
5
37
17
37
11
10
23
1
13

In [381]:
# generate LDA model
text,corpus,dictionary = generate_corpus(doc_set,extra_stops=extra_stops)
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=30, id2word = dictionary, passes=1000)

25
25
21
11
22
20
36
20
20
23
16
8
30
6
9
11
8
4
3
26
19
15
17
42
3
18
6
7
7
7
18
25
26
5
46
3
13
10
10
36
8
8
6
8
10
6
1
6
6
28
22
22
30
9
17
3
54
5
10
11
22
4
9
22
21
21
6
6
26
12
47
8
10
1
16
20
16
5
5
13
7
13
11
28
17
9
32
23
9
23
23
17
12
18
12
15
4
1
7
5
15
15
54
28
3
6
13
3
14
27
19
19
5
37
17
37
11
10
23
1
13
16
31
11
1
31
31
27
17
13
1
23
8
34
25
20
35
47
16
1
69
69
27
25
11
3
20
44
11
47
5
7
19
26
8
6
18
5
18
6
11
12
11
26
17
26
25
40
6
6
11
7
32
36
17
10
9
10
20
3
21
5
6
30
6
28
75
15
21
34
19
47
5
19
10
16
27
2
34
34
34
41
26
26
26
17
26
18
14
10
10
10
26
12
25
52
11
11
1
21
11
3
8
22
18
10
10
27
3
3
5
28
41
41
28
16
16
7
4
14
7
31
30
2
4
35
48
36
10
15
34
5
12
2
3
15
5
5
12
12
45
25
6
6
7
9
37
12
27
32
9
38
27
11
11
Got 278


In [382]:
ldamodel.print_topics(num_topics=30)

[(0,
  '0.028*"optic" + 0.022*"radio" + 0.021*"stern" + 0.016*"light" + 0.013*"sensor" + 0.011*"signal" + 0.009*"year" + 0.009*"million" + 0.009*"show" + 0.007*"new"'),
 (1,
  '0.022*"busi" + 0.015*"technolog" + 0.015*"econom" + 0.014*"process" + 0.013*"environment" + 0.010*"game" + 0.009*"new" + 0.009*"field" + 0.009*"entrepreneurship" + 0.008*"product"'),
 (2,
  '0.026*"manag" + 0.023*"employe" + 0.022*"hr" + 0.015*"financ" + 0.015*"train" + 0.014*"financi" + 0.013*"compani" + 0.012*"educ" + 0.012*"benefit" + 0.012*"focus"'),
 (3,
  '0.043*"engin" + 0.031*"telecommun" + 0.030*"english" + 0.016*"system" + 0.014*"telecom" + 0.013*"judg" + 0.013*"judici" + 0.012*"design" + 0.012*"communic" + 0.012*"servic"'),
 (4,
  '0.042*"technolog" + 0.024*"educ" + 0.024*"manag" + 0.022*"school" + 0.018*"innov" + 0.016*"market" + 0.013*"organ" + 0.013*"ministri" + 0.011*"indonesia" + 0.011*"societi"'),
 (5,
  '0.111*"engin" + 0.035*"design" + 0.027*"electr" + 0.018*"field" + 0.017*"electron" + 0.017*