In [8]:
import pandas as pd
import gensim

# Preprocessing text
import string
import re

In [9]:
import json

# Read the Udacity API JSON file and parse its contents into Python objects.
with open('../datasets/udacity-api.json', 'r') as f:
    J = json.loads(f.read())

In [10]:
# Show the top-level keys of the loaded JSON object.
J.keys()

[u'courses', u'tracks', u'degrees']

In [82]:
def load_data(raw_df, SEARCH_FIELDS, TAG):
    """Split a raw frame into searchable text columns and the remaining columns.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Source table; must contain the TAG column and every search field.
    SEARCH_FIELDS : list of str
        Names of the columns that hold searchable text.
    TAG : str
        Name of the column used as the index of both results. The column
        itself is kept (``drop=False``).

    Returns
    -------
    (data, metadata) : tuple of pandas.DataFrame
        ``data`` holds only the SEARCH_FIELDS columns, without the rows that
        have a missing value in any of them. ``metadata`` holds every other
        column of ``raw_df`` for all rows (no rows are dropped from it).
    """
    indexed = raw_df.set_index(TAG, drop=False)
    other_columns = [name for name in raw_df.columns if name not in SEARCH_FIELDS]

    ### keep only the rows that have text in the search fields
    data = indexed[SEARCH_FIELDS].dropna(subset=SEARCH_FIELDS)
    metadata = indexed[other_columns]
    return data, metadata
    
### Udacity
# Stored under udacity_* names so that the dart load below, which binds the
# generic names (SEARCH_FIELDS, TAG, raw_df, data, metadata), does not
# overwrite these results.
UDACITY_SEARCH_FIELDS = ["title", "subtitle", "expected_learning", "syllabus", "summary", "short_summary"]
UDACITY_TAG = 'key'
udacity_raw_df = pd.DataFrame.from_records(J['courses'])
udacity_data, udacity_metadata = load_data(udacity_raw_df, UDACITY_SEARCH_FIELDS, UDACITY_TAG)

### dart
# The generic names hold the dart corpus; `data` is indexed by course_id and
# has a single 'contents' text column.
SEARCH_FIELDS = ['contents']
TAG = 'course_id'
raw_df = pd.read_csv('~/Dropbox/_VPAL/findability/research/data/Corpus/latest_flag_plus_rm_GSE1.csv')
data, metadata = load_data(raw_df, SEARCH_FIELDS, TAG)

In [85]:
# Preprocessing text
import string
import re
from pprint import pprint
# Matches a run of one or more characters from string.punctuation;
# preprocess() replaces each such run with a single space.
# Characters outside string.punctuation (e.g. curly quotes) are not matched.
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)

# Gensim
from gensim.models import Doc2Vec, Phrases
from gensim.models.doc2vec import LabeledSentence
from gensim.matutils import unitvec

# Stop Words
from gensim.parsing.preprocessing import STOPWORDS as stop_words
# Extend gensim's stop-word set with every single lowercase letter and every
# single digit. string.digits includes '0', which a hand-typed '123456789'
# would miss.
letters = list(string.ascii_lowercase)
numbers = list(string.digits)
stop_words = stop_words.union(letters, numbers)

from random import shuffle


def preprocess(text):
    """Tokenize text: punctuation runs become spaces, then lowercase and split.

    Returns the list of whitespace-separated tokens.
    """
    without_punct = RE_PUNCT.sub(" ", text)
    lowered = without_punct.lower()
    return lowered.split()

def make_movie_doc(text, title, drop_stopwords=True):
    """Make a document into a LabeledSentence object for doc2vec training.

    Parameters
    ----------
    text : str
        Raw document text. It is tokenized with ``preprocess`` and passed
        through the module-level ``bigram`` phrase model.
    title : str
        Document identifier. Its tokens are joined with "_" to form the
        single tag attached to the document.
    drop_stopwords : bool, optional
        When True (the default) tokens contained in ``stop_words`` are
        removed. When False every token is kept.

    Returns
    -------
    LabeledSentence
        The token list together with a one-element tag list.
    """
    doctag = '_'.join(preprocess(title))
    tokens = bigram[preprocess(text)]
    if drop_stopwords:
        # Build a real list (not a lazy filter object) so the words can be
        # iterated more than once during training.
        docwords = [word for word in tokens if word not in stop_words]
    else:
        docwords = list(tokens)
    return LabeledSentence(docwords, [doctag])

# Raw texts and their tags, taken from `data` in matching row order.
TEXTS = data['contents'].tolist()
DOC_TAGS = data.index.tolist()

# Train bigrammer to detect two-word phrases, e.g., breaking_bad
bigram = Phrases([preprocess(text) for text in TEXTS])

# Wrap every (text, tag) pair as a training document.
DOCS = []
for text, title in zip(TEXTS, DOC_TAGS):
    DOCS.append(make_movie_doc(text, title))

# Randomize the document order in place.
shuffle(DOCS)

In [86]:
# Doc2Vec hyperparameters, one per line so individual values are easy to tweak.
model = Doc2Vec(
    dm=0,
    dbow_words=1,
    size=100,
    window=10,
    min_count=4,
    sample=1e-4,
    negative=3,
    hs=0,
    workers=8,
)

# Build the vocabulary from the documents, then train on the same documents.
model.build_vocab(DOCS)
model.train(DOCS)

18714165

In [101]:
searches = [
#     "neural networks",
#     "swift",
#     "venture capital",
#     "object-oriented programming",
#     "deep learning",
#     "macOS",
    "cognition",
    "einstein",
    "biochemistry"
]

# Find words similar to query word
for term in searches:
    # A term missing from the vocabulary would raise KeyError inside
    # most_similar and abort the remaining queries, so report it and move on.
    if term not in model.wv.vocab:
        print('%r is not in the model vocabulary' % term)
    else:
        # Query through model.wv, the same accessor used to inspect the vocabulary.
        pprint(model.wv.most_similar(term))
    print('\n')

[(u'actuating', 0.8611820936203003),
 (u'purport', 0.8554942607879639),
 (u'cognitions', 0.847074031829834),
 (u'definite_conception', 0.8439792990684509),
 (u'pure_philosophy', 0.843144416809082),
 (u'mere_appearances', 0.8416832089424133),
 (u'pure_rational', 0.8401669859886169),
 (u'moral_judgement', 0.8398066163063049),
 (u'pure_practical', 0.8389104604721069),
 (u'receptivity', 0.8370202779769897)]


[(u'bohr', 0.8465549349784851),
 (u'general_relativity', 0.8431010842323303),
 (u'adler', 0.8430156111717224),
 (u'lorentz', 0.8339930772781372),
 (u'peter_galison', 0.8281943202018738),
 (u'albert_einstein', 0.8274244070053101),
 (u'mach', 0.8261105418205261),
 (u'physicists', 0.8256953954696655),
 (u'quantum_theory', 0.8238811492919922),
 (u'friedrich_adler', 0.8147999048233032)]


[(u'prebiotic_chemistry', 0.846264123916626),
 (u'modern_biology', 0.809945821762085),
 (u'genetic_molecules', 0.8055306077003479),
 (u'complicated_machinery', 0.7995242476463318),
 (u'protein_enzymes', 0

In [102]:
# If you want to look at words in the vocabulary
# Prints one "<word> <count>" line per vocabulary entry. The single-argument
# print call produces the same output on Python 2 and also runs on Python 3.
for w, wobj in model.wv.vocab.items():
    print('%s %s' % (w, wobj.count))

fawn 21
rate_adding 6
career_readiness 10
vassall 7
nunnery 4
æ¼¢æ_hàn 8
lord_bacon 6
utnapishtim 7
cash_flows 9
woods 79
spiders 6
hanging 70
bromelain 4
woody 4
minhang 4
localized 69
regularize 8
should_“check 8
sprague 4
bigmemory 11
scold 6
inasmuch_as 60
nanking 34
originality 42
crossbar 15
hermann 13
watch_tv 10
cytochrome 13
downward_strokes 6
june_23 8
june_20 7
june_26 6
gasses 20
senate_majority 7
black_pepper 19
yàngbÇnxì 6
stipulate 10
pigment 34
primarily_composed 7
ffad33_cursor 9
perfectly_aware 7
february_2013 4
superiority_trial 7
strictest 7
screaming 17
seir 4
decisive_objection 6
elevates 11
yúnyì 4
worldwide_tests 14
grueling 5
e_cigarettes 12
wooden 76
lightbox_position 9
wednesday 55
highly_placed 10
stauffer 7
arbitrary_factors 15
fùshìzú 4
profit_motive 7
amplifications 14
despotical 16
reads_end2 7
emr_past 6
social_unrest 9
pantheistic 4
policy_maker 10
thrace 12
inevitably 136
1070s 9
francesco 12
scraper 7
270 33
271 4
272 5
273 25
274 12
275 20
276 5
