In [1]:
import csv
import os
import pickle
import re
import sys
from itertools import zip_longest

import mwparserfromhell
import numpy as np
import pandas as pd
import requests
import scipy.spatial.distance as sdist
# import wikipedia
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

# Local
## Allow local relative imports
module_path = os.path.abspath('..')
include_path = os.path.join(module_path, 'include')
if include_path not in sys.path:
    sys.path.append(include_path)

from my_nlp import Tokenizer

In [2]:
def grouper(iterable, n, fillvalue = None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue = fillvalue)

In [3]:
def score_on_doc(tfidf_vectorizer, doc, comp_text, metric = sdist.cosine):
    doc_tfidf = tfidf_vectorizer.transform([doc]).toarray()
    comp_text_tfidf = tfidf_vectorizer.transform(comp_text).toarray()
    scores = []
    for i in range(comp_text_tfidf.shape[0]):
        if doc_tfidf.dot(comp_text_tfidf[i]) != 0:
            scores.append((i, sdist.cosine(doc_tfidf, comp_text_tfidf[i])))
    return sorted(scores, key = lambda x: x[1], reverse = True)

In [5]:
titles = []
with open(module_path + "/data/titles-sorted.txt", "r") as titles_stream:
    for title in titles_stream:
        titles.append(title.rstrip('\n'))

In [6]:
links_dict = {}
with open(module_path + "/data/links-simple-sorted.txt", "r") as links_stream:
    for links in links_stream:
        origin_str, targets_str = links.rstrip('\n').split(': ')
        origin = int(origin_str)
        links_dict[origin] = []
        for target in targets_str.split():
            target = int(target)
            links_dict[origin].append(target)

In [None]:
for origin, targets in list(links_dict.items()):
    for target in list(targets):
        if not (target in links_dict and origin in links_dict[target]):
            links_dict[origin].remove(target)
    if len(links_dict[origin]) == 0:
        del links_dict[origin]

In [None]:
with open(module_path + '/data/links.pickle', 'wb') as handle:
    pickle.dump(links_dict, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open(module_path + '/data/titles.pickle', 'wb') as handle:
    pickle.dump(titles, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [4]:
with open(module_path + '/data/links.pickle', 'rb') as handle:
    links_dict = pickle.load(handle)

with open(module_path + '/data/titles.pickle', 'rb') as handle:
    titles = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ross/Dropbox/Projects/insight/wikontext/data/links.pickle'

In [231]:
origin_topic = "Boston"
target_topic = "New_York_City"

In [321]:
origin_index = titles.index(origin_topic) + 1
target_index = titles.index(target_topic) + 1

In [1]:
sess = requests.Session()
wapi_url = "https://en.wikipedia.org/w/api.php"
wapi_params = {
    'action': "query",
    'maxlag': 5,
    'prop': "revisions",
    'titles': titles[:100],
    'rvprop': "content",
    'format': "json"
}
origin_data = sess.get(url = wapi_url, params = wapi_params).json()

NameError: name 'requests' is not defined

In [329]:
def wiki_labeled_pair_gen(all_titles, links_dict, n_titles = 100, maxlag = 5):
    sess = requests.Session()
    wapi_url = "https://en.wikipedia.org/w/api.php"
    for title_inds in grouper(links_dict, n_titles):
        titles = []
        link_targets = []
        for i in title_inds:
            if not i is None:
                titles.append(all_titles[i - 1])
                link_targets.append(links_dict[i])
        wapi_params = {
            'action': "query",
            'maxlag': maxlag,
            'prop': "revisions",
            'titles': titles[i:i + n_titles],
            'rvprop': "content",
            'format': "json"
        }
        origin_data = sess.get(url = wapi_url, params = wapi_params).json()
        origin_wikitext = list(origin_data['query']['pages'].values())[0]['revisions'][0]['*']
        origin_wikisentences = origin_wikitext.split(". ")
        for target in link_dict[i]
        origin_target_wikisentences = [x for x in origin_wikisentences if " [[" + titles[target - 1].replace("_", " ") + "]] " in x]
        origin_target_sentences = [mwparserfromhell.parse(x).strip_code() for x in origin_target_wikisentences]

In [324]:
with open(module_path + '/data/links.csv', 'a') as links_stream:
    for origin, targets in links_dict.items():
        for target in targets:
            wapi_params = {
                'action': "query",
                'maxlag': 5,
                'prop': "revisions",
                'titles': titles[origin - 1],
                'rvprop': "content",
                'format': "json"
            }

            origin_data = sess.get(url = wapi_url, params = wapi_params).json()
            origin_wikitext = list(origin_data['query']['pages'].values())[0]['revisions'][0]['*']
            origin_wikisentences = origin_wikitext.split(". ")
            origin_target_wikisentences = [x for x in origin_wikisentences if " [[" + titles[target - 1].replace("_", " ") + "]] " in x]
            origin_target_sentences = [mwparserfromhell.parse(x).strip_code() for x in origin_target_wikisentences]

            for origin_target_sentence in origin_target_sentences:
                print("{},{}".format(origin_target_sentence, titles[origin - 1]), file = links_stream)

In [328]:
origin_target_sentences

['By the mid-18th century, New York City and Philadelphia surpassed Boston in wealth']

In [None]:
from gensim.models import Word2Vec
model = Word2Vec.load("path/to/word2vec/en.model")
model.similarity('woman', 'man')

In [239]:
origin_page = wikipedia.page(origin_topic)
target_page = wikipedia.page(target_topic)

In [123]:
t = Tokenizer()

t.load(base_page.content)
t.tokenize(lemmatize = True)
base_docs = [' '.join(x) for x in t.sentence_tokens]

t.load('. '.join([s for s in base_page.content.split(". ") if target_topic.lower() in s.lower()]))
t.tokenize(lemmatize = True)
base_sents = [' '.join(x) for x in t.sentence_tokens]

t.load(target_page.content)
t.tokenize(lemmatize = True)
target_docs = [' '.join(x) for x in t.sentence_tokens]

In [166]:
max_df = 0.8
min_df = 5

max_features = 5000

min_n_gram = 1
max_n_gram = 4

tfidf_vectorizer = TfidfVectorizer(min_df = min_df, max_df = max_df,
                                   max_features = max_features,
                                   ngram_range = (min_n_gram, max_n_gram),
                                   stop_words = 'english').fit(base_docs)

terms = tfidf_vectorizer.get_feature_names()

In [167]:
base_sents_tfidf = tfidf_vectorizer.transform(base_sents).toarray()
target_tfidf = tfidf_vectorizer.transform(target_docs).toarray()

In [169]:
base_sents[0]

'by the mid 18th century new york city and philadelphia surpass boston in wealth'

In [168]:
[target_docs[x[0]] for x in score_on_doc(tfidf_vectorizer, base_sents[0], target_docs)[:5]]

['the city and surround area suffer the bulk of the economic damage and largest loss of human life in the aftermath of the september attack when of the terrorist associate with al qaeda pilot american airline flight into the north tower of the world trade center and unite airline flight into the south tower of the world trade center and later destroy them kill civilian firefighter and law enforcement officer who be in the tower and in the surround area',
 'hispanic of any race represent of the population while asian constitute the fastest grow segment of the city s population between and the non hispanic white population decline percent the smallest record decline in decade and for the first time since the civil war the number of black decline over a decade',
 'some of the natural relief in topography have be even out especially in manhattan the city s total area be square mile km2 include sq mi km2 of land and sq mi km2 of this be water',
 'race and ethnicity the city s population in 

In [70]:
t = Tokenizer()

t.load(base_page.content)
t.tokenize(lemmatize = True)
base_docs = [TaggedDocument(x, [i]) for i, x in enumerate(t.sentence_tokens)]

t.load('. '.join([s for s in base_page.content.split(". ") if target_topic.lower() in s.lower()]))
t.tokenize(lemmatize = True)
base_sents = t.sentence_tokens


t.load(target_page.content)
t.tokenize(lemmatize = True)
target_docs = [TaggedDocument(x, [i]) for i, x in enumerate(t.sentence_tokens)]

In [89]:
model = Doc2Vec(vector_size = 500, window = 2, min_count = 3, workers = 4, seed = 0, epochs = 3)
model.build_vocab(target_docs)

In [90]:
%%time
# Train on the same tagged sentences the vocabulary was built from, reading
# the example count and epoch count back from the model itself.
model.train(target_docs, total_examples = model.corpus_count, epochs = model.epochs)

CPU times: user 99.9 ms, sys: 9.43 ms, total: 109 ms
Wall time: 71.6 ms


In [91]:
model.wv.most_similar("new")

  if np.issubdtype(vec.dtype, np.int):


[('the', 0.9990196228027344),
 ('be', 0.9988275766372681),
 ('of', 0.9987856149673462),
 ('and', 0.9987653493881226),
 ('in', 0.9987581372261047),
 ('a', 0.9984621405601501),
 ('to', 0.9982845783233643),
 ('city', 0.998284101486206),
 ('york', 0.9981852769851685),
 ('s', 0.998073935508728)]

In [92]:
def get_sent(model, s):
    inferred_vec = model.infer_vector(s)
    sims = model.wv.most_similar([inferred_vec], topn = len(model.docvecs))
[''.join(x[0]) for x in sims]

  if np.issubdtype(vec.dtype, np.int):


['from',
 'to',
 'in',
 'on',
 'city',
 'area',
 's',
 'be',
 'island',
 'york',
 'population',
 'u',
 'the',
 'new',
 'and',
 'largest',
 'of',
 'manhattan',
 'a',
 'many',
 'state',
 'national',
 'million',
 'center',
 'which',
 'that',
 'with',
 'for',
 'bronx',
 'home',
 'number',
 'include',
 'it',
 'by',
 'american',
 'world',
 'all',
 'other',
 'day',
 'park',
 'most',
 'an',
 'at',
 'over',
 'brooklyn',
 'system',
 'unite',
 'such',
 'than',
 'long',
 'also',
 'public',
 'street',
 'asian',
 'have',
 'hispanic',
 'into',
 'borough',
 'one',
 'lower',
 'make',
 'staten',
 'year',
 'industry',
 'central',
 'metropolitan',
 'major',
 'square',
 'parade',
 'acre',
 'county',
 'immigrant',
 'become',
 'technology',
 'build',
 'dutch',
 'financial',
 'medium',
 'billion',
 'mile',
 'black',
 'queen',
 'white',
 'or',
 'first',
 'between',
 'river',
 'economic',
 'bridge',
 'museum',
 'record',
 'after',
 'while',
 'more',
 'would',
 'group',
 'global',
 'several',
 'approximately',
 

In [97]:
model.wv.similarity(model.infer_vector(base_sents[0]), model.infer_vector(target_docs[0].words))

KeyError: "word '-0.0015622723' not in vocabulary"

In [95]:
[x for x in base_sents[0] if x in model.wv.vocab]

['by',
 'the',
 'mid',
 'century',
 'new',
 'york',
 'city',
 'and',
 'philadelphia',
 'surpass',
 'in']