In [141]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

import spacy
import ot
 
import seaborn as sns

from mlutil.embeddings import load_gensim_embedding_model, TextEncoderVectorizer, WordEmbeddingsVectorizer

In [2]:
!wget -nc -O wikihowAll.csv https://query.data.world/s/lult233wfonljfadtexn2t5x5rb7is

File ‘wikihowAll.csv’ already there; not retrieving.


In [114]:
def sent_analyzer(sentence, stop_words=[], excluded_pos=['PUNCT', 'SPACE', 'PRON', 'X'], lowercase=True, lemmatize=True):
    def preprocess_token(token):
        if lemmatize:
            word = token.lemma_
        else:
            word = token.text
        if lowercase:
            return word.lower()
        else:
            return word
    
    if lowercase:
        words = [preprocess_token(token) for token in sentence if token.pos_ not in excluded_pos]
    else:
        words = [preprocess_token(token) for token in sentence if token.pos_ not in excluded_pos]
    return [token for token in words if not token in stop_words]


def make_uniform_like(n):
    return np.ones(n) / n


def get_masked_p_doc(i, p_doc):
    ith_doc_indices_start = sum(mat.shape[0] for mat in sentence_matrices[:i])
    ith_doc_indices_end = ith_doc_indices_start + sentence_matrices[i].shape[0]
    
    mask = np.ones_like(p_doc)
    mask[ith_doc_indices_start:ith_doc_indices_end] = 0
    p_doc = p_doc * mask
    return p_doc / p_doc.sum()


def transport_matrix(i, mask_p_doc=True, reg=1):
    n, m = transport_costs[i].shape
    p_example = make_uniform_like(n)
    p_doc = make_uniform_like(m)
    if mask_p_doc:
        p_doc = get_masked_p_doc(i, p_doc)
    return ot.sinkhorn(p_example, p_doc, transport_costs[i], reg)


def transport_cost(i, reg=1):
    return (transport_costs[i] * transport_matrix(i, reg)).sum()

In [67]:
wikihow_df = pd.read_csv('wikihowAll.csv')

In [68]:
nlp = spacy.load('en_core_web_sm')

keyed_vectors = load_gensim_embedding_model('glove-wiki-gigaword-50')
vectorizer = WordEmbeddingsVectorizer(keyed_vectors)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [69]:
small_wikihow_df = wikihow_df[:100]
small_wikihow_df['text'] = small_wikihow_df['text'].str.replace('\n', ' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [70]:
text = small_wikihow_df.iloc[0]['text'].replace('\n', ' ')

In [71]:
doc = nlp(text)

In [72]:
example_sentence = list(doc.sents)[0]

In [73]:
example_sent = list(doc.sents)[14]

In [115]:
min_sentence_length = 6

sentences = list(doc.sents)
analyzed_sentences = [sent_analyzer(sent, stop_words=nlp.Defaults.stop_words) for sent in sentences]
cleaned_sentences = [' '.join(sent) for sent in analyzed_sentences if len(sent) >= min_sentence_length]
filtered_sentences = [sent for (sent, analyzed_sent) in zip(sentences, analyzed_sentences) if len(analyzed_sent) >= min_sentence_length]
sentence_matrices = vectorizer.transform(cleaned_sentences, aggregate=False)

In [116]:
whole_document = np.vstack(sentence_matrices)

In [117]:
transport_costs = [cosine_distances(sent, whole_document) for sent in sentence_matrices]

In [118]:
calculated_costs = [transport_cost(i) for i in range(len(sentence_matrices))]

In [119]:
calculated_costs[0]

0.7489980524374757

In [120]:
sorted_scores = np.argsort(calculated_costs)
sorted_scores

array([13, 25,  3, 23, 12, 21, 22, 24, 16, 20, 15,  7,  5,  8, 14, 19,  1,
        2, 10, 18,  6,  9,  4, 17, 11,  0])

In [121]:
len(sentence_matrices[14])

12

In [122]:
for i in range(5):
    print(i+1, 'th most central sentence')
    print(filtered_sentences[sorted_scores[i]])
    print(cleaned_sentences[sorted_scores[i]])
    print()

1 th most central sentence
Cheap and easy, this is also a good way to handle papers and ideas you touch regularly or need to pin up and down for inspiration.
cheap easy good way handle paper idea touch regularly need pin inspiration

2 th most central sentence
If you haven't used it in the last six months there is little chance you'll use it in the next six months.
use month little chance use month

3 th most central sentence
Some ideas include:   Essential supplies area -- the things you use every day.
idea include essential supply area thing use day

4 th most central sentence
This is a good thing, but only if you set aside time to declutter.
good thing set aside time declutter

5 th most central sentence
Simply string up the wires across a wall or along the ceiling and use them to hold essential papers that you don't want to cut or ruin with tacks or tape.
simply stre wire wall ceiling use hold essential paper want cut ruin tack tape



In [103]:
cleaned_sentences[sorted_scores[0]]

'cheap easy good way handle paper idea -pron- touch regularly need pin inspiration'

In [64]:
sentences[sorted_scores[1]]

Some ideas, beyond those just mentioned, include:   Canvas shoe racks on the back of the door Wine racks with cups in each slot to hold pens/pencils.

In [151]:
n_sentences = whole_document.shape[0]
p_sentences = make_uniform_like(n_sentences)
K = cosine_distances(whole_document, whole_document)
transport_matrix = ot.sinkhorn(p_sentences, p_sentences, K, 0.01)