In [8]:
!python -m spacy download en_core_web_sm

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import spacy
import re
import pandas as pd
import numpy as np
import requests

import networkx as nx
import matplotlib.pyplot as plt

from common import *

from collections import OrderedDict

nlp = spacy.load('en_core_web_sm')
pd.set_option('display.max_colwidth', 200)
%matplotlib inline


## Sentence preparation

In [2]:
book = read_file('data/Strang-Linear Algebra.txt')
chapter = OrderedDict(get_one_chapter_strang(3, book, subsections=True, 
                                      split=True, sentence_spliter=lambda ss: nlp(ss).sents))


In [3]:
chapter.keys()


odict_keys(['Orthogonal Vectors and Subspaces', 'Cosines and Projections onto Lines ', 'Projections and Least Squares ', 'Orthogonal Bases and Gram-Schmidt ', 'The Fast Fourier Transform '])

# Sentence selection

## Features for gap-fill question-generatable and informative sentences selection

In [4]:
### FEATURES

def get_tags(sentence):
    return [token.tag_ for token in nlp(sentence)]

def get_noun_adj_tokens(words):
    return [token.lemma_ for token in nlp(words) 
            if token.pos_ == 'ADJ' or token.pos_ == 'NOUN']

    
# Informative 
def is_first_sentence(f, c):
    return f == c


def has_superlatives(curr):
    pos_tags = get_tags(curr)
    return 'JJR' in pos_tags or 'JJS' in pos_tags


def has_abbreviation(curr):
    is_abbr = lambda word: word.upper() == word and len(word) > 1
    return any(is_abbr(x) for x in curr.split())


def has_correct_ending(curr):
    return curr[-1] in ['?', '.', '!']

# Generative


def relative_number_of_words(curr):
    abs_n = abs(len(curr.split()) - 10)
    return -abs_n if abs_n > 5 else abs_n


def relative_index(i, doc_length):
    abs_i = abs(i - doc_length/2)
    return abs_i if abs_i > doc_length/4 else -abs_i


def common_tokens_count(curr, title):
    curr_tokens = get_noun_adj_tokens(curr)
    title_tokens = get_noun_adj_tokens(title.lower())
    
    return sum([tok in curr_tokens for tok in title_tokens])


def begins_with_discourse_connective(curr):
    discource_connective = ['because', 'since', 'when', 'thus', 
                            'however', 'although', 'for example', 
                            'and', 'for instance', 'how', 'in other words',
                            'therefore', 'up to this point']
    curr = curr.lower()
    return any(curr.startswith(x) for x in discource_connective)


def nouns_number(curr):
    return sum(x.pos_ == 'NOUN' for x in nlp(curr))


def pronouns_number(curr):
    return sum(x.pos_ == 'PRON' for x in nlp(curr))



## Features calculation

In [5]:
feature_weights = {
    +4:lambda s, indx, title, first_s, doc_length:  is_first_sentence(s, first_s),
    +1:lambda s, indx, title, first_s, doc_length:  has_superlatives(s),
    +1:lambda s, indx, title, first_s, doc_length:  has_abbreviation(s),
    +.5:lambda s, indx, title, first_s, doc_length:  relative_number_of_words(s),
    +2:lambda s, indx, title, first_s, doc_length:  common_tokens_count(s, title),
    -2:lambda s, indx, title, first_s, doc_length:  begins_with_discourse_connective(s),
    +1:lambda s, indx, title, first_s, doc_length:  nouns_number(s),
    -2.5:lambda s, indx, title, first_s, doc_length:  pronouns_number(s),
    +0.01:lambda s, indx, title, first_s, doc_length:  relative_index(indx, doc_length),
    +2:lambda s, indx, title, first_s, doc_length:  has_correct_ending(s)
}


def get_sentence_score(sentence, index, title, first_sentence, doc_length, weights):
    return sum(key * weights[key](sentence, index, title, first_sentence, doc_length) 
               for key in weights)

In [6]:
scores = np.zeros(sum(len(chapter[x]) for x in chapter))
global_indx = 0
for key in chapter:
    document = chapter[key]
    title = key
    doc_length = len(document)
    first_sentence = document[0]
    
    for i, sentence in enumerate(document):
        scores[global_indx] = get_sentence_score(
                              sentence, i, title, first_sentence, 
                              doc_length, feature_weights)
        global_indx += 1

In [7]:
docs = OrderedDict((key, len(chapter[key])) for key in chapter)

def get_sentence_index_in_document(doc_sent_indx, docs):
    indexes =list(docs.values())
    i = -1
    prev = 0
    while doc_sent_indx >= 0:
        i += 1
        prev = doc_sent_indx
        doc_sent_indx -= indexes[i]
    doc_name = list(docs.keys())[i]
    return doc_name, prev

## Getting sentences with best scores

In [8]:
ordered_scores = np.flip(np.argsort(scores))
top_scores = ordered_scores[:20]

top_sentences = []
for s in top_scores:
    doc_name, index = get_sentence_index_in_document(s, docs)
    top_sentences.append((doc_name, index))
    
    print(doc_name)
    print(chapter[doc_name][index])
    print(scores[s])
    print()

Orthogonal Bases and Gram-Schmidt 
In an orthogonal basis, every vector is perpendicular to every other vector.
16.11

Orthogonal Bases and Gram-Schmidt 
The Gram-Schmidt process and its interpretation as a new factorization A = QR.
13.99

Orthogonal Vectors and Subspaces
A basis is a set of independent vectors that span a space.
13.71

Orthogonal Bases and Gram-Schmidt 
improvement is easy: Divide each vector by its length, to make it a unit vector.
13.08

Orthogonal Bases and Gram-Schmidt 
1 0  reflects every point (x;y) into (y;x), its mirror image across the 45 line.
12.98

The Fast Fourier Transform 
The Fourier series is linear algebra in infinite dimensions.
12.95

Orthogonal Bases and Gram-Schmidt 
All the terms in the series are projections onto a sine or cosine.
12.84

Orthogonal Bases and Gram-Schmidt 
Its component p in this direction is exactly b1 sin x.
12.8

Orthogonal Bases and Gram-Schmidt 
The solution of Qx = b, either n by n or rectangular (least squares).
12.51

Th

# Key selection

In [9]:
from collections import defaultdict


key_list = defaultdict(lambda: list())

for i, (doc_name, sent_i) in enumerate(top_sentences):
    sent = chapter[doc_name][sent_i]
    for chunk in nlp(sent).noun_chunks:
        key_list[i].append(chunk.text)

key_list

defaultdict(<function __main__.<lambda>()>,
            {0: ['an orthogonal basis', 'every vector', 'every other vector'],
             1: ['The Gram-Schmidt process',
              'its interpretation',
              'a new factorization',
              'A = QR'],
             2: ['A basis', 'a set', 'independent vectors', 'a space'],
             3: ['improvement', 'each vector', 'its length', 'it'],
             4: ['every point',
              'x;y',
              '(y;x',
              'its mirror image',
              'the 45 line'],
             5: ['The Fourier series',
              'linear algebra',
              'infinite dimensions'],
             6: ['All the terms',
              'the series',
              'projections',
              'a sine',
              'cosine'],
             7: ['Its component p', 'this direction', 'exactly b1 sin x.'],
             8: ['The solution', 'Qx = b', 'n'],
             9: ['the right', 'the permutation matrix', 'c', 'c0', 'c00'],
      

In [10]:
def get_most_important_word(chunk):
    importance_order = ['ADJ','NOUN', 'NUM']
    for pos in importance_order:
        for i in nlp(chunk):
            if str(i.pos_) == pos:
                return i.text
        

In [11]:
[x.pos_ for x in nlp('Even numbers such as 2 and 4')]

['ADV', 'NOUN', 'ADJ', 'SCONJ', 'NUM', 'CCONJ', 'NUM']

In [12]:
get_most_important_word('Even numbers such as 2 and 4')

'such'

### Features for key selection

In [13]:
def title_occurance(key, title):
    return common_tokens_count(key, title)


def document_occurance(key, doc):
    total = 0
    for s in doc:
        total += common_tokens_count(key, s)
    return total


def get_depth_in_syntactic_tree(token, depth=0):
    d = [get_depth_in_syntactic_tree(child, depth+1) for child in token.children]
    d.append(0)
    return max(d)
    

def depth_in_sentence(key, s):
    most_imp = get_most_important_word(key)
    for tok in nlp(s):
        if str(tok.text) == most_imp:
            return get_depth_in_syntactic_tree(tok)
    return 0


In [14]:
def get_best_key(key_list, sentence, doc, title):
    scores = [title_occurance((key), title)  + \
              document_occurance((key), doc) +
              depth_in_sentence((key), sentence)
                  for key in key_list]
    return key_list[scores.index(max(scores))]

In [15]:
for i in key_list:
    doc = chapter[top_sentences[i][0]]
    s = doc[top_sentences[i][1]]
    print(key_list[i])
    key_list[i] = get_best_key(key_list[i], s, doc, top_sentences[i][0])
    print(key_list[i])

key_list

['an orthogonal basis', 'every vector', 'every other vector']
an orthogonal basis
['The Gram-Schmidt process', 'its interpretation', 'a new factorization', 'A = QR']
a new factorization
['A basis', 'a set', 'independent vectors', 'a space']
independent vectors
['improvement', 'each vector', 'its length', 'it']
each vector
['every point', 'x;y', '(y;x', 'its mirror image', 'the 45 line']
the 45 line
['The Fourier series', 'linear algebra', 'infinite dimensions']
linear algebra
['All the terms', 'the series', 'projections', 'a sine', 'cosine']
projections
['Its component p', 'this direction', 'exactly b1 sin x.']
Its component p
['The solution', 'Qx = b', 'n']
Qx = b
['the right', 'the permutation matrix', 'c', 'c0', 'c00']
the permutation matrix
['= x5', 'least squares']
least squares
['an orthogonal Q', 'the product', 'a rotation', 'a reflection']
an orthogonal Q
['Even numbers', 'numbers', 'numbers']
Even numbers
['spite', 'their unsolvability', 'inconsistent equations', 'all the time

defaultdict(<function __main__.<lambda>()>,
            {0: 'an orthogonal basis',
             1: 'a new factorization',
             2: 'independent vectors',
             3: 'each vector',
             4: 'the 45 line',
             5: 'linear algebra',
             6: 'projections',
             7: 'Its component p',
             8: 'Qx = b',
             9: 'the permutation matrix',
             10: 'least squares',
             11: 'an orthogonal Q',
             12: 'Even numbers',
             13: 'inconsistent equations',
             14: 'an orthogonal matrix',
             15: 'the length formula',
             16: 'These rotation matrices',
             17: 'a simple formula',
             18: 'the column space',
             19: 'All inner products'})

In [17]:
for i, (doc_name, sentence_index) in enumerate(top_sentences):
    s = chapter[doc_name][sentence_index]
    create_gap_filled_question(s, key_list[i])
    print()

In ___________________, every vector is perpendicular to every other vector.
Answer: an orthogonal basis

The Gram-Schmidt process and its interpretation as ___________________ A = QR.
Answer: a new factorization

A basis is a set of ___________________ that span a space.
Answer: independent vectors

improvement is easy: Divide ___________ by its length, to make it a unit vector.
Answer: each vector

1 0  reflects every point (x;y) into (y;x), its mirror image across ___________.
Answer: the 45 line

The Fourier series is ______________ in infinite dimensions.
Answer: linear algebra

All the terms in the series are ___________ onto a sine or cosine.
Answer: projections

_______________ in this direction is exactly b1 sin x.
Answer: Its component p

The solution of ______, either n by n or rectangular (least squares).
Answer: Qx = b

At the right is ______________________ that separates c into c0 and c00.
Answer: the permutation matrix

= x5 by _____________.
Answer: least squares

Geom

# Distractors selection

In [None]:
 ## HELOOOOOOOOOOO
##   top_sentences = [ (name of document, index in document),...]  see above s = chapter[top_sentences[i][0]][top_sentences[i][1]]

##get sentence = chapter[name of document][index]
