In [1]:
import spacy
import re
import pandas as pd
import numpy as np
import requests

import networkx as nx
import matplotlib.pyplot as plt

from common import *

from collections import OrderedDict

nlp = spacy.load('en_core_web_sm')
pd.set_option('display.max_colwidth', 200)
%matplotlib inline

# Sentence preparation

In [23]:
book = read_file('data/Strang-Linear Algebra.txt')
chapter = OrderedDict(get_one_chapter_strang(2, book, subsections=True, 
                                      split=True, sentence_spliter=lambda ss: nlp(ss).sents))


In [24]:
chapter.keys()


odict_keys(['Vector Spaces and Subspaces', 'Solving Ax = 0 and Ax = b ', 'Linear Independence, Basis, and Dimension ', 'The Four Fundamental Subspaces ', 'Graphs and Networks ', 'Linear Transformations '])

# Sentence selection

## Features for gap-fill question-generatable and informative sentences selection

In [25]:
### FEATURES

def get_tags(sentence):
    return [token.tag_ for token in nlp(sentence)]

def get_noun_adj_tokens(words):
    return [token.lemma_ for token in nlp(words) 
            if token.pos_ == 'ADJ' or token.pos_ == 'NOUN']

    
# Informative 
def is_first_sentence(f, c):
    return f == c


def has_superlatives(curr):
    pos_tags = get_tags(curr)
    return 'JJR' in pos_tags or 'JJS' in pos_tags


def has_abbreviation(curr):
    is_abbr = lambda word: word.upper() == word and len(word) > 1
    return any(is_abbr(x) for x in curr.split())


def has_correct_ending(curr):
    return curr[-1] in ['?', '.', '!']

# Generative


def relative_number_of_words(curr):
    abs_n = abs(len(curr.split()) - 10)
    return -abs_n if abs_n > 5 else abs_n


def relative_index(i, doc_length):
    abs_i = abs(i - doc_length/2)
    return abs_i if abs_i > doc_length/4 else -abs_i


def common_tokens_count(curr, title):
    curr_tokens = get_noun_adj_tokens(curr)
    title_tokens = get_noun_adj_tokens(title.lower())
    
    return sum([tok in curr_tokens for tok in title_tokens])


def begins_with_discourse_connective(curr):
    discource_connective = ['because', 'since', 'when', 'thus', 
                            'however', 'although', 'for example', 
                            'and', 'for instance', 'how', 'in other words',
                            'therefore', 'up to this point']
    curr = curr.lower()
    return any(curr.startswith(x) for x in discource_connective)


def nouns_number(curr):
    return sum(x.pos_ == 'NOUN' for x in nlp(curr))


def pronouns_number(curr):
    return sum(x.pos_ == 'PRON' for x in nlp(curr))



## Features calculation

In [26]:
feature_weights = {
    +4:lambda s, indx, title, first_s, doc_length:  is_first_sentence(s, first_s),
    +1:lambda s, indx, title, first_s, doc_length:  has_superlatives(s),
    +1:lambda s, indx, title, first_s, doc_length:  has_abbreviation(s),
    +.5:lambda s, indx, title, first_s, doc_length:  relative_number_of_words(s),
    +2:lambda s, indx, title, first_s, doc_length:  common_tokens_count(s, title),
    -2:lambda s, indx, title, first_s, doc_length:  begins_with_discourse_connective(s),
    +1:lambda s, indx, title, first_s, doc_length:  nouns_number(s),
    -2.5:lambda s, indx, title, first_s, doc_length:  pronouns_number(s),
    +0.01:lambda s, indx, title, first_s, doc_length:  relative_index(indx, doc_length),
    +2:lambda s, indx, title, first_s, doc_length:  has_correct_ending(s)
    
}


def get_sentence_score(sentence, index, title, first_sentence, doc_length, weights):
    return sum(key * weights[key](sentence, index, title, first_sentence, doc_length) 
               for key in weights)

In [27]:
scores = np.zeros(sum(len(chapter[x]) for x in chapter))
global_indx = 0
for key in chapter:
    document = chapter[key]
    title = key
    doc_length = len(document)
    first_sentence = document[0]
    
    for i, sentence in enumerate(document):
        scores[global_indx] = get_sentence_score(
                              sentence, i, title, first_sentence, 
                              doc_length, feature_weights)
        global_indx += 1

In [28]:
docs = OrderedDict((key, len(chapter[key])) for key in chapter)

def get_sentence_index_in_document(doc_sent_indx, docs):
    indexes =list(docs.values())
    i = -1
    prev = 0
    while doc_sent_indx >= 0:
        i += 1
        prev = doc_sent_indx
        doc_sent_indx -= indexes[i]
    doc_name = list(docs.keys())[i]
    return doc_name, prev

## Getting sentences with best scores

In [29]:
ordered_scores = np.flip(np.argsort(scores))
top_scores = ordered_scores[:5]

top_sentences = []
for s in top_scores:
    doc_name, index = get_sentence_index_in_document(s, docs)
    top_sentences.append((doc_name, index))
    
    print(doc_name)
    print(chapter[doc_name][index])
    print(scores[s])
    print()

Solving Ax = 0 and Ax = b 
Chapter 1 concentrated on square invertible matrices.
13.955

Solving Ax = 0 and Ax = b 
This is the equation for the plane (in the first description of the column space).
12.965

Solving Ax = 0 and Ax = b 
That makes Ax = b solvable, so b is in the column space.
12.945

The Four Fundamental Subspaces 
The previous section dealt with definitions rather than constructions.
12.655000000000001

Solving Ax = 0 and Ax = b 
The column space of the 1 by 1 zero matrix contains only b = 0.
12.645



# Key selection

In [30]:
from collections import defaultdict


key_list = defaultdict(lambda: list())

for i, (doc_name, sent_i) in enumerate(top_sentences):
    sent = chapter[doc_name][sent_i]
    for chunk in nlp(sent).noun_chunks:
        if check_wordset(chunk.text):
            key_list[i].append(chunk.text)

key_list

defaultdict(<function __main__.<lambda>()>,
            {0: ['Chapter', 'square invertible matrices'],
             1: ['the equation',
              'the plane',
              'the first description',
              'the column space'],
             2: ['Ax', 'b', 'the column space'],
             3: ['The previous section', 'definitions', 'constructions'],
             4: ['The column space', 'only b']})

In [31]:
def get_most_important_word(chunk):
    importance_order = ['ADJ','NOUN', 'NUM']
    for pos in importance_order:
        for i in nlp(chunk):
            if str(i.pos_) == pos:
                return i.text
        

In [32]:
[x.pos_ for x in nlp('Even numbers such as 2 and 4')]

['ADV', 'NOUN', 'ADJ', 'SCONJ', 'NUM', 'CCONJ', 'NUM']

In [33]:
get_most_important_word('Even numbers such as 2 and 4')

'such'

### Features for key selection

In [34]:
def title_occurance(key, title):
    return common_tokens_count(key, title)


def document_occurance(key, doc):
    total = 0
    for s in doc:
        total += common_tokens_count(key, s)
    return total


def get_depth_in_syntactic_tree(token, depth=0):
    d = [get_depth_in_syntactic_tree(child, depth+1) for child in token.children]
    d.append(0)
    return max(d)
    

def depth_in_sentence(key, s):
    most_imp = get_most_important_word(key)
    for tok in nlp(s):
        if str(tok.text) == most_imp:
            return get_depth_in_syntactic_tree(tok)
    return 0


In [35]:
def get_best_key(key_list, sentence, doc, title):
    scores = [title_occurance((key), title)  + \
              document_occurance((key), doc) +
              depth_in_sentence((key), sentence)
                  for key in key_list]
    return key_list[scores.index(max(scores))]

In [None]:
for i in key_list:
    doc = chapter[top_sentences[i][0]]
    doc = [i.lower() for i in doc]
    s = doc[top_sentences[i][1]]
    print(key_list[i])
    key_list[i] = get_best_key(key_list[i], s, doc, top_sentences[i][0])
key_list

['Chapter', 'square invertible matrices']
['the equation', 'the plane', 'the first description', 'the column space']
['Ax', 'b', 'the column space']
['The previous section', 'definitions', 'constructions']
['The column space', 'only b']


# Distractors selection


In [None]:
from distractors_generation import get_distractors
import spacy
nlp_model = spacy.load(".")

In [None]:
for i in key_list:
    doc = chapter[top_sentences[i][0]]
    doc = [i.lower() for i in doc]
    s = doc[top_sentences[i][1]]
    print(s)
    distractors = get_distractors(doc, key_list[i].lower(), s, nlp_model, 4)
    print(distractors[0], distractors[1][1])
key_list

# Quiz Construction

In [None]:
from quiz_generation import create_quiz   

n = 10

questions = []
for i, (doc_name, sentence_index) in enumerate(top_sentences[:n]):
    s = chapter[doc_name][sentence_index]
    if key_list[i]:
        distractors = get_distractors(chapter[doc_name], key_list[i].lower(), s.lower(), nlp_model, 3)
        questions.append([s, distractors[1][1], distractors[0]])
print("\n")
print(create_quiz(questions, correct_answer=False, save=True))