In [162]:
!python -m spacy download en_core_web_sm
!python -m spacy download en

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
symbolic link created for C:\Users\asus\AppData\Roaming\Python\Python37\site-packages\spacy\data\en <<===>> C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[+] Linking successful
C:\ProgramData\Anaconda3\lib\site-packages\en_core_web_sm -->
C:\Users\asus\AppData\Roaming\Python\Python37\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


In [137]:
import spacy
import re
import pandas as pd
import numpy as np
import requests

import networkx as nx
import matplotlib.pyplot as plt

from common import *

from collections import OrderedDict

nlp = spacy.load('en_core_web_sm')
pd.set_option('display.max_colwidth', 200)
%matplotlib inline


## Sentence preparation

In [138]:
book = read_file('data/Strang-Linear Algebra.txt')
chapter = OrderedDict(get_one_chapter_strang(3, book, subsections=True, 
                                      split=True, sentence_spliter=lambda ss: nlp(ss).sents))


In [139]:
chapter.keys()


odict_keys(['Orthogonal Vectors and Subspaces', 'Cosines and Projections onto Lines ', 'Projections and Least Squares ', 'Orthogonal Bases and Gram-Schmidt ', 'The Fast Fourier Transform '])

# Sentence selection

## Features for gap-fill question-generatable and informative sentences selection

In [140]:
### FEATURES

def get_tags(sentence):
    return [token.tag_ for token in nlp(sentence)]

def get_noun_adj_tokens(words):
    return [token.lemma_ for token in nlp(words) 
            if token.pos_ == 'ADJ' or token.pos_ == 'NOUN']

    
# Informative 
def is_first_sentence(f, c):
    return f == c


def has_superlatives(curr):
    pos_tags = get_tags(curr)
    return 'JJR' in pos_tags or 'JJS' in pos_tags


def has_abbreviation(curr):
    is_abbr = lambda word: word.upper() == word and len(word) > 1
    return any(is_abbr(x) for x in curr.split())


def has_correct_ending(curr):
    return curr[-1] in ['?', '.', '!']

# Generative


def relative_number_of_words(curr):
    abs_n = abs(len(curr.split()) - 10)
    return -abs_n if abs_n > 5 else abs_n


def relative_index(i, doc_length):
    abs_i = abs(i - doc_length/2)
    return abs_i if abs_i > doc_length/4 else -abs_i


def common_tokens_count(curr, title):
    curr_tokens = get_noun_adj_tokens(curr)
    title_tokens = get_noun_adj_tokens(title.lower())
    
    return sum([tok in curr_tokens for tok in title_tokens])


def begins_with_discourse_connective(curr):
    discource_connective = ['because', 'since', 'when', 'thus', 
                            'however', 'although', 'for example', 
                            'and', 'for instance', 'how', 'in other words',
                            'therefore', 'up to this point']
    curr = curr.lower()
    return any(curr.startswith(x) for x in discource_connective)


def nouns_number(curr):
    return sum(x.pos_ == 'NOUN' for x in nlp(curr))


def pronouns_number(curr):
    return sum(x.pos_ == 'PRON' for x in nlp(curr))



## Features calculation

In [141]:
feature_weights = {
    +4:lambda s, indx, title, first_s, doc_length:  is_first_sentence(s, first_s),
    +1:lambda s, indx, title, first_s, doc_length:  has_superlatives(s),
    +1:lambda s, indx, title, first_s, doc_length:  has_abbreviation(s),
    +.5:lambda s, indx, title, first_s, doc_length:  relative_number_of_words(s),
    +2:lambda s, indx, title, first_s, doc_length:  common_tokens_count(s, title),
    -2:lambda s, indx, title, first_s, doc_length:  begins_with_discourse_connective(s),
    +1:lambda s, indx, title, first_s, doc_length:  nouns_number(s),
    -2.5:lambda s, indx, title, first_s, doc_length:  pronouns_number(s),
    +0.01:lambda s, indx, title, first_s, doc_length:  relative_index(indx, doc_length),
    +2:lambda s, indx, title, first_s, doc_length:  has_correct_ending(s)
    
}


def get_sentence_score(sentence, index, title, first_sentence, doc_length, weights):
    return sum(key * weights[key](sentence, index, title, first_sentence, doc_length) 
               for key in weights)

In [142]:
scores = np.zeros(sum(len(chapter[x]) for x in chapter))
global_indx = 0
for key in chapter:
    document = chapter[key]
    title = key
    doc_length = len(document)
    first_sentence = document[0]
    
    for i, sentence in enumerate(document):
        scores[global_indx] = get_sentence_score(
                              sentence, i, title, first_sentence, 
                              doc_length, feature_weights)
        global_indx += 1

In [143]:
docs = OrderedDict((key, len(chapter[key])) for key in chapter)

def get_sentence_index_in_document(doc_sent_indx, docs):
    indexes =list(docs.values())
    i = -1
    prev = 0
    while doc_sent_indx >= 0:
        i += 1
        prev = doc_sent_indx
        doc_sent_indx -= indexes[i]
    doc_name = list(docs.keys())[i]
    return doc_name, prev

## Getting sentences with best scores

In [144]:
ordered_scores = np.flip(np.argsort(scores))
top_scores = ordered_scores[:5]

top_sentences = []
for s in top_scores:
    doc_name, index = get_sentence_index_in_document(s, docs)
    top_sentences.append((doc_name, index))
    
    print(doc_name)
    print(chapter[doc_name][index])
    print(scores[s])
    print()

Orthogonal Bases and Gram-Schmidt 
In an orthogonal basis, every vector is perpendicular to every other vector.
16.225

Orthogonal Bases and Gram-Schmidt 
The Gram-Schmidt process and its interpretation as a new factorization A = QR.
14.085

Orthogonal Vectors and Subspaces
A basis is a set of independent vectors that span a space.
13.71

Orthogonal Bases and Gram-Schmidt 
¤ reflects every point (x;y) into (y;x), its mirror image across the 45° line.
13.515

Orthogonal Bases and Gram-Schmidt 
improvement is easy: Divide each vector by its length, to make it a unit vector.
13.195

The Fast Fourier Transform 
The Fourier series is linear algebra in infinite dimensions.
13.015

Orthogonal Bases and Gram-Schmidt 
All the terms in the series are projections onto a sine or cosine.
12.865

Orthogonal Bases and Gram-Schmidt 
Its component p in this direction is exactly b1 sin x.
12.825

Orthogonal Bases and Gram-Schmidt 
The solution of Qx = b, either n by n or rectangular (least squares).
12.

# Key selection

In [191]:
from collections import defaultdict


key_list = defaultdict(lambda: list())

for i, (doc_name, sent_i) in enumerate(top_sentences):
    sent = chapter[doc_name][sent_i]
    for chunk in nlp(sent).noun_chunks:
        key_list[i].append(chunk.text)

key_list

defaultdict(<function __main__.<lambda>()>,
            {0: ['an orthogonal basis', 'every vector', 'every other vector'],
             1: ['The Gram-Schmidt process',
              'its interpretation',
              'a new factorization',
              'A = QR'],
             2: ['A basis', 'a set', 'independent vectors', 'a space'],
             3: ['¤',
              'every point',
              'x;y',
              '(y;x',
              'its mirror image',
              'the 45° line'],
             4: ['improvement', 'each vector', 'its length', 'it'],
             5: ['The Fourier series',
              'linear algebra',
              'infinite dimensions'],
             6: ['All the terms',
              'the series',
              'projections',
              'a sine',
              'cosine'],
             7: ['Its component p', 'this direction', 'exactly b1 sin x.'],
             8: ['The solution', 'Qx = b', 'n'],
             9: ['the right', 'the permutation matrix', 'c', 

In [192]:
def get_most_important_word(chunk):
    importance_order = ['ADJ','NOUN', 'NUM']
    for pos in importance_order:
        for i in nlp(chunk):
            if str(i.pos_) == pos:
                return i.text
        

In [193]:
[x.pos_ for x in nlp('Even numbers such as 2 and 4')]

['ADV', 'NOUN', 'ADJ', 'SCONJ', 'NUM', 'CCONJ', 'NUM']

In [194]:
get_most_important_word('Even numbers such as 2 and 4')

'such'

### Features for key selection

In [195]:
def title_occurance(key, title):
    return common_tokens_count(key, title)


def document_occurance(key, doc):
    total = 0
    for s in doc:
        total += common_tokens_count(key, s)
    return total


def get_depth_in_syntactic_tree(token, depth=0):
    d = [get_depth_in_syntactic_tree(child, depth+1) for child in token.children]
    d.append(0)
    return max(d)
    

def depth_in_sentence(key, s):
    most_imp = get_most_important_word(key)
    for tok in nlp(s):
        if str(tok.text) == most_imp:
            return get_depth_in_syntactic_tree(tok)
    return 0


In [196]:
def get_best_key(key_list, sentence, doc, title):
    scores = [title_occurance((key), title)  + \
              document_occurance((key), doc) +
              depth_in_sentence((key), sentence)
                  for key in key_list]
    return key_list[scores.index(max(scores))]

In [220]:
for i in key_list:
    doc = chapter[top_sentences[i][0]]
    doc = [i.lower() for i in doc]
    s = doc[top_sentences[i][1]]
    print(key_list[i])
#     key_list[i] = get_best_key(key_list[i], s, doc, top_sentences[i][0])
#     print(key_list[i])
    for j in key_list[i]:
        dist = distractors(doc, j.lower(), s)
    print("Proposed distractor -", dist)

key_list

['an orthogonal basis', 'every vector', 'every other vector']


KeyError: 'r.'

In [None]:
for i, (doc_name, sentence_index) in enumerate(top_sentences):
    s = chapter[doc_name][sentence_index]
    create_gap_filled_question(s, key_list[i])

# Distractors selection

In [219]:
 ## HELOOOOOOOOOOO
##   top_sentences = [ (name of document, index in document),...]  see above s = chapter[top_sentences[i][0]][top_sentences[i][1]]
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
##get sentence = chapter[name of document][index]
from sklearn.feature_extraction.text import TfidfVectorizer 
def distractors(chapter_sentences, key, sentence):
  candidates = []
  cand_sents = []
  nlp2 = spacy.load('.')
  key_nlp = nlp2(key)

  label = key_nlp.ents

  tfidf_vectorizer=TfidfVectorizer()
  tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(chapter_sentences)
  tag = tfidf_vectorizer.get_feature_names()
  n = tfidf_vectorizer_vectors.sum(axis=0).A1
  candidates_scores = []
  result = dict(zip(tag,n))

  key_score = sum([result[key.split()[i]] for i in range(len(key.split()))])/len(key.split())
#   print(key_score)
  max_score = [- float("inf"),""]
  key_tags = []

  sent = sentence
  context = sent[:sent.index(key)] + "." +sent[sent.index(key)+len(key):]
#   print(context)
  context = context.split(".")[0].split()[-2:] + context.split(".")[1].split()[:2]
#   print(context)
  for word in context:
      tag = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
      key_tags.append(tag)

  if label:
    key_label = label[0].label_
  else:
    key_label = 'obj'
  for sentence in chapter_sentences:
    doc = nlp2(sentence)
    for ent in doc.ents:
      if ent.label_ == key_label:
          candidates.append(ent.text)
          cand_sents.append(sentence)
#           print(ent.label_, ent.text)

  for cand, sent in zip(candidates, cand_sents):
    if cand not in key and key not in cand:
      sent_simil = sum([2 for i in sent if i in sentence])/(len(sentence.split())+len(sent.split())) #sentence_similarity()
      cand_tags = 0
      context = sent[:sent.index(cand)] + "." +sent[sent.index(cand)+len(cand):]
#       print(context)
      context = context.split(".")[0].split()[-2:] + context.split(".")[1].split()[:2]
#       print(context)
      for word in [0,1,2,3]:
        if word < len(key_tags) and word < len(context):
            tag = nltk.pos_tag(nltk.word_tokenize(context[word]))
#             print(tag)
            tag = tag[0][1]
            cand_tags -= int(tag != key_tags[word])                                                     #context_similarity()

      diff_score = (sum([result[cand.split()[i]] for i in range(len(cand.split()))])/len(cand.split()) - key_score + 1)/2   #importance_difference()
      score = [sent_simil + cand_tags/len(key_tags) - diff_score, cand]
      max_score = max([max_score, score],key=lambda x: x[0])
  return max_score[1]
    

In [211]:
distractors(['this is a symmetric matrix and also a plane is here',
             'this can be an invertible matrix',
             'use matrix multiplication for this problem'], 
            'matrix', 'this is a symmetric matrix and also a plane is here')

'plane'