In [None]:
# Packages to install
!pip install textract
!pip install sentencepiece
!pip install transformers
!pip3 install tensorflow==1.15
!pip3 install bert-serving-server
!pip3 install bert-serving-client
!pip install numpy==1.19.5

In [2]:
# Imports
import pandas as pd
import numpy as np
import textract
import re, json, os
from os.path import join
from bs4 import BeautifulSoup
from gensim.summarization import keywords
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoModelWithLMHead, AutoTokenizer
import networkx as nx
import matplotlib.pyplot as plt
import nltk
import spacy
from tqdm import tqdm
import requests
import json
from nltk.corpus import wordnet as wn

In [None]:
# Downloads
nltk.download('punkt')
nlp = spacy.load('en_core_web_sm')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

!unzip -u "/content/drive/MyDrive/uncased_L-12_H-768_A-12.zip"
!unzip -u "/content/drive/MyDrive/Concept-Acquisition-Pipeline.zip"

Configurables

In [4]:
SEMESTER = "f19"
data_path = "/content/drive/MyDrive/oli/"

Data Extraction from OLI Functions

In [5]:
def extract_content_from_pdf(filename):
    # text variable which contains all the text derived from our PDF file
    full_text = textract.process(filename, method='pdfminer', language='eng').decode('utf-8')
    return full_text.encode('ascii','ignore').lower().decode('utf-8')

def extract_keyword(full_text):
    keyword_summaries = keywords(text = full_text, split = "\n", scores = True)
    data = pd.DataFrame(keyword_summaries, columns = ["keyword", "score"])
    return data.sort_values("score", ascending = False)

def get_module_unit_from_org(page_id):
    try:
        resource_ref = oli_org_soup.find('resourceref', {'idref': page_id})
        curr_module = resource_ref.find_parent('module').find('title').get_text()
        curr_unit = resource_ref.find_parent('unit').find('title').get_text()
        return curr_module, curr_unit
    except:
        return None, None

def is_header(p):
    # a header paragraph should have the form <p><em>...</em></p>, with no other inner tag
    n_contents = len([c for c in p.contents if not str(c.string).isspace()])
    return p.find("em") is not None and n_contents == 1

def get_file_content(filename):
    with open(data_path+f'{SEMESTER}/content/x-oli-workbook_page/' + filename ) as file:
        soup = BeautifulSoup(file.read(), 'lxml')
    page_id = soup.find('workbook_page')['id']
    curr_module, curr_unit = get_module_unit_from_org(page_id)
    title = soup.find("title").get_text().strip()   
    
    # extract the sub-headers <p><em>text</em></p> and remove them from the text content
    sub_headers = []
    for p in soup.find_all("p"):
        if is_header(p):
            sub_headers.append(p.find("em").get_text().strip())
            p.extract()
    
    all_text = "\n".join(p.get_text().strip() for p in soup.find_all("p"))
    all_text = re.sub(r"\n+", r"\n", all_text.strip())
    return {
        "Unit" : curr_unit, "Module" : curr_module, "Title" : title,
        "Text": all_text, "Subheaders" : ",".join(sub_headers)
    }

In [6]:
oli_org = open(data_path + f'{SEMESTER}/organizations/default/organization.xml', "r").read()
oli_org_soup = BeautifulSoup(oli_org, "lxml")
df_oli = pd.DataFrame([
    get_file_content(filename)
    for filename in os.listdir(data_path + f"{SEMESTER}/content/x-oli-workbook_page")
    if filename.endswith(".xml")
]).dropna()
df_oli.to_csv("oli_content.csv", index = False)
df_oli.reset_index(inplace=True)

Clean Dataframe

In [None]:
def clean_text_for_answer_extraction(text):
  print("")
  sentences = nltk.sent_tokenize(text)
  selected = list()
  for s in sentences:
    if s[-1] != '.':
      continue
    parts = s.split(" ")
    if parts[0] in ['We']:
      continue
    selected.append(s)
  
  return text

df_oli['cleaned_text'] = df_oli.apply(lambda row: clean_text_for_answer_extraction(row['Text']), axis=1)

Question Generation - Answer Generation from Headings

In [8]:
def get_question(answer, context, max_length=64):
  input_text = "answer: %s  context: %s </s>" % (answer, context)
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'],
               max_length=max_length)

  return tokenizer.decode(output[0])

def clean_and_extract_questions(text):
  return text.replace("<pad> question: ", "").replace("</s>", "")

def get_question_headings():
  df_oli['question_unit'] = df_oli.apply(lambda row: clean_and_extract_questions(get_question(answer = row['Unit'], context = row['Text'])), axis=1)
  df_oli['question_module'] = df_oli.apply(lambda row: clean_and_extract_questions(get_question(answer = row['Module'], context = row['Text'])), axis=1)
  df_oli['question_title'] = df_oli.apply(lambda row: clean_and_extract_questions(get_question(answer = row['Title'], context = row['Text'])), axis=1)

  df_oli.to_csv(data_path + "oli_heading_answers.csv", index = False)

get_question_headings()

def save_questions():
  unit = df_oli['question_unit'].to_csv(data_path + 'unit_questions.csv')
  module = df_oli['question_module'].to_csv(data_path + 'module_questions.csv')
  title = df_oli['question_title'].to_csv(data_path + 'title_questions.csv')

save_questions()

Token indices sequence length is longer than the specified maximum sequence length for this model (777 > 512). Running this sequence through the model will result in indexing errors


Concept Hierarchy and Extraction

In [None]:
def get_content_text(line_text):
  with open(data_path + 'content.txt', 'a') as filehandle:
    filehandle.write(line_text)

df_oli.apply(lambda row: get_content_text(row['cleaned_text']), axis=1)

In [None]:
# Start this as another process (background execution on terminal)
#!bert-serving-start -model_dir uncased_L-12_H-768_A-12 -num_worker=1

In [None]:
# Move content.txt into Concept-Acquisition-Pipeline/input_data/context
# Update result_path (line 17 in Concept-Acquisition-Pipeline/config.py) to the name of results of file

In [12]:
%cd content/Concept-Acquisition-Pipeline-master

/content/content/Concept-Acquisition-Pipeline-master


In [None]:
!python confidence_propagation.py -l en -task extract

In [None]:
!zip -r /content/drive/MyDrive/Concept-Acquisition-Pipeline.zip /content/content/Concept-Acquisition-Pipeline-master

Concept Pruning

In [14]:
def is_article(word):
  if word in ['a', 'the', 'an']:
    return True
  else:
    return False

def is_preposition(word):
  if word in ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'anti', 'around', 'as', 'at', 'before', 'below', 'beneath', 'beside', 'besides', 'between', 'beyond', 'but', 'by', 'concerning', 'considering', 'despite', 'down', 'during', 'except', 'excepting', 'excluding', 'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'minus', 'near', 'of', 'off', 'on', 'onto', 'opposite', 'outside', 'over', 'past', 'per', 'plus', 'regarding', 'round', 'save', 'since', 'than', 'through', 'to', 'toward', 'towards', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'with', 'within', 'without']:
    return True
  else:
    return False

def is_verb(word): #Which is better, wordnet or pos tagging
  if " " not in word:
    syn = wn.synsets(word)
    if len(syn) > 0:
      category = syn[0].pos()
      if str(category) == "v":
        return True

  return False

def is_single_letter(word):
  if len(word) == 1:
    return True
  return False

def is_valid_concept(concept):
  is_valid = True
  if is_article(concept):
    is_valid = False
  
  if concept.isdecimal():
    is_valid = False
  
  if is_preposition(concept):
    is_valid = False
  
  if is_verb(concept):
    is_valid = False
  
  if is_single_letter(concept):
    is_valid = False

  return is_valid

In [None]:
# Can be pruned manually, or programatically, code for which is given below:

valid_concepts = list()
concept_file_path = "/content/content/Concept-Acquisition-Pipeline-master/processed_data/propagation_results/result_oli.json"
#concept_file_path = "/content/result_oli.json"
with open(concept_file_path, "r") as filehandle:
  concepts = filehandle.readlines()
  for c in concepts:
    concept = c.split(",")[0].split(":")[1].replace('"', '').strip()
    if is_valid_concept(concept):
      valid_concepts.append(concept)
  
for c in valid_concepts:
  print(c)

Ranking

In [18]:
def get_rank(question, standardise=False):
  rank = 0
  tokens = question.split(" ")
  for t in tokens:
    if t in valid_concepts:
      rank = rank - 1
  
  if standardise == True:
    rank = rank/len(tokens)
  return rank

def adjust_rank(df, std=True):
  min_rank = abs(df['rank'].min())
  df['rank'] = df.apply(lambda row: row['rank'] + min_rank, axis=1)

  min_rank_std = abs(df['std_rank'].min())
  df['std_rank'] = df.apply(lambda row: row['std_rank'] + min_rank_std, axis=1)

  return df

module_questions = pd.read_csv("/content/drive/MyDrive/oli/module_questions.csv")
module_questions.drop(columns=['Unnamed: 0'], inplace=True)
title_questions = pd.read_csv("/content/drive/MyDrive/oli/title_questions.csv")
title_questions.drop(columns=['Unnamed: 0'], inplace=True)
unit_questions = pd.read_csv("/content/drive/MyDrive/oli/unit_questions.csv")
unit_questions.drop(columns=['Unnamed: 0'], inplace=True)

# Calculate rank
module_questions['rank'] = module_questions.apply(lambda row: get_rank(row['question_module']), axis=1)
title_questions['rank'] = title_questions.apply(lambda row: get_rank(row['question_title']), axis=1)
unit_questions['rank'] = unit_questions.apply(lambda row: get_rank(row['question_unit']), axis=1)

# Calculate standardized rank
module_questions['std_rank'] = module_questions.apply(lambda row: get_rank(row['question_module'], standardise=True), axis=1)
title_questions['std_rank'] = title_questions.apply(lambda row: get_rank(row['question_title'], standardise=True), axis=1)
unit_questions['std_rank'] = unit_questions.apply(lambda row: get_rank(row['question_unit'], standardise=True), axis=1)

# Scale rank
module_questions = adjust_rank(module_questions, std=True)
title_questions = adjust_rank(title_questions, std=True)
unit_questions = adjust_rank(unit_questions, std=True)

# Write output to csv files
module_questions.to_csv('ranked_module_qs.csv')
title_questions.to_csv('ranked_title_qs.csv')
unit_questions.to_csv('ranked_unit_qs.csv')

In [31]:
# Combine all questions together

module_questions = pd.read_csv("/content/drive/MyDrive/oli/module_questions.csv")
module_questions.drop(columns=['Unnamed: 0'], inplace=True)
module_questions.rename(columns={"question_module": "question"}, inplace = True)
title_questions = pd.read_csv("/content/drive/MyDrive/oli/title_questions.csv")
title_questions.drop(columns=['Unnamed: 0'], inplace=True)
title_questions.rename(columns={"question_title": "question"}, inplace = True)
unit_questions = pd.read_csv("/content/drive/MyDrive/oli/unit_questions.csv")
unit_questions.rename(columns={"question_unit": "question"}, inplace = True)
unit_questions.drop(columns=['Unnamed: 0'], inplace=True)

combined_df = module_questions.append(title_questions, ignore_index=True)
combined_df = combined_df.append(unit_questions)

combined_df.to_csv('all_questions.csv')

In [32]:
combined_df['rank'] = combined_df.apply(lambda row: get_rank(row['question']), axis=1)
combined_df['std_rank'] = combined_df.apply(lambda row: get_rank(row['question'], standardise=True), axis=1)
combined_df = adjust_rank(combined_df, std=True)
combined_df.to_csv('all_questions.csv')