# OLI Content Extraction and Analysis

In [1]:
import pandas as pd
import numpy as np
import textract
import re, json, os
from os.path import join
from bs4 import BeautifulSoup
from gensim.summarization import keywords
import warnings
warnings.filterwarnings("ignore")

## 1. Extract content from PDF file

In [2]:
def extract_content_from_pdf(filename):
    """Return the text of the PDF at `filename`, lower-cased and ASCII-only.

    The PDF is read through textract's pdfminer backend; every character
    that cannot be represented in ASCII is dropped.
    """
    raw_bytes = textract.process(filename, method='pdfminer', language='eng')
    decoded = raw_bytes.decode('utf-8')
    # round-trip through ASCII with errors ignored to discard non-ASCII characters
    ascii_bytes = decoded.encode('ascii', 'ignore')
    return ascii_bytes.lower().decode('utf-8')

def extract_keyword(full_text):
    """Return the keywords of `full_text` as a DataFrame, sorted by score.

    The result has the columns "keyword" and "score", with the highest score first.
    """
    scored_keywords = keywords(text=full_text, split="\n", scores=True)
    keyword_table = pd.DataFrame(scored_keywords, columns=["keyword", "score"])
    keyword_table = keyword_table.sort_values(by="score", ascending=False)
    return keyword_table

## 2. Extract content from XML files 

In [3]:
# Name of the directory (relative to the notebook) that holds the course files
# for the semester being processed.
SEMESTER = "f19"
# Read the organization file inside a context manager so the file handle is
# closed immediately instead of being left open until garbage collection.
with open(f'{SEMESTER}/organizations/default/organization.xml', "r") as org_file:
    oli_org = org_file.read()
# Parsed organization tree; get_module_unit_from_org searches it for each page id.
oli_org_soup = BeautifulSoup(oli_org, "lxml")

In [4]:
def get_module_unit_from_org(page_id):
    """Look up the module and unit titles that contain a workbook page.

    Args:
        page_id: value of the ``idref`` attribute of the page's ``resourceref``
            element in the parsed organization file (``oli_org_soup``).

    Returns:
        ``(module_title, unit_title)`` when the page is referenced inside both
        a ``module`` and a ``unit`` element that each have a ``title``;
        ``(None, None)`` otherwise.
    """
    try:
        resource_ref = oli_org_soup.find('resourceref', {'idref': page_id})
        curr_module = resource_ref.find_parent('module').find('title').get_text()
        curr_unit = resource_ref.find_parent('unit').find('title').get_text()
        return curr_module, curr_unit
    except AttributeError:
        # A lookup above returned None (page not referenced, or no enclosing
        # module/unit/title), so the chained call failed. Only that case is
        # treated as "not found"; any other error is allowed to propagate.
        return None, None

def is_header(p):
    """Tell whether paragraph `p` is a sub-header of the form <p><em>...</em></p>.

    A header paragraph contains an <em> tag and exactly one child that is not
    whitespace-only text.
    """
    meaningful_children = 0
    for child in p.contents:
        # children whose string is pure whitespace do not count
        if str(child.string).isspace():
            continue
        meaningful_children += 1
    has_em = p.find("em") is not None
    return has_em and meaningful_children == 1

def get_file_content(filename):
    """Parse one workbook page and return its metadata and text as a dict.

    `filename` is the name of a file inside the semester's
    ``content/x-oli-workbook_page`` directory. The returned dict has the keys
    "Unit", "Module", "Title", "Text" and "Subheaders"; unit and module are
    None when the page cannot be located in the organization file.
    """
    page_path = f'{SEMESTER}/content/x-oli-workbook_page/' + filename
    with open(page_path) as file:
        markup = file.read()
    soup = BeautifulSoup(markup, 'lxml')

    page_id = soup.find('workbook_page')['id']
    curr_module, curr_unit = get_module_unit_from_org(page_id)
    title = soup.find("title").get_text().strip()

    # Collect the sub-headers (<p><em>text</em></p>) and detach them from the
    # tree so that they do not show up in the body text below.
    sub_headers = []
    for paragraph in soup.find_all("p"):
        if not is_header(paragraph):
            continue
        sub_headers.append(paragraph.find("em").get_text().strip())
        paragraph.extract()

    # One line per remaining paragraph, with runs of newlines collapsed.
    remaining = [paragraph.get_text().strip() for paragraph in soup.find_all("p")]
    all_text = re.sub(r"\n+", r"\n", "\n".join(remaining).strip())

    return {
        "Unit": curr_unit,
        "Module": curr_module,
        "Title": title,
        "Text": all_text,
        "Subheaders": ",".join(sub_headers),
    }

In [5]:
# Parse every workbook page of the semester into one record per page.
page_dir = f"{SEMESTER}/content/x-oli-workbook_page"
page_records = [
    get_file_content(page_file)
    for page_file in os.listdir(page_dir)
    if page_file.endswith(".xml")
]
# dropna() removes the pages for which a field is missing, e.g. pages whose
# unit/module could not be found in the organization file.
df_oli = pd.DataFrame(page_records).dropna()
df_oli.to_csv("oli_content.csv", index=False)
df_oli.head()

Unnamed: 0,Unit,Module,Title,Text,Subheaders
0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management"
1,Data Gathering and Wrangling,Data Collection Process,Summary and Quiz 2,Each data science project is unique and will r...,
5,Analytic Algorithms and Model Building,Data Science Patterns,Summary and Quiz 6,Prediction involves using a model to predict o...,
7,Analytic Requirements Gathering,Requirements Gathering Techniques,Successful Requirements Gathering,The requirements gathering process is not line...,Validating Requirements
8,Exploratory Data Analysis,Feature Engineering,Summary and Quiz 5,"In this module, we explored a technique used t...",


## 3. Topic modeling with LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the ten most frequent vocabulary terms.

    Args:
        count_data: count matrix returned by ``count_vectorizer``; it is summed
            over axis 0 to obtain one total per vocabulary term.
        count_vectorizer: the fitted vectorizer whose feature names label the
            columns of ``count_data``.
    """
    # Newer scikit-learn releases expose get_feature_names_out() and no longer
    # provide get_feature_names(); fall back to the old name when needed.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = list(get_names())
    # get the count of each word across the entire corpus; np.asarray works
    # whether the sum comes back as an np.matrix or as a plain ndarray
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    top_words = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[:10]
    words, counts = zip(*top_words)
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # x/y are passed by keyword: recent seaborn versions reject positional data arguments
    sns.barplot(x=x_pos, y=list(counts), palette='husl')
    plt.xticks(x_pos, list(words), rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Preprocess the text to prepare for LDA: strip basic punctuation and lower-case it.
# The pattern is a raw string so that `\.` reaches the regex engine as written
# instead of being an invalid string escape.
df_oli["Cleaned Text"] = (
    df_oli["Text"].str.replace(r'[,\.!?]', '', regex=True).str.lower()
)
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the cleaned page text
count_data = count_vectorizer.fit_transform(df_oli['Cleaned Text'])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

## 4. Question generation with a pretrained Hugging Face model


In [9]:
from pipelines import pipeline
# Build the end-to-end question-generation ("e2e-qg") pipeline once; it is
# reused for every paragraph further down.
# NOTE(review): `pipelines` looks like a project-local module rather than an
# installed package — confirm it is importable from the notebook's directory.
question_generator = pipeline("e2e-qg")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1348.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=791656.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1786.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=124.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=242013376.0), HTML(value='')))




Some weights of the model checkpoint at valhalla/t5-small-e2e-qg were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def generate_question_by_paragraph(df):
    """Generate questions for every paragraph of every page in `df`.

    Args:
        df: DataFrame with a "Text" column whose paragraphs are separated by
            newlines. It is left unmodified.

    Returns:
        A new DataFrame with one row per (paragraph, generated question) pair.
        It carries all columns of `df` plus "Paragraph" and
        "Generated Question"; the original index values are repeated for the
        rows that come from the same page.
    """
    # assign() works on a copy, so the caller's frame does not silently gain a
    # "Paragraph" column as a side effect of calling this function.
    df_para = df.assign(Paragraph=df["Text"].str.split("\n")).explode("Paragraph")
    # presumably question_generator returns a list of questions per paragraph,
    # which the final explode turns into one row each — TODO confirm
    df_para["Generated Question"] = df_para["Paragraph"].apply(question_generator)
    return df_para.explode("Generated Question")

In [11]:
# Run the question generator over every paragraph of every page and time it.
# This is the expensive step of the notebook: the recorded run took about
# 50 minutes of wall time.
%time df_oli_with_questions = generate_question_by_paragraph(df_oli)

CPU times: user 9h 24min 51s, sys: 14min 57s, total: 9h 39min 49s
Wall time: 50min 42s


In [14]:
# Save the generated questions. The full page text is left out of the file;
# the per-paragraph text stays in the "Paragraph" column.
(
    df_oli_with_questions
    .drop(columns="Text")
    .to_csv("oli_content_with_questions.csv", index=False)
)

In [19]:
# Display the result with a fresh 0..n-1 row numbering; the repeated page
# index left over from explode() becomes a regular "index" column.
# The result is only displayed, df_oli_with_questions itself is unchanged.
df_oli_with_questions.reset_index()

Unnamed: 0,index,Unit,Module,Title,Text,Subheaders,Paragraph,Generated Question
0,0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management",A data scientist’s role involves utilizing com...,What skills does a data scientist use to uncov...
1,0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management",A data scientist’s role involves utilizing com...,What is a crucial phase of the data science pr...
2,0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management",A data scientist’s role involves utilizing com...,What is the next phase of the data science pro...
3,0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management",A data scientist’s role involves utilizing com...,How much of your time will you spend understan...
4,0,Data Gathering and Wrangling,Data Gathering,Data Management,A data scientist’s role involves utilizing com...,"Data Gathering Overview,Data Management",A data scientist’s role involves utilizing com...,What does the quality of your data have a dire...
...,...,...,...,...,...,...,...,...
43995,113,Problem Identification and Solution Vision,Problem Identification,"AI Philosophy: A Process, not a Product","AI Philosophy: A Process, not a Product\nAI Ph...",,AI Enabled. The organization has deployed the ...,What does AI Ready mean?
43996,113,Problem Identification and Solution Vision,Problem Identification,"AI Philosophy: A Process, not a Product","AI Philosophy: A Process, not a Product\nAI Ph...",,AI Enabled. The organization has deployed the ...,How can AI help end users?
43997,113,Problem Identification and Solution Vision,Problem Identification,"AI Philosophy: A Process, not a Product","AI Philosophy: A Process, not a Product\nAI Ph...",,AI Enabled. The organization has deployed the ...,What is an organization able to measure in a r...
43998,113,Problem Identification and Solution Vision,Problem Identification,"AI Philosophy: A Process, not a Product","AI Philosophy: A Process, not a Product\nAI Ph...",,AI Enabled. The organization has deployed the ...,What can an organization claim that it has imp...


In [20]:
# Inspect a single example: the last two columns (the paragraph and its
# generated question) of the row at position 43996.
# NOTE(review): the row position is hard-coded and depends on the row order of this particular run.
df_oli_with_questions.reset_index().iloc[43996, [-2, -1]].tolist()

['AI Enabled. The organization has deployed the new software in a relevant context, and is able to directly measure the impact (e.g., increased sales). At this point the organization can claim that it has implemented AI, and is gathering feedback to show that it really works with real end users. An organization can introduce the data science decision into a real world setting and measure if this implementation works.',
 'How can AI help end users?']