In [1]:
from pathlib import Path

try:
    from google.colab import drive
    drive.mount("/content/drive")
    ROOT = Path('/content/drive/MyDrive/textbook-topic-analysis')
except ImportError:
    ROOT = Path(".")

In [2]:
%%capture
! pip install -r {str(ROOT / "requirements.txt")}

In [3]:
PARSED_TEXTBOOKS_DIRECTORY = ROOT / "parsed_textbooks"
if not PARSED_TEXTBOOKS_DIRECTORY.exists():
    PARSED_TEXTBOOKS_DIRECTORY.mkdir()


#### CAN COMMENT THIS OUT IF TEXTBOOKS HAVE BEEN PARSED ####
# from encoding import process_files

# file_mapping = {
#     path: PARSED_TEXTBOOKS_DIRECTORY / (path.stem + ".json")
#     for path in Path(ROOT / "textbooks").glob("*.xml")
# }
# process_files(file_mapping)
############################################################

In [4]:
from encoding import convert_json_to_textbook


textbooks = [
    convert_json_to_textbook(path)
    for path in Path(PARSED_TEXTBOOKS_DIRECTORY).glob("*.json")
]

# Integrating TOCs

In [5]:
from copy import copy

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from textbooks import IntegratedTextbook


def initialise_textbooks(textbooks, compute_vector, aggregate_subsection_vectors):
    initialised_textbooks = []
    for textbook in textbooks:
        textbook = copy(textbook)
        textbook.compute_vector = compute_vector
        textbook.aggregate_subsection_vectors = aggregate_subsection_vectors
        textbook.compute_section_vectors()
        initialised_textbooks.append(textbook)
    return initialised_textbooks

## Cosine Similarity

In [6]:
def cosine_similarity_toc_integration(text_extraction_function, similarity_threshold):
    flattened_sections = {
        section: text_extraction_function(section)
        for textbook in textbooks
        for section in textbook.all_subsections
    }

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(flattened_sections.values())
    section_vectors = dict(zip(flattened_sections, tfidf_matrix))

    textbooks_with_section_vectors = initialise_textbooks(
        textbooks,
        compute_vector=section_vectors.get,
        aggregate_subsection_vectors=False,
    )

    base_textbook = textbooks_with_section_vectors[0]
    integrated_textbook = IntegratedTextbook(
        base_textbook, lambda a, b: cosine_similarity(a, b)[0][0], similarity_threshold
    )
    integrated_textbook.integrate_sections(textbooks_with_section_vectors[1:])

    integrated_textbook.print_matches()

### Headers only

In [7]:
cosine_similarity_toc_integration(lambda section: section.header, 0.5)

1: probability
-	Section(section_id='seg_23', header='probability', section_number=(2,), textbook=Textbook(name='2017_Book_IntroductoryStatisticsForBusin'))
-	Section(section_id='seg_23', header='probability', section_number=(2,), textbook=Textbook(name='Walpole_Probability_and_Statistics'))
-	Section(section_id='seg_31', header='uniform probability', section_number=(2, 2, 2), textbook=Textbook(name='2017_Book_IntroductoryStatisticsForBusin'))
-	Section(section_id='seg_13', header='probability', section_number=(2,), textbook=Textbook(name='2012_Book_ModernMathematicalStatisticsWi'))
-	Section(section_id='seg_295', header='probability measure,', section_number=(5, 14, 1), textbook=Textbook(name='2015_StatisticsForScieAndEng'))
-	Section(section_id='seg_23', header='conditional probability', section_number=(2, 5), textbook=Textbook(name='2012_Book_ModernMathematicalStatisticsWi'))
-	Section(section_id='seg_57', header='conditional probability', section_number=(4,), textbook=Textbook(name

### Text content

In [8]:
cosine_similarity_toc_integration(lambda section: section.content_string, 0.5)

1: probability
	1.1: classical and legal probability
	-	Section(section_id='seg_61', header='probability density functions', section_number=(5, 1), textbook=Textbook(name='2005_Book_AModernIntroductionToProbabili'))
	-	Section(section_id='seg_95', header='binomial and multinomial distributions', section_number=(5, 2), textbook=Textbook(name='Walpole_Probability_and_Statistics'))
	-	Section(section_id='seg_3', header='probability and events', section_number=(1, 1), textbook=Textbook(name='2012_Book_AConciseGuideToStatistics'))
	-	Section(section_id='seg_491', header='bayesian concepts', section_number=(18, 1), textbook=Textbook(name='Walpole_Probability_and_Statistics'))
	-	Section(section_id='seg_7', header='pictorial and tabular methods in descriptive statistics', section_number=(1, 3), textbook=Textbook(name='2012_Book_ModernMathematicalStatisticsWi'))
	-	Section(section_id='seg_53', header='methods of data collection', section_number=(3, 1), textbook=Textbook(name='2017_Book_Intuiti

In [9]:
cosine_similarity_toc_integration(lambda section: section.content_string, 0.7)

1: probability
	1.1: classical and legal probability
	1.2: bayes’s theorem
	1.3: screening tests
	1.4: debate over bayesian analysis
2: descriptive tools
	2.1: measures of central location
		2.1.1: mean
		2.1.2: median
		2.1.3: mode
		2.1.4: variants of the mean
	2.2: measures of dispersion
		2.2.1: variance and standard deviation
		2.2.2: variance of sample sums and the sample mean
	2.3: correlation
	2.4: measuring the disparity between two proportions
3: compound events
	3.1: the addition rule
	3.2: the product rule
4: significance
	4.1: the concept of significance
	4.2: rejecting the null hypothesis
	4.3: the two-or-three standard error rule
	4.4: statistical and legal significance
	4.5: factors that determine significance
	4.6: nonsignificant data
5: random variables and their distributions
	5.1: expectation, variance, and correlation
	5.2: the binomial distribution
	5.3: the hypergeometric distribution
	5.4: the normal distribution
	5.5: the poisson distribution
	5.6: student’s t-

### Annotations + Content

In [11]:
flattened_sections_headers = {
    section: section.header
    for textbook in textbooks
    for section in textbook.all_subsections
}
flattened_sections_annotations = {
    section: section.annotations
    for textbook in textbooks
    for section in textbook.all_subsections
}

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(
    list(flattened_sections_headers.values())
    + list(flattened_sections_annotations.values())
)

matrix_half_length = tfidf_matrix.shape[0] // 2
section_vectors_headers = dict(
    zip(flattened_sections_headers, tfidf_matrix[:matrix_half_length])
)
section_vectors_annotations = dict(
    zip(flattened_sections_annotations, tfidf_matrix[matrix_half_length:])
)


def compute_weighted_average(key, dict1, dict2, weights):
    return np.average([dict1[key], dict2[key]], weights=weights)


compute_vectors_fn = lambda section: compute_weighted_average(
    section, section_vectors_headers, section_vectors_annotations, [0.5, 0.5]
)

initialised_textbooks = initialise_textbooks(
    textbooks,
    compute_vector=compute_vectors_fn,
    aggregate_subsection_vectors=False,
)

base_textbook = initialised_textbooks[0]
integrated_textbook = IntegratedTextbook(
    base_textbook, lambda a, b: cosine_similarity(a, b)[0][0], 0.7
)
integrated_textbook.integrate_sections(initialised_textbooks[1:])
integrated_textbook.print_matches()

AttributeError: 'NoneType' object has no attribute 'lower'