In [119]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

## Loading data from various files

In [187]:
book_content_file = "required_data/physics_book_content.csv"
labeled_pairs_file = "required_data/physics_labeled_pairs.csv"
wikipedia_data_file = "required_data/physics_correct_wikipedia_data.csv"
concepts_file = "required_data/physics_concepts_ambiguity.csv"

def read_concepts_file():
    df = pd.read_csv(concepts_file, encoding = "utf-8")
    all_data = {}
    for i in range(df.shape[0]):
        all_data[i] = {
            "concept": df[["concept"]].iloc[i].values[0],
            "key_terms": df[["key_terms"]].iloc[i].values[0]
        }
    return all_data


def read_book_data():
    df = pd.read_csv(book_content_file, encoding = "utf-8")
    all_data = {}
    for i in range(df.shape[0]):
        if df[["content"]].iloc[i].isna().values[0]:
            content = ""
        else:
            content = df[["content"]].iloc[i].values[0]
        all_data[i] = {
            "section": df[["section"]].iloc[i].values[0],
            "title": df[["title"]].iloc[i].values[0],
            "page_no": df[["page_no"]].iloc[i].values[0],
            "content": content
        }
    return all_data


def read_labeled_pairs():
    df = pd.read_csv(labeled_pairs_file, encoding = "utf-8")
    all_data = {}
    for i in range(df.shape[0]):
        all_data[i] = {
            "topic_a": df[["topic_a"]].iloc[i].values[0],
            "topic_b": df[["topic_b"]].iloc[i].values[0],
            "relation": df[["relation"]].iloc[i].values[0],
        }
    return all_data


def read_wikipedia_data():
    df = pd.read_csv(wikipedia_data_file, encoding = "utf-8")
    all_data = {}
    for i in range(df.shape[0]):
        all_data[i] = {
            'topic': df[["topic"]].iloc[i].values[0],
            'wiki_title': df[["wiki_title"]].iloc[i].values[0],
            'wiki_summary': df[["wiki_summary"]].iloc[i].values[0],
            'wiki_content': df[["wiki_content"]].iloc[i].values[0],
            'wiki_links': df[["wiki_links"]].iloc[i].values[0],
        }
    return all_data


def read_concept_match(output_file):
    df = pd.read_csv(output_file, encoding = "utf-8")
    data = {}
    for i in range(df.shape[0]):
        concept = df[["concept"]].iloc[i].values[0]
        if df[["index"]].iloc[i].isna().values[0]:
            index = []
        else:
            index = df[["index"]].iloc[i].values[0].split("|")
        data[concept] = {
            "index" : index,
            "type" : df[["type"]].iloc[i].values[0]
        }
    return data

## Text Cleaning

In [122]:
def porter_stemming(text):
    porter_stemmer  = PorterStemmer()
    word_tokens = text.split(" ")
    words = [porter_stemmer.stem(word) for word in word_tokens]
    new_text = " ".join(words)
    return new_text


def wordnet_lemmatization(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    word_tokens = text.split(" ")
    words = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
    new_text = " ".join(words)
    return new_text


def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    sentence = " ".join(filtered_sentence)
    return sentence


def remove_punctuations(text):
    new_text = ""
    punctuations = "!\"#$%&()*+-.,:;<=>?@[\]^_'{|}~"
    for ch in text:
        if ch not in punctuations:
            new_text += ch
    return new_text


def clean_text(content):
    content = content.lower()
    content = re.sub(r'\d+', '', content)
    content = remove_punctuations(content)
    content = remove_stopwords(content)
    content = porter_stemming(content)
    # content = wordnet_lemmatization(content)
    content = content.strip()
    return content

## Matching concept title

In [180]:
def save_match_data(matching_data, output_file):
    df_match_data = pd.DataFrame(columns = ["concept", "index", "type"])
    for i in range(len(matching_data)):
        df_match_data = df_match_data.append(matching_data[i], ignore_index = True)
    df_match_data.to_csv(output_file)
    return True


def direct_matching(title, concept):
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    if set(concept).issubset(set(title)) and len(concept) == len(title):
        return True
    else:
        return False

def concept_in_title(title, concept):
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    if set(concept).issubset(set(title)):
        return True
    else:
        return False


def title_in_concept(title, concept):
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    if set(title).issubset(set(concept)):
        return True
    else:
        return False


def matching_function(title, concept, key_terms, func_type):
    if func_type == 1:
        if direct_matching(title, concept):
            return True
        else:
            flag = 0
            for key_term in key_terms:
                if direct_matching(title, key_term):
                    flag = 1
                    break
            if flag: return True
            else: return False

    elif func_type == 2:
        if concept_in_title(title, concept):
            return True
        else:
            flag = 0
            for key_term in key_terms:
                if concept_in_title(title, key_term):
                    flag = 1
                    break
            if flag: return True
            else: return False

    elif func_type == 3:
        if title_in_concept(title, concept):
            return True
        else:
            flag = 0
            for key_term in key_terms:
                if title_in_concept(title, key_term):
                    flag = 1
                    break
            if flag: return True
            else: return False

#--------------------------------------------------------------------#

def match_title_concept():
    book_data = read_book_data()
    concept_data = read_concepts_file()
    matching_data = {}
    index = 0

    for i in range(len(concept_data)):
        concept = concept_data[i]["concept"]
        key_terms = concept_data[i]["key_terms"].split("|")
        matched_index = []

        for j in range(len(book_data)):
            title = book_data[j]["title"]
            if matching_function(title, concept, key_terms, func_type = 1):
                section = book_data[j]["section"]
                matched_index.append(section)

        if len(matched_index) == 0:
            for j in range(len(book_data)):
                title = book_data[j]["title"]
                if matching_function(title, concept, key_terms, func_type = 2):
                    section = book_data[j]["section"]
                    matched_index.append(section)
            if len(matched_index) == 0:
                for j in range(len(book_data)):
                    title = book_data[j]["title"]
                    if matching_function(title, concept, key_terms, func_type = 3):
                        section = book_data[j]["section"]
                        matched_index.append(section)
                if len(matched_index) == 0:
                    concept_type = 0
                else:
                    concept_type = 3
            else:
                concept_type = 2
        else:
            concept_type = 1

        matching_data[index] = {
            "concept": concept,
            "index": "|".join(matched_index),
            "type": concept_type
        }
        index += 1
    return matching_data

In [None]:
matching_data = match_title_concept()
output_file = "required_data/concept_title_match.csv"
save_match_data(matching_data, output_file)

In [None]:
def save_match_data(matching_data, output_file):
    df_match_data = pd.DataFrame(columns = ["concept", "index", "type"])
    for i in range(len(matching_data)):
        df_match_data = df_match_data.append(matching_data[i], ignore_index = True)
    df_match_data.to_csv(output_file)
    return true

In [140]:
def get_stats(concept_type):
    index = 0
    for i in range(len(matching_data)):
        concept = matching_data[i]["concept"]
        conc_type = matching_data[i]["type"]
        concept_len = len(matching_data[i]["index"])
        if conc_type == concept_type:
            print(concept, concept_len)
            index += 1
    print(index)

In [145]:
get_stats(0)

Angle of incidence 0
Real image 0
Crystallinity 0
Hardness 0
Virtual image 0
Electrical polarity 0
Hertz 0
Stiffness 0
Tangential and normal components 0
Joule 0
Planet 0
Temperature 0
Kilogram 0
Friction 0
Gravitational constant 0
15


In [152]:
concept_type = 1
index = 0
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"])
    if conc_type == concept_type and concept_len == 1:
        print(concept, concept_len)
        index += 1
print(index)

Absorption spectroscopy 1
Refraction 1
Emission spectrum 1
Potential energy 1
Newton's laws of motion 1
Relative velocity 1
Electric potential 1
Hooke's law 1
Resonance (particle physics) 1
Work (physics) 1
Gravitational acceleration 1
Diffraction 1
Transverse wave 1
Electroscope 1
Torque 1
Photoelectric effect 1
Laser 1
Sound intensity 1
Dielectric 1
Standing wave 1
Refracting telescope 1
Telescope 1
Newton's law of universal gravitation 1
Elastic collision 1
Position (vector) 1
Optical microscope 1
Snell's law 1
Insulator (electricity) 1
Inelastic collision 1
Electromotive force 1
Huygens–Fresnel principle 1
Magnetic field 1
Plane mirror 1
Gravitational field 1
Electromagnetic radiation 1
Capacitance 1
Specular reflection 1
Electromagnetic spectrum 1
Direction (geometry) 1
Energy 1
Power (physics) 1
Reflection (physics) 1
Euclidean vector 1
Free fall 1
Interference (wave propagation) 1
Acceleration 1
Ohmmeter 1
Musical tone 1
Pitch (music) 1
Capacitor 1
Voltmeter 1
Kinetic energy 1
P

In [196]:
concept_type = 2
index = 0
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"].split("|"))
    if conc_type == concept_type and concept_len > 1:
        print(concept, concept_len)
        index += 1
print(index)

Magnet 11
Gravity 3
Wavelength 4
Motion (physics) 18
Plasticity (physics) 2
Velocity 6
Light 8
Motion graphs and derivatives 2
Frequency 2
Mass 2
Amplitude 3
Speed 6
Sound 7
Elasticity (physics) 3
Electric field 3
Doppler effect 4
Le Sage's theory of gravitation 3
Wave 23
Geometrical optics 5
Creep (deformation) 2
Fracture 2
Color 11
Lever 3
Distance 2
Electron 7
Collision 3
Magnetism 11
Displacement (vector) 3
Electric charge 11
Pigment 2
Scalar (mathematics) 2
Electrostatics 3
Ray (optics) 3
Ohm 5
Projectile motion 2
Electrical conductor 2
Field (physics) 12
Gravity of Earth 3
Compass 2
Motion 18
40


In [155]:
concept_type = 3
index = 0
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"])
    if conc_type == concept_type and concept_len == 1:
        print(concept, concept_len)
        index += 1
print(index)

Absorption spectroscopy 1
Refraction 1
Emission spectrum 1
Potential energy 1
Newton's laws of motion 1
Relative velocity 1
Electric potential 1
Hooke's law 1
Resonance (particle physics) 1
Work (physics) 1
Gravitational acceleration 1
Diffraction 1
Transverse wave 1
Electroscope 1
Torque 1
Photoelectric effect 1
Laser 1
Sound intensity 1
Dielectric 1
Standing wave 1
Refracting telescope 1
Telescope 1
Newton's law of universal gravitation 1
Elastic collision 1
Position (vector) 1
Optical microscope 1
Snell's law 1
Insulator (electricity) 1
Inelastic collision 1
Electromotive force 1
Huygens–Fresnel principle 1
Magnetic field 1
Plane mirror 1
Gravitational field 1
Electromagnetic radiation 1
Capacitance 1
Specular reflection 1
Electromagnetic spectrum 1
Direction (geometry) 1
Energy 1
Power (physics) 1
Reflection (physics) 1
Euclidean vector 1
Free fall 1
Interference (wave propagation) 1
Acceleration 1
Ohmmeter 1
Musical tone 1
Pitch (music) 1
Capacitor 1
Voltmeter 1
Kinetic energy 1
P

## Data Extraction Code

In [None]:
def get_wiki_data(concept):
    wiki_data = read_wikipedia_data()
    for i in range(len(wiki_data)):
        title = wiki_data[i]["topic"]
        if title == concept:
            break
    summary = wiki_data[i]["wiki_summary"]
    content = wiki_data[i]["wiki_content"]
    return (summary, content)

In [346]:
def get_section_data(book_data):
    section_list = [book_data[i]["section"] for i in range(len(book_data))]
    section_data = {}

    for i in range(len(section_list)):
        current_collection = [section_list[i]]
        for j in range(i+1, len(section_list)):
            x1 = len(section_list[i].split("."))
            x2 = len(section_list[j].split("."))
            if x2 > x1:
                current_collection.append(section_list[j])
            else:
                section = section_list[i]
                section_data[section] = "|".join(current_collection)
                break
        if section_list[i] == '31':
            section_data['31'] = "|".join(current_collection)
    return section_data


def get_data(book_data, section):
    for i in range(len(book_data)):
        if book_data[i]["section"] == section:
            break
    title = str(book_data[i]["title"])
    content = str(book_data[i]["content"])
    text = title + "\n" + content
    return text


def get_section_content(section, section_data, book_data):
    req_sections = section_data[section].split("|")
    content = ""
    for section in req_sections:
        content += get_data(book_data, section)
        content += "\n"
    return content


def get_book_data(section):
    book_data = read_book_data()
    section_data = get_section_data(book_data)
    print(section_data[section])
    content = get_section_content(section, section_data, book_data)
    return content

In [188]:
def get_matched_sections(concept):
    file_name = "required_data/concept_title_match.csv"
    matched_data = read_concept_match(file_name)
    data = matched_data[concept]
    return data            

In [197]:
get_matched_sections("Color")

{'index': ['25',
  '25.2',
  '25.3.1',
  '25.3.2',
  '25.3.3',
  '25.3.4',
  '25.3.5',
  '25.4.1',
  '25.4.2',
  '25.4.3',
  '31.4.3'],
 'type': 2}

## Document matching code

In [191]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [192]:
def text_cleaning(text):
    tokens = nltk.word_tokenize(text)
    return tokens

In [239]:
def tfidf_document_similarity(documents):
    tfidf_vectorizer = TfidfVectorizer(tokenizer = text_cleaning)
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    doc_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    return doc_similarity

In [214]:
documents = (
    "The sky is blue",
    "The sun is bright",
    "The sun in the sky is bright",
    "We can see the shining sun, the bright sun"
)
print(tfidf_document_similarity(documents))

[[0.36651513 1.         0.72875508 0.50699287]]


In [291]:
index_list = get_matched_sections("Inertial frame of reference")["index"]

In [292]:
index_list

['3.2.1', '21.5', '21.5.2']

In [293]:
wiki_summary, wiki_content = get_wiki_data("Gravity")

In [295]:
documents = []
documents.append(wiki_content)

In [296]:
for index in index_list:
    content = get_book_data(index)
    documents.append(content)

In [297]:
score = tfidf_document_similarity(documents)

In [298]:
score[0]

array([1.        , 0.45095139, 0.5676293 , 0.47035654])

In [277]:
get_book_data("4.8")

'End of Chapter Exercises: Gravity and Mechanical Energy\n\n'

In [234]:
arr = ['24', '24.1', '25', '25.2', '25.3.1', '25.3.2', '25.3.3', '25.3.4', '25.3.5', '25.4.1', '25.4.2', '25.4.3', '31.4.3']

In [280]:
arr = ['3.2.1', '21.5', '21.5.2']

In [281]:
def sort_sections(arr):
    all_data = []
    current_arr = [arr[0]]
    i = 1
    while i < len(arr):
        if current_arr[0] in arr[i]:
            current_arr.append(arr[i])        
            i += 1
        else:
            all_data.append(current_arr)
            current_arr = [arr[i]]
            i += 1
    all_data.append(current_arr)
    return all_data

In [282]:
sort_sections(arr)

[['3.2.1'], ['21.5', '21.5.2']]

In [283]:
len(sort_sections(arr))

2

In [302]:
import itertools

stuff = ["1.2", "2,1", "3.4"]
for L in range(0, len(stuff)+1):
    for subset in itertools.combinations(stuff, L):
        print(subset)

()
('1.2',)
('2,1',)
('3.4',)
('1.2', '2,1')
('1.2', '3.4')
('2,1', '3.4')
('1.2', '2,1', '3.4')


In [305]:
arr = ["1.2", "2,1", "3.4"]
def get_section_combination(arr):
    subsets = []
    for l in range(0, len(arr) + 1):
        for subset in itertools.combinations(arr, l):
            subsets.append(subset)
    subsets = [subsets[i] for i in range(1, len(subsets))]
    return subsets

In [306]:
get_section_combination(arr)

[('1.2',),
 ('2,1',),
 ('3.4',),
 ('1.2', '2,1'),
 ('1.2', '3.4'),
 ('2,1', '3.4'),
 ('1.2', '2,1', '3.4')]

In [299]:
arr = ['12.3', '3.4', '4.5']

In [None]:
def xyz(arr):
    