In [119]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

## Loading data from various files

In [121]:
# Input CSV files, given as paths relative to the working directory.
# Each one is consumed by the read_* loader named in its trailing comment.
book_content_file = "required_data/physics_book_content.csv"  # read_book_data()
labeled_pairs_file = "required_data/physics_labeled_pairs.csv"  # read_labeled_pairs()
wikipedia_data_file = "required_data/physics_correct_wikipedia_data.csv"  # read_wikipedia_data()
concepts_file = "required_data/physics_concepts_ambiguity.csv"  # read_concepts_file()

def _read_csv_records(path, columns):
    """Load a UTF-8 CSV and return ``{row_position: {column: value}}``.

    Only the requested ``columns`` are kept. Row positions are 0-based and
    follow the order of the rows in the file. Values are taken from each
    column's ``.values`` array, so numeric cells keep their NumPy scalar type.

    Raises:
        KeyError: if one of ``columns`` is missing from the file.
    """
    df = pd.read_csv(path, encoding="utf-8")
    # Extract each column once instead of slicing the whole frame per cell.
    column_values = {name: df[name].values for name in columns}
    return {
        row: {name: column_values[name][row] for name in columns}
        for row in range(df.shape[0])
    }


def read_concepts_file(path=None):
    """Return the concepts table as ``{row: {"concept", "key_terms"}}``.

    Args:
        path: CSV to read; defaults to the module-level ``concepts_file``.
    """
    source = concepts_file if path is None else path
    return _read_csv_records(source, ("concept", "key_terms"))


def read_book_data(path=None):
    """Return the book table as ``{row: {"section", "title", "page_no", "content"}}``.

    Args:
        path: CSV to read; defaults to the module-level ``book_content_file``.
    """
    source = book_content_file if path is None else path
    return _read_csv_records(source, ("section", "title", "page_no", "content"))


def read_labeled_pairs(path=None):
    """Return the labeled pairs as ``{row: {"topic_a", "topic_b", "relation"}}``.

    Args:
        path: CSV to read; defaults to the module-level ``labeled_pairs_file``.
    """
    source = labeled_pairs_file if path is None else path
    return _read_csv_records(source, ("topic_a", "topic_b", "relation"))


def read_wikipedia_data(path=None):
    """Return the Wikipedia table keyed by row position.

    Each row dict holds ``topic``, ``wiki_title``, ``wiki_summary``,
    ``wiki_content`` and ``wiki_links``.

    Args:
        path: CSV to read; defaults to the module-level ``wikipedia_data_file``.
    """
    source = wikipedia_data_file if path is None else path
    return _read_csv_records(
        source,
        ("topic", "wiki_title", "wiki_summary", "wiki_content", "wiki_links"),
    )

## Text Cleaning

In [122]:
def porter_stemming(text):
    """Stem each single-space-separated token of ``text`` with a Porter stemmer.

    The text is split on the literal ``" "`` character (so consecutive spaces
    yield empty tokens) and the stemmed tokens are re-joined with one space.
    """
    stem = PorterStemmer().stem
    return " ".join(map(stem, text.split(" ")))


def wordnet_lemmatization(text):
    """Lemmatize each single-space-separated token of ``text`` with WordNet.

    Tokens come from splitting on the literal ``" "`` character; the
    lemmatized tokens are joined back together with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in text.split(" "))


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; every token that is
    not an exact (case-sensitive) member of NLTK's English stop-word list is
    kept, and the survivors are joined with single spaces.
    """
    stop_words = set(stopwords.words('english'))
    # Single pass: the filtered list used to be built twice (a comprehension
    # whose result was immediately discarded, then an identical loop).
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return " ".join(kept_tokens)


def remove_punctuations(text):
    """Return ``text`` with every character from the punctuation list removed.

    The list covers ``! " # $ % & ( ) * + - . , : ; < = > ? @ [ ] ^ _ ' { | } ~``
    plus the backslash. The forward slash and the backtick are not in the
    list and are therefore kept.
    """
    # The backslash is written as an explicit "\\": the previous "\]" was an
    # unrecognized escape that only worked by accident and triggers a warning.
    punctuations = "!\"#$%&()*+-.,:;<=>?@[\\]^_'{|}~"
    # join() avoids rebuilding the result string once per kept character.
    return "".join(ch for ch in text if ch not in punctuations)


def clean_text(content):
    """Normalize ``content`` for token-level comparison.

    Steps, in order: lower-case, delete digit runs, strip punctuation,
    remove stop words, Porter-stem each token, trim surrounding whitespace.
    """
    lowered = content.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = remove_punctuations(without_digits)
    without_stopwords = remove_stopwords(without_punctuation)
    # Stemming is the active normalizer; wordnet_lemmatization() is the
    # alternative that is currently switched off.
    stemmed = porter_stemming(without_stopwords)
    return stemmed.strip()

## Matching concepts to book section titles

In [143]:
def direct_matching(title, concept):
    """Return True when the cleaned title and concept use the same tokens.

    Both strings go through ``clean_text`` and are split on single spaces.
    The result is True when every concept token occurs in the title and the
    two token lists have the same length; word order is ignored.

    NOTE(review): the length check compares the token lists, not the sets,
    so repeated tokens can make different token sets compare as a match.
    """
    title_tokens = clean_text(title).split(" ")
    concept_tokens = clean_text(concept).split(" ")
    same_length = len(concept_tokens) == len(title_tokens)
    return set(concept_tokens) <= set(title_tokens) and same_length

def concept_in_title(title, concept):
    """Return True when every cleaned concept token also occurs in the cleaned title.

    Both strings go through ``clean_text`` and are split on single spaces;
    token order and repetition are ignored.
    """
    title_tokens = set(clean_text(title).split(" "))
    concept_tokens = set(clean_text(concept).split(" "))
    return concept_tokens <= title_tokens


def title_in_concept(title, concept):
    """Return True when every cleaned title token also occurs in the cleaned concept.

    Both strings go through ``clean_text`` and are split on single spaces;
    token order and repetition are ignored.
    """
    title_tokens = set(clean_text(title).split(" "))
    concept_tokens = set(clean_text(concept).split(" "))
    return title_tokens <= concept_tokens


def matching_function(title, concept, key_terms, func_type):
    """Test ``title`` against ``concept`` and, failing that, each key term.

    Args:
        title: section title to test.
        concept: concept name, tried first.
        key_terms: iterable of alternative terms, tried in order.
        func_type: 1 -> ``direct_matching``, 2 -> ``concept_in_title``,
            3 -> ``title_in_concept``.

    Returns:
        True if the concept or any key term matches, False if none does,
        and None when ``func_type`` is not 1, 2 or 3.
    """
    if func_type == 1:
        matcher = direct_matching
    elif func_type == 2:
        matcher = concept_in_title
    elif func_type == 3:
        matcher = title_in_concept
    else:
        # Unrecognized strategy: no comparison is made.
        return None

    if matcher(title, concept):
        return True
    # any() stops at the first key term that matches.
    return any(matcher(title, key_term) for key_term in key_terms)

#--------------------------------------------------------------------#

def match_title_concept():
    """Match every concept against the book's section titles.

    For each concept the strategies are tried in order 1, 2, 3 (see
    ``matching_function``); the first strategy that matches at least one
    title wins and later ones are not tried.

    Returns:
        dict keyed by concept row position, each value holding
        ``"concept"`` (the concept name), ``"index"`` (list of matching
        book row positions) and ``"type"`` (the winning strategy, or 0 when
        no strategy matched any title).
    """
    book_data = read_book_data()
    concept_data = read_concepts_file()
    section_count = len(book_data)
    matching_data = {}

    for position in range(len(concept_data)):
        record = concept_data[position]
        concept = record["concept"]
        key_terms = record["key_terms"].split("|")

        matched_index = []
        concept_type = 0
        for tier in (1, 2, 3):
            matched_index = [
                j
                for j in range(section_count)
                if matching_function(book_data[j]["title"], concept, key_terms, func_type=tier)
            ]
            if matched_index:
                concept_type = tier
                break

        matching_data[position] = {
            "concept": concept,
            "index": matched_index,
            "type": concept_type,
        }
    return matching_data

In [144]:
# Build the concept -> matched-sections table once; get_stats() and the
# reporting cells below read this global.
matching_data = match_title_concept()

In [140]:
def get_stats(concept_type):
    """Print the concepts whose match type equals ``concept_type``.

    Reads the global ``matching_data``. One line is printed per selected
    concept (its name and the number of matched sections), followed by a
    final line with how many concepts were printed.
    """
    shown = 0
    for position in range(len(matching_data)):
        entry = matching_data[position]
        if entry["type"] != concept_type:
            continue
        print(entry["concept"], len(entry["index"]))
        shown += 1
    print(shown)

In [145]:
# Type 0 = concepts for which none of the three strategies matched any title.
get_stats(0)

Angle of incidence 0
Real image 0
Crystallinity 0
Hardness 0
Virtual image 0
Electrical polarity 0
Hertz 0
Stiffness 0
Tangential and normal components 0
Joule 0
Planet 0
Temperature 0
Kilogram 0
Friction 0
Gravitational constant 0
15


In [152]:
# Concepts matched by strategy 1 (direct_matching) that hit exactly one section.
# Same loop as get_stats(), with an extra filter on the number of matches.
concept_type = 1
index = 0  # running count of printed concepts
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"])  # number of matched sections
    if conc_type == concept_type and concept_len == 1:
        print(concept, concept_len)
        index += 1
print(index)

Absorption spectroscopy 1
Refraction 1
Emission spectrum 1
Potential energy 1
Newton's laws of motion 1
Relative velocity 1
Electric potential 1
Hooke's law 1
Resonance (particle physics) 1
Work (physics) 1
Gravitational acceleration 1
Diffraction 1
Transverse wave 1
Electroscope 1
Torque 1
Photoelectric effect 1
Laser 1
Sound intensity 1
Dielectric 1
Standing wave 1
Refracting telescope 1
Telescope 1
Newton's law of universal gravitation 1
Elastic collision 1
Position (vector) 1
Optical microscope 1
Snell's law 1
Insulator (electricity) 1
Inelastic collision 1
Electromotive force 1
Huygens–Fresnel principle 1
Magnetic field 1
Plane mirror 1
Gravitational field 1
Electromagnetic radiation 1
Capacitance 1
Specular reflection 1
Electromagnetic spectrum 1
Direction (geometry) 1
Energy 1
Power (physics) 1
Reflection (physics) 1
Euclidean vector 1
Free fall 1
Interference (wave propagation) 1
Acceleration 1
Ohmmeter 1
Musical tone 1
Pitch (music) 1
Capacitor 1
Voltmeter 1
Kinetic energy 1
P

In [156]:
# Concepts matched by strategy 2 (concept_in_title) that hit more than one section.
# Same loop as get_stats(), with an extra filter on the number of matches.
concept_type = 2
index = 0  # running count of printed concepts
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"])  # number of matched sections
    if conc_type == concept_type and concept_len > 1:
        print(concept, concept_len)
        index += 1
print(index)

Inertial frame of reference 3
Normal force 2
Contact force 2
Non-inertial reference frame 4
Mirror image 3
Tension (physics) 3
6


In [155]:
# Concepts matched by strategy 3 (title_in_concept) that hit exactly one section.
# NOTE(review): the saved output under this cell is line-for-line identical to
# the strategy-1 listing, so it looks stale — re-run the cell to confirm.
concept_type = 3
index = 0  # running count of printed concepts
for i in range(len(matching_data)):
    concept = matching_data[i]["concept"]
    conc_type = matching_data[i]["type"]
    concept_len = len(matching_data[i]["index"])  # number of matched sections
    if conc_type == concept_type and concept_len == 1:
        print(concept, concept_len)
        index += 1
print(index)

Absorption spectroscopy 1
Refraction 1
Emission spectrum 1
Potential energy 1
Newton's laws of motion 1
Relative velocity 1
Electric potential 1
Hooke's law 1
Resonance (particle physics) 1
Work (physics) 1
Gravitational acceleration 1
Diffraction 1
Transverse wave 1
Electroscope 1
Torque 1
Photoelectric effect 1
Laser 1
Sound intensity 1
Dielectric 1
Standing wave 1
Refracting telescope 1
Telescope 1
Newton's law of universal gravitation 1
Elastic collision 1
Position (vector) 1
Optical microscope 1
Snell's law 1
Insulator (electricity) 1
Inelastic collision 1
Electromotive force 1
Huygens–Fresnel principle 1
Magnetic field 1
Plane mirror 1
Gravitational field 1
Electromagnetic radiation 1
Capacitance 1
Specular reflection 1
Electromagnetic spectrum 1
Direction (geometry) 1
Energy 1
Power (physics) 1
Reflection (physics) 1
Euclidean vector 1
Free fall 1
Interference (wave propagation) 1
Acceleration 1
Ohmmeter 1
Musical tone 1
Pitch (music) 1
Capacitor 1
Voltmeter 1
Kinetic energy 1
P

# Document Matching code

In [None]:
import nltk
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

# Root directory that is walked for input documents.
# NOTE(review): absolute, machine-specific path — confirm it exists wherever this notebook runs.
path = '/opt/datacourse/data/parts'
token_dict = {}  # file name -> lower-cased, punctuation-free text; filled by the walk loop
stemmer = PorterStemmer()  # shared stemmer passed to stem_tokens() by tokenize()

def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to each item of ``tokens``, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Tokenize ``text`` with NLTK and stem every token using the global ``stemmer``.

    Passed to ``TfidfVectorizer`` as its ``tokenizer`` callable.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

# Read every file under `path` into token_dict as lower-cased text without
# punctuation, then fit a TF-IDF model over those texts.

# str.maketrans('', '', chars) builds a table that deletes `chars`. The
# Python 2 call form translate(None, chars) raises TypeError on a Python 3 str.
punctuation_table = str.maketrans('', '', string.punctuation)

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path = os.path.join(subdir, file)
        # The context manager closes the handle even if read() fails.
        with open(file_path, 'r') as shakes:
            text = shakes.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(punctuation_table)
        # Keyed by bare file name: files with the same name in different
        # sub-directories overwrite each other.
        token_dict[file] = no_punctuation

#this can take some time
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [21]:
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[[0.36651513 1.         0.72875508 0.50699287]]


In [32]:
import spacy
# Load the spaCy pipeline once; process_text() reuses this global for every call.
# NOTE(review): 'en_core_web_sm' is a small model — confirm in the spaCy docs that
# its similarity scores are reliable enough for document matching.
nlp = spacy.load('en_core_web_sm')

def process_text(text):
    """Reduce ``text`` to its content lemmas and return them as a spaCy ``Doc``.

    The lower-cased text is parsed with the global ``nlp`` pipeline. Tokens
    are dropped when their text is in ``nlp.Defaults.stop_words``, when they
    are punctuation, or when their lemma is the ``'-PRON-'`` placeholder. The
    remaining lemmas are joined with single spaces and parsed again.
    """
    stop_words = nlp.Defaults.stop_words
    lemmas = [
        token.lemma_
        for token in nlp(text.lower())
        if token.text not in stop_words
        and not token.is_punct
        and token.lemma_ != '-PRON-'
    ]
    return nlp(" ".join(lemmas))


def match_docs(content1, content2):
    """Return the spaCy similarity score between two raw text documents.

    Each document is first reduced by ``process_text``; the score is the
    result of calling ``similarity`` on the first processed ``Doc`` with the
    second.
    """
    return process_text(content1).similarity(process_text(content2))

# NOTE(review): `documents` is not defined in any cell visible here — presumably it
# was left over from an earlier kernel session; define it before this cell so that
# Restart & Run All works.
content1 = documents[0]
content2 = documents[3]

print(match_docs(content1, content2))

0.7131300042306161


  "__main__", mod_spec)
