In [1]:
import pandas as pd
import collections

In [2]:
book_data = "../output_files/physics_normalized_content.csv"
concept_data = "../output_files/concept_final_indexing.csv"

In [3]:
def read_concept_data(file_name):
	df = pd.read_csv(file_name, encoding = "utf-8")
	req_data = {}
	for i in range(df.shape[0]):
		concept = df[["concept"]].iloc[i].values[0]
		func_type = df[["type"]].iloc[i].values[0]
		if func_type != 0: title_match_index = df[["index"]].iloc[i].values[0]
		else: title_match_index = ""

		req_data[i] = {
			"concept": concept,
			"func_type": func_type,
			"title_match_index": title_match_index,
			"content_match_index": [],
			"freq_content_match": []
		}
	return req_data

def read_book_data(file_name):
	df = pd.read_csv(file_name, encoding = "utf-8")
	req_data = {}
	for i in range(df.shape[0]):
		section = df[["section"]].iloc[i].values[0]
		if df[["tagged_content"]].iloc[i].isna().values[0]:
			content = ""
		else:
			content = df[["tagged_content"]].iloc[i].values[0]
		req_data[i] = {
			"section": section,
			"content": content
		}
	return req_data

In [4]:
def get_concept_from_content(data):
    for i in range(len(data)):
        content = data[i]["content"]
        concepts = []

        c1 = content.split("<b>")
        for j in range(1, len(c1)):
            concept = c1[j].split("</b>")[0]
            concepts.append(concept)

        data[i]["concepts"] = concepts
        print(concepts)
    return data


def concepts_collections(concepts):
	counter = collections.Counter(concepts)
	concept_list = list(counter.keys())
	freq_list = list(counter.values())
	return (concept_list, freq_list)


def get_concept_content_matching(concept_data, book_data):
	for i in range(len(book_data)):
		concepts = book_data[i]["concepts"]
		concept_list, freq_list = concepts_collections(concepts)
		for j in range(len(concept_list)):
			concept = concept_list[j]
			for k in range(len(concept_data)):
				if concept_data[k]["concept"] == concept:
					section = book_data[i]["section"]
					concept_data[k]["content_match_index"].append(section)
					concept_data[k]["freq_content_match"].append(freq_list[j])
	return concept_data

In [5]:
def sort_concept_data(concept_data):
	for i in range(len(concept_data)):
		index = concept_data[i]["content_match_index"]
		concept_data[i]["content_match_index"] = "|".join(index)

		freq = concept_data[i]["freq_content_match"]
		concept_data[i]["freq_content_match"] = "|".join(freq)
	return concept_data



def save_concept_data(concept_data, output_file):
	columns = ["concept", "type", "title_match_index", "content_match_index", "freq_content_match"]
	df = pd.DataFrame(columns = columns)
	for i in range(len(concept_data)):
		df = df.append(data[i], ignore_index = True)
	df.to_csv(output_file)
	return True

In [6]:
book_data = read_book_data(book_data)
concept_index_data = read_concept_data(concept_data)

In [7]:
book_data = get_concept_from_content(book_data)

['Physics', 'Physics', 'Work (physics)', 'Physics', 'Field (physics)', 'Physics', 'Physics', 'Physics', 'Wave', 'Wave', 'Energy level', 'Physics']
[]
['Metre', 'Metre', 'Length', 'Physical quantity', 'Temperature', 'Distance', 'Physical quantity', 'Physical quantity', 'Length', 'Temperature', 'Distance', 'Physical quantity']
[]
['Physical quantity']
['Metre', 'Kilogram', 'Force', 'Work (physics)', 'Physics', 'Speed', 'Light', 'Energy', 'Physics']
['Metre', 'Metre', 'Hertz', 'Hertz']
['Work (physics)', 'Kilogram', 'Metre', 'Kilogram', 'Metre', 'Joule', 'Metre']
[]
[]
['Speed']
['Work (physics)', 'Work (physics)', 'Speed', 'Light', 'Speed', 'Light', 'Speed', 'Light']
[]
['Kilogram', 'Kilogram', 'Kilogram', 'Kilogram', 'Power (physics)']
['Work (physics)', 'Physical quantity']
['Metre', 'Metre']
['Speed', 'Temperature', 'Speed', 'Temperature', 'Temperature', 'Temperature', 'Temperature']
['Mass', 'Mass', 'Kilogram', 'Mass', 'Kilogram', 'Mass', 'Kilogram', 'Kilogram']
[]
['Position (vector

In [8]:
concept_data = get_concept_content_matching(concept_index_data, book_data)

In [8]:
print(concept_data[0])

{'concept': 'Crystallinity', 'func_type': 0, 'title_match_index': '', 'content_match_index': ['13.3.1', '20.2', '22.4.2', '27.3.1'], 'freq_content_match': [3, 1, 1, 1]}


In [9]:
concept_data = sort_concept_data(concept_data)

TypeError: sequence item 0: expected str instance, int found