In [13]:
import pandas as pd
import wikipedia
import json
import random

## Reading files from Course Dataset for CS

In [17]:
# Edge files of the RefD course dataset for CS. Every line is expected to hold
# two tab-separated topic names (file2 presumably lists the negative pairs --
# TODO confirm against the dataset description).
file1 = "RefD_dataset/Course/CS.edges"
file2 = "RefD_dataset/Course/CS.edges_neg"

unique_topics = set()

for edge_path in (file1, file2):
    # `with` closes the handle even when parsing stops half-way.
    with open(edge_path) as edge_file:
        for line in edge_file:
            keywords = line.strip().split("\t")
            if len(keywords) < 2:
                # Blank or malformed line: there is no topic pair to collect.
                continue
            unique_topics.update(keywords[:2])

# Sorted on purpose: positions in `all_topics` are used as ids further down,
# and the iteration order of a set of strings changes between kernel restarts.
all_topics = sorted(unique_topics)

In [18]:
# Sanity check: how many distinct topics were collected.
topic_count = len(all_topics)
print(topic_count)

63


## Extracting Wikipedia Content

In [19]:
def contain_section(line):
    """Tell whether `line` looks like a wiki section heading.

    A heading is a line that, once surrounding whitespace is removed, is longer
    than four characters and both starts and ends with "==", for example
    "== History ==" or "=== Early life ===".

    Args:
        line: One line of page text.

    Returns:
        True for a heading line, False otherwise. The result is always a bool;
        short or empty lines give False instead of falling through to None.
    """
    line = line.strip()
    return len(line) > 4 and line.startswith("==") and line.endswith("==")

def wiki_section_extract(content):
    """Collect the section titles that appear in a page's text.

    Every line accepted by `contain_section` contributes one title. The title
    is the heading with its surrounding whitespace, its "=" markers and the
    padding next to them removed, so "== History ==", "=== Early years ===" and
    "== See also == " all yield clean titles regardless of heading depth.

    Args:
        content: Page text with one heading per line.

    Returns:
        The titles in order of appearance, joined by newlines; an empty string
        when no heading is found.
    """
    titles = []
    for line in content.split("\n"):
        if contain_section(line):
            # Strip the raw line first: slicing a fixed number of characters
            # breaks on trailing whitespace and on headings deeper than "==".
            titles.append(line.strip().strip("=").strip())
    return "\n".join(titles).strip()

def keyword_data(topic = "", wiki_title = "", wiki_summary = "",
                 wiki_content = "", wiki_html = "", wiki_links = "", wiki_sections = ""):
    """Bundle the values gathered for one topic into a single record.

    Each argument is stored unchanged under the key of the same name, in the
    order of the signature. Arguments that are left out default to "".

    Returns:
        A new dict with the keys 'topic', 'wiki_title', 'wiki_summary',
        'wiki_content', 'wiki_html', 'wiki_links' and 'wiki_sections'.
    """
    return dict(
        topic=topic,
        wiki_title=wiki_title,
        wiki_summary=wiki_summary,
        wiki_content=wiki_content,
        wiki_html=wiki_html,
        wiki_links=wiki_links,
        wiki_sections=wiki_sections,
    )


def extract_data(topic):
    """Look `topic` up on Wikipedia and return the collected page data.

    The first search hit for `topic` is loaded and its title, summary, content,
    HTML, links and section titles are gathered. Fields that could not be
    obtained (no search hit, ambiguous title, missing page) stay empty strings,
    and a "blank" line naming the topic is printed.

    Args:
        topic: Topic name to search for.

    Returns:
        The record built by `keyword_data` for this topic.
    """
    wiki_title = ""
    wiki_summary = ""
    wiki_content = ""
    wiki_html = ""
    wiki_links = ""
    wiki_sections = ""

    search_hits = wikipedia.search(topic)
    if not search_hits:
        # Explicit emptiness check instead of catching IndexError around the
        # whole body, which would also hide unrelated IndexErrors.
        print("blank: no search result for {!r}".format(topic))
    else:
        try:
            # The title comes straight from the search, so it is already an
            # existing page title; auto_suggest would run a second lookup that
            # may swap it for a different page.
            wiki_data = wikipedia.page(search_hits[0], auto_suggest=False)
            wiki_title = wiki_data.title
            wiki_summary = wiki_data.summary
            wiki_content = wiki_data.content
            wiki_html = wiki_data.html()
            wiki_links = wiki_data.links
            wiki_sections = wiki_section_extract(wiki_content)
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError) as e:
            # Best effort: keep whatever was gathered and report the topic.
            print("blank: {!r} ({})".format(topic, type(e).__name__))

    data = keyword_data(topic, wiki_title, wiki_summary,
                        wiki_content, wiki_html, wiki_links, wiki_sections)

    return data


In [20]:
# Container for the per-topic Wikipedia records, and the number of topics to
# process.
all_keyword_data = dict()
list_len = len(all_topics)

In [26]:
# `complete` is the position to start from; raise it to resume an interrupted
# run without fetching the earlier topics again.
complete = 0
for i in range(complete, list_len):
    all_keyword_data[i] = extract_data(all_topics[i])
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17




  lis = BeautifulSoup(html).find_all('li')


blank
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62


## Saving data in JSON Format

In [30]:
# Persist the collected records. Note that JSON object keys are always
# strings, so the integer ids come back as "0", "1", ... when reloaded.
json_path = 'output_data/keyterms_wiki_data.json'
with open(json_path, 'w') as json_file:
    json.dump(all_keyword_data, json_file)

## Saving data in CSV format

In [31]:
# Column order of the CSV. Records built by `keyword_data` carry no
# 'abbreviation' key, so that column is written empty.
csv_columns = ['topic', 'abbreviation', 'wiki_title', 'wiki_summary', 'wiki_content', 'wiki_html', 'wiki_links', 'wiki_sections']

# Build the frame in one step: DataFrame.append was removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.
keyword_rows = [all_keyword_data[i] for i in range(len(all_keyword_data))]
df = pd.DataFrame(keyword_rows, columns=csv_columns)

df.to_csv("output_data/keyterms_wiki_data.csv")

# RefD code implementation

In [42]:
def get_id(topic):
    """Return the key of the `all_keyword_data` record stored for `topic`.

    Args:
        topic: Topic name, compared against each record's "topic" field.

    Returns:
        The key of the first record whose "topic" equals `topic`.

    Raises:
        KeyError: If no record matches. Falling through the loop used to hand
            back the last key, which silently pointed at an unrelated record.
    """
    for topic_id, record in all_keyword_data.items():
        if record["topic"] == topic:
            return topic_id
    raise KeyError("no entry for topic {!r} in all_keyword_data".format(topic))
        

def get_all_referred_links(topic):
    """Return the "wiki_links" value stored for `topic`.

    The record is located through `get_id` and read from `all_keyword_data`.
    """
    record = all_keyword_data[get_id(topic)]
    return record["wiki_links"]


def get_r_value(topic_a, topic_b):
    """Return 1 when `topic_b` is among the links stored for `topic_a`, else 0."""
    return 1 if topic_b in get_all_referred_links(topic_a) else 0


def get_w_value_equal(topic_a, topic_b):
    """Return 1 when `topic_a` is among the links stored for `topic_b`, else 0.

    This is the equal-weight variant: a linked topic counts as 1, anything
    else as 0.
    """
    links_of_b = get_all_referred_links(topic_b)
    return int(topic_a in links_of_b)


def get_w_value_tfidf(topic_a, topic_b):
    """Placeholder with no implementation; always returns None.

    NOTE(review): presumably intended as a TF-IDF weighted counterpart of
    `get_w_value_equal` -- TODO confirm and implement before using it.
    """
    return None



In [56]:
def part_a_calc(topic_a, topic_b):
    """Sum, over every topic t in `all_topics`, r(t, topic_b) * w(t, topic_a).

    r is `get_r_value` and w is `get_w_value_equal`. `refd_score_calc` uses
    the result as the numerator of its first fraction.
    """
    return sum(
        get_r_value(topic, topic_b) * get_w_value_equal(topic, topic_a)
        for topic in all_topics
    )


def part_b_calc(topic_a, topic_b):
    """Sum w(t, topic_a) over every topic t in `all_topics`.

    w is `get_w_value_equal`. `topic_b` is accepted but not used. The result
    is the denominator of the first fraction in `refd_score_calc`.
    """
    return sum(get_w_value_equal(topic, topic_a) for topic in all_topics)


def part_c_calc(topic_a, topic_b):
    """Sum, over every topic t in `all_topics`, r(t, topic_a) * w(t, topic_b).

    r is `get_r_value` and w is `get_w_value_equal`. `refd_score_calc` uses
    the result as the numerator of its second fraction.
    """
    return sum(
        get_r_value(topic, topic_a) * get_w_value_equal(topic, topic_b)
        for topic in all_topics
    )


def part_d_calc(topic_a, topic_b):
    """Sum w(t, topic_b) over every topic t in `all_topics`.

    w is `get_w_value_equal`. `topic_a` is accepted but not used. The result
    is the denominator of the second fraction in `refd_score_calc`.
    """
    return sum(get_w_value_equal(topic, topic_b) for topic in all_topics)


def refd_score_calc(topic_a, topic_b):
    """Compute the RefD score of the ordered pair (`topic_a`, `topic_b`).

    The score is part_a / part_b - part_c / part_d, with the four parts taken
    from `part_a_calc` .. `part_d_calc`.

    Returns:
        The score, or 0 when either denominator (part_b or part_d) is 0.
    """
    numer_ab = part_a_calc(topic_a, topic_b)
    denom_ab = part_b_calc(topic_a, topic_b)
    numer_ba = part_c_calc(topic_a, topic_b)
    denom_ba = part_d_calc(topic_a, topic_b)

    # Guard clause: one empty denominator makes the whole score 0.
    if denom_ab == 0 or denom_ba == 0:
        return 0

    return (numer_ab/denom_ab) - (numer_ba/denom_ba)

In [58]:
# Score matrix: row i / column j holds the RefD score of
# (all_topics[i], all_topics[j]).
all_pairs_refd_value = []

for topic_a in all_topics:
    score_row = [refd_score_calc(topic_a, topic_b) for topic_b in all_topics]
    all_pairs_refd_value.append(score_row)

In [62]:
# Parallel lists: entry k of each list belongs to the same matched pair.
prereq_a, prereq_b = [], []

In [63]:
# Decision thresholds on the RefD score: scores inside [theta_neg, theta] are
# treated as "no match".
theta = 0.02
theta_neg = -0.02

# Start from empty lists so that running this cell again does not append the
# same matches a second time.
prereq_a = []
prereq_b = []

# The matrix produced by refd_score_calc is antisymmetric
# (score[j][i] == -score[i][j]), so every match is met twice: once above
# `theta` and once below `theta_neg`, both giving the same (a, b) pair.
# Remember what has been recorded so each pair is kept once, in first-seen
# order.
recorded_pairs = set()

for i in range(len(all_topics)):
    for j in range(len(all_topics)):
        score = all_pairs_refd_value[i][j]
        if score > theta:
            pair = (all_topics[i], all_topics[j])
        elif score < theta_neg:
            # Negative score: the roles of the two topics are swapped.
            pair = (all_topics[j], all_topics[i])
        else:
            continue
        if pair in recorded_pairs:
            continue
        recorded_pairs.add(pair)
        prereq_a.append(pair[0])
        prereq_b.append(pair[1])

In [74]:
# One record per matched pair, keyed by its position in the parallel lists.
prereq_results = {
    position: {"topic_a": prereq_a[position], "topic_b": prereq_b[position]}
    for position in range(len(prereq_a))
}

In [76]:
# Build the frame in one step: DataFrame.append was removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.
prereq_rows = [prereq_results[i] for i in range(len(prereq_results))]
df = pd.DataFrame(prereq_rows, columns=['topic_a', 'topic_b'])

df.to_csv("output_data/prereq_matches.csv")