In [1]:
import pandas as pd
import re, os, string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

In [2]:
def clean_text(text):
    """Reduce a document to a space-separated string of lemmatized nouns.

    Steps: lowercase the text, collapse every run of whitespace to a single
    space, tokenize, keep only tokens tagged as nouns (NN, NNP, NNS, NNPS),
    and lemmatize each kept token with WordNet.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The lemmatized nouns joined by single spaces, in document order.
        An empty string is returned when the text contains no nouns.
    """
    # Lowercase and collapse whitespace/newlines (raw string: '\s' is a regex
    # class, not a Python string escape).
    text = re.sub(r'\s+', ' ', text.lower())

    # NOTE(review): tagging happens on lowercased text, which may make the
    # tagger less likely to emit NNP/NNPS - confirm this order is intended.
    noun_tags = {'NN', 'NNP', 'NNS', 'NNPS'}
    noun_words = [
        word
        for word, pos in nltk.pos_tag(word_tokenize(text))
        if pos in noun_tags
    ]

    # Lemmatize and join exactly once, with single-space separators.
    wl = WordNetLemmatizer()
    return " ".join(wl.lemmatize(word) for word in noun_words)

In [3]:
def get_stopwords_list(stop_file_path):
    """Load stop words from a UTF-8 text file containing one word per line.

    Args:
        stop_file_path: path of the stop-word file.

    Returns:
        A sorted list of the unique, whitespace-stripped entries. Blank
        lines are ignored, so the list never contains an empty string.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stop_set = {line.strip() for line in f}

    # Blank lines would otherwise register '' as a stop word.
    stop_set.discard('')

    # Sorting makes the result reproducible; set iteration order for strings
    # changes between interpreter runs.
    return sorted(stop_set)

In [4]:
def sort_coo(coo_matrix):
    """Rank the stored entries of a COO matrix from highest to lowest score.

    Reads ``coo_matrix.col`` and ``coo_matrix.data`` and returns a list of
    ``(column, score)`` pairs in descending order of score; equal scores are
    ordered by descending column index.
    """
    def score_then_column(entry):
        column, score = entry
        return (score, column)

    ranked = list(zip(coo_matrix.col, coo_matrix.data))
    ranked.sort(key=score_then_column, reverse=True)
    return ranked

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` ranked entries to ``{feature name: score}``.

    Args:
        feature_names: sequence indexed by the column index in each entry.
        sorted_items: sequence of ``(column index, score)`` pairs, already
            ordered by the caller.
        topn: number of leading entries to keep.

    Returns:
        A dict from feature name to the score rounded to 3 decimals, with
        keys inserted in the order of ``sorted_items``.
    """
    top_scores = {}

    # Only the leading `topn` entries are considered.
    for column, weight in sorted_items[:topn]:
        rounded = round(weight, 3)
        top_scores[feature_names[column]] = rounded

    return top_scores

In [5]:
def get_keywords(vectorizer, feature_names, doc, topn=None):
    """Return the top keywords of a document ranked by TF-IDF score.

    Args:
        vectorizer: fitted vectorizer; ``vectorizer.transform([doc])`` must
            return an object with a ``tocoo()`` method.
        feature_names: sequence mapping a column index to its term.
        doc: the document text to score.
        topn: number of keywords to return. Defaults to the module-level
            ``TOP_K_KEYWORDS`` when omitted, which keeps existing
            three-argument calls working unchanged.

    Returns:
        A list of keywords, highest score first.
    """
    # Resolve the default at call time so the constant may be defined in a
    # later cell than this function.
    if topn is None:
        topn = TOP_K_KEYWORDS

    # generate tf-idf for the given document
    tf_idf_vector = vectorizer.transform([doc])

    # sort the tf-idf entries by descending score
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # keep only the requested number of keywords
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn)

    return list(keywords)

In [6]:
# Constants
# Same 32 ASCII punctuation characters as the previous hand-written literal,
# which relied on the invalid string escape "\]" (a warning on recent Pythons).
PUNCTUATION = string.punctuation
TOP_K_KEYWORDS = 10 # top k number of keywords to retrieve in a ranked document
STOPWORD_PATH = "../data/stopwords/en_stopwords_list.txt"
PAPERS_PATH = "../data/dev_data.csv"

In [7]:
# Load the papers dataset from PAPERS_PATH and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# CSV was saved together with its index - consider index_col=0; confirm.
data = pd.read_csv(PAPERS_PATH)
data.head()

Unnamed: 0,id,title,content,gold_label,bl_label,w2v_label
0,1,"Solids, liquids and gases",The kinetic theory is an attempt to explain wh...,States of matter,States of matter,States of matter
1,2,Density,Jonny Nelson explains density with a GCSE Phys...,States of matter,States of matter,States of matter
2,3,Required practical - determining density,There are different ways to investigate densit...,States of matter,States of matter,States of matter
3,4,Gas pressure,The particles in a solid simply vibrate around...,States of matter,States of matter,States of matter
4,5,Heating and changes of state,"When a material is heated or cooled, two chang...",States of matter,States of matter,States of matter


In [8]:
# Drop rows with no `content`; rebinding instead of inplace=True keeps the
# step explicit and chainable.
data = data.dropna(subset=['content'])

In [9]:
# Replace the raw `content` column with the output of clean_text.
# NOTE: this overwrites the column, so re-running the cell applies clean_text
# to already-cleaned text; reload `data` before re-running.
data['content'] = data['content'].apply(clean_text)
data.head()

Unnamed: 0,id,title,content,gold_label,bl_label,w2v_label
0,1,"Solids, liquids and gases",theory attempt material form state p...,States of matter,States of matter,States of matter
1,2,Density,jonny nelson density gcse physic exp...,States of matter,States of matter,States of matter
2,3,Required practical - determining density,way density activity mass measure ma...,States of matter,States of matter,States of matter
3,4,Gas pressure,particle point particle liquid roll ...,States of matter,States of matter,States of matter
4,5,Heating and changes of state,material change particle material bond...,States of matter,States of matter,States of matter


In [10]:
# Cleaned documents as a plain Python list, one string per row of `data`.
corpora = data['content'].to_list()

In [11]:
#load a set of stop words
stopwords = get_stopwords_list(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with stopwords
vectorizer = TfidfVectorizer(stop_words=stopwords, smooth_idf=True, use_idf=True)

# Learn the vocabulary and IDF weights from our corpora; the transformed
# matrix is not needed here, so fit() is enough.
vectorizer.fit(corpora)

# Storing vocab. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [12]:
# One record per document: {'top_keywords': [keyword, ...]}
result = [
    {'top_keywords': get_keywords(vectorizer, feature_names, doc)}
    for doc in corpora
]

final = pd.DataFrame(result)
final

Unnamed: 0,top_keywords
0,"[liquid, particle, solid, bond, gas, theory, m..."
1,"[density, m3, rho, kg, mass, centimetre, volum..."
2,"[density, mass, steel, cylinder, pan, measure,..."
3,"[particle, gas, container, pressure, collision..."
4,"[energy, particle, store, bond, change, materi..."
...,...
287,"[variable, value, hypothesis, prediction, rate..."
288,"[equipment, cylinder, hazard, liquid, step, ex..."
289,"[cross, experiment, reaction, data, observatio..."
290,"[variable, graph, data, repeat, line, result, ..."


In [13]:
# Keyword lists only, in the same order as `result`
text_keyword = [item['top_keywords'] for item in result]
print(text_keyword[:5])

[['liquid', 'particle', 'solid', 'bond', 'gas', 'theory', 'marble', 'container', 'arrangement', 'property'], ['density', 'm3', 'rho', 'kg', 'mass', 'centimetre', 'volume', 'particle', 'frac', 'material'], ['density', 'mass', 'steel', 'cylinder', 'pan', 'measure', 'volume', 'cube', 'stone', 'balance'], ['particle', 'gas', 'container', 'pressure', 'collision', 'balloon', 'air', 'room', 'liquid', 'vary'], ['energy', 'particle', 'store', 'bond', 'change', 'material', 'ice', 'substance', 'thermal', 'system']]


In [14]:
# read taxonomy
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load this file if it comes from a trusted source.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
import pickle
with open('../data/LO_picklefile/gcse.pickle','rb') as pickle_file:
    los = pickle.load(pickle_file)

In [15]:
# Keep only the fields needed for classification from each taxonomy entry.
# A tuple (instead of a set rebuilt on every iteration) gives every projected
# dict the same key order; keys missing from an entry are skipped, as before.
key_to_extract = ('name', 'keywords')
extracted_key_value_from_dict = [
    {key: item[key] for key in key_to_extract if key in item}
    for item in los
]


In [16]:
def classify_text(text_kws, los):
    """Score each text against each learning objective by keyword overlap.

    Args:
        text_kws: list of lists, keywords extracted from each text.
        los: list of dicts, one per learning objective, each providing a
            'name' and an iterable of 'keywords'.

    Returns:
        A list with one dict per text, mapping every learning-objective name
        to the number of keywords the text shares with that objective.
        Taxonomy keywords are lowercased and have '_' replaced by ' ' before
        comparison; text keywords are compared as given.
    """
    # Normalise each learning objective's keywords once, rather than once
    # per text as the scoring loop runs.
    lo_keywords = [
        (lo['name'], {kw.lower().replace('_', ' ') for kw in lo['keywords']})
        for lo in los
    ]

    rv = []
    for ks in text_kws:
        doc_kws = set(ks)
        # If two objectives share a name, the later one's count wins.
        rv.append({name: len(cleaned & doc_kws) for name, cleaned in lo_keywords})

    return rv

In [17]:
# Score every document's keyword list against the taxonomy.
# NOTE: `los` is rebound here from the raw pickle contents to the filtered
# name/keywords dicts.
text_kws = text_keyword
los = extracted_key_value_from_dict
classified=classify_text(text_kws, los)

In [21]:
# Rank each document's learning objectives by number of overlapping
# keywords, from high to low, then print the ranking and the best match.
for scores in classified:
    # sorted() is stable, so objectives with equal counts keep their
    # original (taxonomy) order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    print(ranked)
    print()
    # Guard against an empty score dict, where there is no best match.
    if ranked:
        print(ranked[0][0])
    print()


[('States of matter', 5), ('Atomic structure', 1), ('Combined Science', 0), ('Not related', 0), ('Building blocks', 0), ('Cells in animals and plants', 0), ('Transport into and out of cells', 0), ('Cell division', 0), ('Waves', 0), ('Transport over larger distances', 0), ('Respiration', 0), ('Exchange surfaces and transport systems', 0), ('The digestive system', 0), ('Coordination and control', 0), ('Plant organisation', 0), ('Photosynthesis', 0), ('Plant diseases', 0), ('Interactions with the environment', 0), ('Lifestyle and health', 0), ('Homeostasis', 0), ('Reproduction and fertility', 0), ('Radiation and risk', 0), ('Preventing, treating and curing diseases', 0), ('Explaining change', 0), ('The atmosphere', 0), ('Water', 0), ('Ecosystems and biodiversity', 0), ('Inheritance', 0), ('Evolution', 0), ('Building blocks for understanding', 0), ('The periodic table', 0), ('Groups in the periodic table', 0), ('Chemical equations', 0), ('Calculations in chemistry', 0), ('Interactions over