In [1]:
import sys
import os
ROOT_PATH = os.path.dirname(os.getcwd())
sys.path.append(ROOT_PATH)

import json
from data_preparation.cefr import get_numerical_cefr
from data_preparation.word_difficulty_dataset_generator import WordDifficultyData


from common.word_difficulty_classifier import WordDifficultyClassifier
from common.wdd_manager import WDDManager

from joblib import load
import math

Adjust the paths in the next cell as required for your local directory layout.

In [2]:
# Input locations, written relative to the notebook's working directory.
cefr_path = "../data/word_difficulty_classifier/cefr_min.json"
cefr_twinwords_path = "../data/word_difficulty_classifier/cefr_words_twinword.json"
# NOTE(review): this is the same file as cefr_twinwords_path above, which looks like a
# copy-paste slip -- confirm the intended file of Twinword scores for non-CEFR words.
# The name also lacks the `_path` suffix and is rebound to loaded JSON data in a later
# cell, so in the cells visible here this string is never used to open a file.
non_cefr_twinwords = "../data/word_difficulty_classifier/cefr_words_twinword.json"
# Serialized classifier, read with joblib's `load` further down.
model_path = "../word_difficulty_classifier/word_difficulty_classifier.joblib"

## Functions used to evaluate both data sets

In [32]:
def compare(score1, score2):
    """Return the sign of ``score1 - score2``.

    Returns:
        1 if score1 is greater than score2, -1 if it is smaller, 0 otherwise.
    """
    delta = score1 - score2
    if delta > 0:
        return 1
    return -1 if delta < 0 else 0

def evaulate(cefr, twinword):
    """Measure how often Twinword scores rank word pairs the same way CEFR levels do.

    Every ordered pair of distinct words whose CEFR labels differ is ranked twice:
    once by numerical CEFR level and once by Twinword score. A pair counts as
    correct when both rankings give the same outcome (easier / harder / equal).

    Args:
        cefr: dictionary of CEFR levels for words
            key = word, value = cefr level (as accepted by get_numerical_cefr)
        twinword: dictionary of Twinword API scores for words
            key = word, value = twinword score

    Returns:
        Percentage (0-100) of comparable pairs on which both rankings agree, or
        NaN when no pair could be compared (empty input, a single CEFR level,
        or no usable overlap with ``twinword``).
    """
    # Convert each label once up front instead of once per pair inside the
    # quadratic loop. Assumes get_numerical_cefr is a deterministic lookup.
    levels = {word: get_numerical_cefr(level) for word, level in cefr.items()}

    total = 0
    correct = 0

    # select two different words from the cefr words that have a different level
    for word1, cefr1 in cefr.items():
        for word2, cefr2 in cefr.items():
            if word1 == word2 or cefr1 == cefr2:
                continue

            # compare returns -1 if the first word is easier than the second,
            # 1 if it is harder and 0 if they are equal
            try:
                cefr_result = compare(levels[word1], levels[word2])
                twinword_result = compare(twinword[word1], twinword[word2])
            except (KeyError, TypeError):
                # A word has no Twinword entry, or a score/level is not numeric
                # (e.g. None): the pair cannot be compared, so leave it out.
                continue

            # Check if the comparisons are the same for twinword and cefr
            if cefr_result == twinword_result:
                correct += 1
            total += 1

    if total == 0:
        # Nothing was comparable; agreement is undefined rather than 0%.
        return float("nan")
    return 100 * correct / total


In [4]:
# Word -> CEFR level mapping read from cefr_path.
with open(cefr_path, "r") as f:
    cefr = json.load(f)

# Word -> Twinword score mapping read from cefr_twinwords_path.
with open(cefr_twinwords_path, "r") as f:
    cefr_twinwords = json.load(f)

# NOTE(review): this opens cefr_twinwords_path a second time, so non_cefr_twinwords
# holds the same content as cefr_twinwords. It presumably should read a separate file
# of Twinword scores for non-CEFR words -- confirm the path. The assignment also
# rebinds the name `non_cefr_twinwords`, which was a path string in the paths cell.
with open(cefr_twinwords_path, "r") as f:
    non_cefr_twinwords = json.load(f)

## Load the Model

In [12]:
# joblib's `load` deserializes with pickle, which can execute arbitrary code:
# only load model files that come from a trusted source.
# NOTE(review): presumably the pickled object needs the project classes imported in the
# first cell to be importable for this to succeed -- confirm before pruning imports.
model = load(model_path)

## Predict the values using the model

In [15]:
# Ask the model for a CEFR level for every word that has a Twinword score,
# keeping only the words for which it returns a truthy level.
model_twinword_predictions = {}
for word in non_cefr_twinwords:
    cefr_value = model.get_cefr_level(word)
    if not cefr_value:
        continue
    model_twinword_predictions[word] = cefr_value

## Evaluate the model-predicted CEFR levels against Twinword scores

In [28]:
# Pairwise agreement between the model's predicted CEFR levels and the Twinword scores.
# NOTE(review): the output recorded below is a tuple, but evaulate as defined in this
# notebook returns a single percentage -- the output looks stale; re-run the cell.
evaulate(model_twinword_predictions, cefr_twinwords)

(64.46051079270961, 4950954)

## Evaluate the Oxford 5000 CEFR levels against Twinword scores

In [33]:
# Pairwise agreement between the CEFR levels loaded from cefr_path and the Twinword scores.
evaulate(cefr, cefr_twinwords)

64.41479978070892