In [1]:
import sys
import os
ROOT_PATH = os.path.dirname(os.getcwd())
sys.path.append(ROOT_PATH)

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

from data_preparation.word_difficulty_dataset_generator import WordDifficultyData
from data_preparation.tf_idf import TfIdfGenerator

from common.word_difficulty_classifier import WordDifficultyClassifier
from common.model_accuracy import model_accuracy
from common.wdd_manager import WDDManager

import numpy as np
from joblib import dump, load
import math

Before running the notebook, set the paths below to the correct locations for your environment.

In [2]:
# User-editable locations; paths are relative to the directory the notebook runs in.
wdds_path = "../data/wdds" # Path to the folder containing the word-difficulty data sets (passed to WDDManager)
preprocessed_data_path = "../data/reddit_data_processed" # Path to the preprocessed Reddit data (passed to TfIdfGenerator)
model_save_path = "word_difficulty_model.joblib" # File the final classifier object is dumped to with joblib

In [3]:
# Manager for the word-difficulty data sets stored under `wdds_path`.
wdd_manager = WDDManager(wdds_path)

# Training data: the term-frequency ("tf") variant with the "scaled" scale type.
wdd = wdd_manager.get_wdd(
    data_type="tf",
    scale_type="scaled",
)

loading tf, min max scaled...
Done!


# Build and train classifier

In [20]:
# Fixed seed so the bootstrap draws -- and therefore the trained and saved
# model -- are identical on every Restart & Run All.
RANDOM_SEED = 42

# Bagging ensemble of Gini decision trees.
clf = BaggingClassifier(
    DecisionTreeClassifier(criterion="gini"),
    max_samples=0.8,    # each tree is fit on a sample of 80% of the training rows
    n_estimators=710,   # number of trees in the ensemble
    random_state=RANDOM_SEED,  # also seeds the individual trees
)

In [21]:
# Feature matrix and target labels taken from the scaled TF data set.
X, y = wdd.features(), wdd.output()

# scikit-learn's fit() returns the fitted estimator, so `model` is the
# trained ensemble used by the later cells.
model = clf.fit(X, y)

# Create Feature Values for Entire Data Set

Compute the term-frequency (TF) values for the original (preprocessed Reddit) data set.

In [40]:
# Build the generator over the preprocessed data, reading the comment text
# from the "comment_processed_no_spell_corrections" field.
tf_idf_generator = TfIdfGenerator(
    preprocessed_data_path,
    comments_key="comment_processed_no_spell_corrections",
)

# compute_tf() fills the generator's `tf` attribute, which is read back here.
tf_idf_generator.compute_tf()
tf_values = tf_idf_generator.tf

Reshape the TF data into a list holding, for each word, its 59 TF values (one per document), together with a dictionary that maps each word to its index in that list.

In [43]:
def _nan_to_zero(value):
    """Return 0 for a NaN term frequency, otherwise the value unchanged."""
    return 0 if math.isnan(value) else value


# Per-document {word: tf} mappings, in the iteration order of `tf_values`.
doc_tf_maps = list(tf_values.values())

# The first document's keys define the vocabulary. An empty corpus yields an
# empty vocabulary instead of raising IndexError.
vocabulary = list(doc_tf_maps[0].keys()) if doc_tf_maps else []

# word -> row position of that word in `tf_word_values`.
word_indexes = {word: index for index, word in enumerate(vocabulary)}

# One row per word, one column per document; NaN entries are stored as 0.
tf_word_values = [
    [_nan_to_zero(doc_tf[word]) for doc_tf in doc_tf_maps]
    for word in vocabulary
]

Load the unscaled TF CEFR data set.

In [33]:
# Load from the configured data-set folder rather than a hard-coded "../",
# so changing `wdds_path` in the configuration cell is enough.
wdd_manager = WDDManager(wdds_path)

# Same TF data as the training set, but with the "no_change" scale type; used
# below to fit the scaler.
unscaled_wdd = wdd_manager.get_wdd(scale_type="no_change", data_type="tf")

loading tf, no change...
Done!


Fit a MinMaxScaler on the unscaled CEFR TF values, then use it to scale the TF values of the entire data set.

In [46]:
# Learn the per-feature min/max from the unscaled CEFR features only, then
# apply that same scaling to every word's TF vector.
scaler = MinMaxScaler().fit(unscaled_wdd.features())
scaled_data = scaler.transform(tf_word_values)

# word -> its scaled TF vector (the row of `scaled_data` at the word's index).
scaled_word_frequencies = {
    term: scaled_data[row] for term, row in word_indexes.items()
}

# Create the classifier object that predicts word difficulty

In [132]:
# Bundle the trained ensemble with the per-word scaled TF vectors into a single object.
wdc = WordDifficultyClassifier(model, scaled_word_frequencies)

In [133]:
# Persist the classifier object to `model_save_path`. The call stays the last
# expression so the cell shows the list of written file names.
dump(value=wdc, filename=model_save_path)

['word_difficulty_classifier.joblib']