In [None]:
import numpy as np
import pandas as pd

from nltk.tokenize import sent_tokenize
import nltk
from sklearn.linear_model import LinearRegression
from sklearn.metrics import auc, roc_curve, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib as plt

In [None]:
# Read the CommonLit train and test tables from ../input; the feature columns
# computed in the following cells are added to these two frames in place.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Number of words in each excerpt.
# A "word" is a maximal run of \w characters, i.e. the same definition that
# the RegexpTokenizer(r"\w+") based features in this notebook use.  Calling
# len() on the raw string would count characters, not words.
train['wordCount'] = train['excerpt'].str.count(r"\w+")
test['wordCount'] = test['excerpt'].str.count(r"\w+")

In [None]:
# OLD WORD LENGTH
def tokenize(texts):
    """Return, for every text, the distinct vocabulary terms it contains.

    A CountVectorizer is fitted on `texts`; each row of the resulting sparse
    matrix is mapped back to the terms whose count is non-zero, so a term
    that occurs several times in a text is listed only once for that text.

    Parameters
    ----------
    texts : iterable of str

    Returns
    -------
    list of numpy.ndarray
        One array of term strings per input text, in input order.
    """
    vec = CountVectorizer()
    result = vec.fit_transform(texts)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; only fall back to it on releases without get_feature_names_out().
    if hasattr(vec, "get_feature_names_out"):
        vocab = np.asarray(vec.get_feature_names_out())
    else:
        vocab = np.array(vec.get_feature_names())
    return [vocab[result[i].indices] for i in range(result.shape[0])]
# average word length of words in a text
def wordLengthOld(texts):
    """Mean character length of the terms `tokenize` returns for each text.

    Returns a list with one mean per text, in input order.
    """
    averages = []
    for terms in tokenize(texts):
        term_lengths = [len(term) for term in terms]
        averages.append(np.mean(term_lengths))
    return averages

In [None]:
# NEW WORD LENGTH
def wordLength(texts):
    """Mean character length of the word tokens in each text.

    Every text is split with an nltk RegexpTokenizer on runs of word
    characters; repeated words are counted each time they occur.
    Returns a list with one mean per text, in input order.
    """
    word_pattern = nltk.RegexpTokenizer(r"\w+")
    averages = []
    for txt in texts:
        token_lengths = [len(token) for token in word_pattern.tokenize(txt)]
        averages.append(np.mean(token_lengths))
    return averages

In [None]:
# Add the mean word length of every excerpt to both frames.
for frame in (train, test):
    frame['wordLength'] = wordLength(frame["excerpt"])
#train.plot.scatter('target', 'wordLength')

In [None]:
# SENTENCE LENGTH
def sentenceLength(texts):
    """Mean number of word tokens per sentence, for each text.

    Sentences come from nltk's sent_tokenize; the words of each sentence are
    counted with a RegexpTokenizer on runs of word characters.
    Returns a list with one mean per text, in input order.
    """
    word_pattern = nltk.RegexpTokenizer(r"\w+")
    averages = []
    for txt in texts:
        words_per_sentence = [len(word_pattern.tokenize(sentence))
                              for sentence in sent_tokenize(txt)]
        averages.append(np.mean(words_per_sentence))
    return averages

def sentenceCount(texts):
    """Number of sentences nltk's sent_tokenize finds in each text."""
    counts = []
    for txt in texts:
        counts.append(len(sent_tokenize(txt)))
    return counts

In [None]:
# Add the mean sentence length (in words) of every excerpt to both frames.
for frame in (train, test):
    frame['sentenceLength'] = sentenceLength(frame["excerpt"])
#train.plot.scatter('target', 'sentenceLength')

In [None]:
# Add the number of sentences of every excerpt to both frames.
for frame in (train, test):
    frame['sentenceCount'] = sentenceCount(frame["excerpt"])
#train.plot.scatter('target', 'sentenceCount')

In [None]:
def syllables(texts):
    """Mean estimated syllable count per word token, for each text.

    Texts are split on runs of word characters and each token is scored by
    `_syllables`. Returns a list with one mean per text, in input order.
    """
    word_pattern = nltk.RegexpTokenizer(r"\w+")
    averages = []
    for txt in texts:
        per_word = [_syllables(token) for token in word_pattern.tokenize(txt)]
        averages.append(np.mean(per_word))
    return averages
    
def _syllables(word):
    syllable_count = 0
    vowels = 'aeiouy'
    if word[0] in vowels:
        syllable_count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            syllable_count += 1
    if word.endswith('e'):
        syllable_count -= 1
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        syllable_count += 1
    if syllable_count == 0:
        syllable_count += 1
    return syllable_count

In [None]:
# Add the mean syllables-per-word estimate of every excerpt to both frames.
for frame in (train, test):
    frame['syllables'] = syllables(frame["excerpt"])
#train.plot.scatter('target', 'syllables')

#### Stats based method

In [None]:
# Ordinary least squares on the five hand-crafted statistics. The model is
# fitted and then used to predict on the same training rows, so p_lr holds
# in-sample predictions.
stat_columns = ["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]
train_stats = train[stat_columns]
model_lr = LinearRegression()
model_lr.fit(train_stats, train["target"])
p_lr = model_lr.predict(train_stats)

In [None]:
# Mean squared error of the linear model on the rows it was fitted on
# (an in-sample figure, not a validation score).
mean_squared_error(train["target"], p_lr)

#### Stats based method with xgboost

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted trees on the same five statistics; as with the linear
# model, the predictions are made on the training rows themselves.
y = train["target"]
X_stats = train[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]]
model_stats = XGBRegressor(
    max_depth=10,
    n_estimators=100,
    objective="reg:squarederror",
)
model_stats.fit(X_stats, y)
p_stats = model_stats.predict(X_stats)

In [None]:
# Mean squared error of the XGBoost statistics model on its own training rows
# (in-sample, so it says nothing about generalisation).
mean_squared_error(train["target"], p_stats)

#### Ensemble xgboost and linear regression

In [None]:
#p_stats_test = model_stats.predict(test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]])
# p_lr_test = model_lr.predict(test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]])
# test["target"] = .7 * p_lr_test + .3 * p_stats_test
# test[["id", "target"]].to_csv("submission.csv", index=False)

#### Word2Vec with xgboost

In [None]:
import spacy
# Load the spaCy pipeline named "en_core_web_lg"; the cells below use
# nlp(text).vector as a fixed-length embedding of each excerpt.
nlp = spacy.load("en_core_web_lg")

In [None]:
# One spaCy document vector per training excerpt, stacked into a 2-D array.
train_vectors = []
for excerpt in train["excerpt"]:
    train_vectors.append(nlp(excerpt).vector)
embeddings = np.array(train_vectors)

In [None]:
# XGBoost on the spaCy document vectors alone; p_w2v is an in-sample prediction.
y = train["target"]
X_w2v = embeddings
model_w2v = XGBRegressor(
    max_depth=10,
    n_estimators=100,
    objective="reg:squarederror",
)
model_w2v.fit(X_w2v, y)
p_w2v = model_w2v.predict(X_w2v)

#### Word2vec with linear regression

In [None]:
# Linear regression on the document vectors; fit() returns the estimator, so
# construction and fitting can be chained. p_w2v_lr is an in-sample prediction.
model_w2v_lr = LinearRegression().fit(X_w2v, y)
p_w2v_lr = model_w2v_lr.predict(X_w2v)

#### Combination stats and word2vec (linear regression)

In [None]:
# Feature matrix = the five statistics followed by the embedding dimensions.
X_combo = np.concatenate([X_stats, X_w2v], axis=1)

# Linear regression on the combined features, scored on its own training rows.
model_combo_lr = LinearRegression().fit(X_combo, y)
p_combo_lr = model_combo_lr.predict(X_combo)

mean_squared_error(train["target"], p_combo_lr)

#### Combination stats and word2vec (xgboost)

In [None]:
# XGBoost on the combined statistics + embedding features.
model_combo_xgb = XGBRegressor(objective = "reg:squarederror", max_depth=10, n_estimators=100)
model_combo_xgb.fit(X_combo, y)
# Predict with the boosted model itself (not model_combo_lr) so that the MSE
# below really is the XGBoost model's in-sample error.
p_combo_xgb = model_combo_xgb.predict(X_combo)
mean_squared_error(train["target"], p_combo_xgb)

# Build the test feature matrix once, with the same column order as X_combo:
# the five statistics first, then the spaCy document vector.
embeddingsTest = np.array([nlp(text).vector for text in test["excerpt"]])
X_stats_test = test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]]
X_combo_test = np.concatenate((X_stats_test, embeddingsTest), axis = 1)

# Weighted blend of the two combined-feature models, written as the submission.
p_combo_lr_test = model_combo_lr.predict(X_combo_test)
p_combo_xgb_test = model_combo_xgb.predict(X_combo_test)
test["target"] = 0.7 * p_combo_lr_test + 0.3 * p_combo_xgb_test
test[["id", "target"]].to_csv("submission.csv", index=False)

#### Nearest neighbor

from sklearn.neighbors import KNeighborsRegressor
def cos(a, b):
    """Cosine of the angle between vectors `a` and `b`.

    Computed as dot(a, b) / sqrt(dot(a, a) * dot(b, b)).
    """
    inner = np.dot(a, b)
    norm_product = np.sqrt(np.dot(a, a) * np.dot(b, b))
    return inner / norm_product
# k-nearest-neighbour regression (k = 5) on the spaCy embeddings alone.
model_nbr = KNeighborsRegressor(n_neighbors=5)#, metric = cos)
model_nbr.fit(embeddings, y)
p_nbr = model_nbr.predict(embeddings)

mean_squared_error(train["target"], p_nbr)

# NOTE(review): a bare `.751` literal stood here; presumably a score copied
# from an earlier run - confirm which metric it refers to.

# Test-set features are built once and shared by both submissions below
# (the embedding pass over the test excerpts is the expensive part).
embeddingsTest = np.array([nlp(text).vector for text in test["excerpt"]])
X_stats_test = test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]]
X_combo_test = np.concatenate((X_stats_test, embeddingsTest), axis = 1)

# Submission variant 1: embedding-only nearest neighbours.
test["target"] = model_nbr.predict(embeddingsTest)
test[["id", "target"]].to_csv("submission.csv", index=False)

# Nearest neighbours on statistics + embeddings.
# NOTE(review): the columns are concatenated unscaled, so the statistic with
# the largest numeric range will dominate the neighbour distances - confirm
# this is intended.
X_combo_train = np.concatenate((X_stats, embeddings), axis = 1)
model_combo_nbr = KNeighborsRegressor(n_neighbors=5)#, metric = cos)
model_combo_nbr.fit(X_combo_train, y)
p_combo_nbr = model_combo_nbr.predict(X_combo_train)

mean_squared_error(train["target"], p_combo_nbr)

# Submission variant 2 (overwrites variant 1): weighted blend of the three
# combined-feature models.
p_combo_lr_test = model_combo_lr.predict(X_combo_test)
p_combo_xgb_test = model_combo_xgb.predict(X_combo_test)
p_combo_nbr_test = model_combo_nbr.predict(X_combo_test)
test["target"] = 0.6 * p_combo_lr_test + 0.3 * p_combo_xgb_test + 0.1 * p_combo_nbr_test
test[["id", "target"]].to_csv("submission.csv", index=False)

from sklearn.model_selection import KFold

def kfoldTrainPredict(X, y, X_submission, clfs, n_folds):
    """Stack `clfs` via out-of-fold predictions and a linear-regression blender.

    For every estimator, the training rows are split into `n_folds`
    consecutive folds. The estimator is refitted on each training split; its
    predictions for the held-out split become that estimator's column of the
    blender's training matrix, and its predictions for `X_submission` are
    averaged over the folds. A LinearRegression is then fitted on the
    out-of-fold columns and applied to the averaged submission columns.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    X_submission : array-like, shape (n_submission, n_features)
    clfs : sequence of estimators exposing fit(X, y) and predict(X)
        Each estimator is refitted in place once per fold.
    n_folds : int
        Number of KFold splits.

    Returns
    -------
    numpy.ndarray, shape (n_submission,)
        Blended predictions for `X_submission`.
    """
    # KFold below does not shuffle, so this only pins NumPy's global RNG for
    # any estimator in `clfs` that draws from it.
    np.random.seed(0)

    # Work on plain arrays so the fold indices are always positional, whatever
    # container (DataFrame, Series with any index, ndarray) the caller passes.
    X = np.asarray(X)
    y = np.asarray(y)
    X_submission = np.asarray(X_submission)

    folds = list(KFold(n_splits=n_folds).split(X))

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_validate = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        # One column of submission predictions per fold, averaged afterwards.
        submission_by_fold = np.zeros((X_submission.shape[0], len(folds)))
        for i, (train_idx, validate_idx) in enumerate(folds):
            clf.fit(X[train_idx], y[train_idx])
            dataset_blend_train[validate_idx, j] = clf.predict(X[validate_idx])
            submission_by_fold[:, i] = clf.predict(X_submission)
        dataset_blend_validate[:, j] = submission_by_fold.mean(1)

    blender = LinearRegression()
    blender.fit(dataset_blend_train, y)
    return blender.predict(dataset_blend_validate)

# Base estimators for the stacked ensemble.
clfs = [XGBRegressor(objective = "reg:squarederror", max_depth=10, n_estimators=100),
       LinearRegression()]
# Training and submission matrices share one column order: statistics first,
# then the spaCy document vector.
X = np.concatenate((X_stats, embeddings), 1)
embeddingsTest = np.array([nlp(text).vector for text in test["excerpt"]])
X_stats_test = test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]]
X_submission = np.concatenate((X_stats_test, embeddingsTest), axis = 1)

# y_submission must be computed in this cell; otherwise the assignment below
# only works when the name happens to be left over in the kernel.
y_submission = kfoldTrainPredict(X, y, X_submission, clfs, 10)

test["target"] = y_submission
test[["id", "target"]].to_csv("submission.csv", index=False)


#### Roberta

In [None]:
import tensorflow as tf



#optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1))

In [None]:
# Run-mode switches read further down in this cell:
#   from_saved  - True: skip fine-tuning; only build the model and load saved weights.
#   kaggle      - True: read the CSVs from ../input/commonlitreadabilityprize,
#                 False: read a local train.csv and use its first rows as a stand-in test set.
#   hasInternet - True: download roberta-base and save local copies,
#                 False: load tokenizer and encoder from ../input/roberta.
from_saved = True
kaggle = True
hasInternet = False

from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
import pandas as pd
import numpy as np

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)

class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by linear decay to zero.

    With total_step = nsamples / batch_size * epoch:

    * for the first 10% of total_step the rate rises linearly from
      learning_rate_scaling * initial_learning_rate to initial_learning_rate;
    * afterwards it falls linearly and reaches 0 at total_step.

    Parameters
    ----------
    initial_learning_rate : float
        Peak learning rate, reached at the end of warm-up.
    learning_rate_scaling : float
        Fraction of the peak rate used at step 0.
    epoch, batch_size, nsamples : numbers
        Used only to derive total_step.
    """

    def __init__(self, initial_learning_rate, learning_rate_scaling, epoch, batch_size, nsamples):
        super().__init__()
        self.total_step = tf.cast(nsamples/batch_size * epoch, tf.float32)
        self.initial_learning_rate = tf.cast(initial_learning_rate, tf.float32)
        self.learning_rate_scaling = tf.cast(learning_rate_scaling, tf.float32)

    def __call__(self, step):
        """Return the learning rate for optimizer iteration `step`."""
        # The optimizer may hand over its iteration counter as an integer
        # tensor; cast it so it can be combined with the float32 attributes.
        step = tf.cast(step, tf.float32)
        # Warm-up branch: line from (0, scaling * lr) to (0.1 * total, lr).
        r1 = (1 - self.learning_rate_scaling) * 10 * self.initial_learning_rate / self.total_step * step + self.learning_rate_scaling * self.initial_learning_rate
        # Decay branch: line from (0.1 * total, lr) to (total, 0).
        r2 = -self.initial_learning_rate / (0.9 * self.total_step) * step + 10/9 * self.initial_learning_rate
        return tf.cond(step < 0.1 * self.total_step, lambda: r1, lambda: r2)

def root_mean_squared_error(y_true, y_pred):
    """Square root of the mean squared difference between y_pred and y_true.

    Built from Keras backend ops so it can serve as both loss and metric.
    """
    K = tf.keras.backend
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Tokenizer and encoder: download roberta-base and keep local copies when
# online, otherwise load the copies stored under ../input/roberta.
if hasInternet:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta = TFRobertaModel.from_pretrained('roberta-base')
    tokenizer.save_pretrained('tokenizer')
    roberta.save_pretrained('roberta')
else:
    tokenizer = RobertaTokenizer.from_pretrained('../input/roberta')
    roberta = TFRobertaModel.from_pretrained('../input/roberta')

#inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
#outputs = roberta(inputs)
#last_hidden_states = outputs.last_hidden_state

tf.random.set_seed(1)
if kaggle:
    train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    test0 = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
else:
    train = pd.read_csv("train.csv")
    # Offline stand-in for the test set: the first 7 training rows.
    test0 = train[:7]

n = train.shape[0]
from sklearn.model_selection import train_test_split
# tf.random.set_seed does not control scikit-learn's shuffling, so the split
# gets its own fixed seed to make the train/validation partition repeatable.
df_train, df_valid = train_test_split(train, test_size = 0.15, random_state = 1)


# Training hyperparameters; batch_size scales with the number of GPUs.
epoch=50
ngpus = 1
batch_size = 16*ngpus
# Every excerpt is padded or truncated to this many tokens.
MAX_LEN = 256
def encode_text(txt, labels, tokenizer):
    """Yield one (input_ids, attention_mask, label) triple per text.

    Each text is tokenized on its own, with special tokens added, and padded
    or truncated to MAX_LEN tokens. The tokenizer returns tensors with a
    leading batch axis of size 1, which is stripped with [0].
    """
    for excerpt, label in zip(txt, labels):
        encoding = tokenizer(
            excerpt,
            max_length=MAX_LEN,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf',
        )
        yield (encoding.input_ids[0], encoding.attention_mask[0], label)
# Training pipeline: excerpts are tokenized lazily by encode_text, regrouped as
# ({"input_ids", "attention_mask"}, target), batched, and the batched dataset
# is repeated `epoch` times.
# The lambdas look up df_train / df_valid / test0 / tokenizer when a generator
# is started, not when the dataset object is defined.
ds = tf.data.Dataset.from_generator(lambda: encode_text(df_train.excerpt, df_train.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds = ds.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size).repeat(epoch)

# Validation pipeline: same element structure, batched but not repeated.
ds_valid = tf.data.Dataset.from_generator(lambda: encode_text(df_valid.excerpt, df_valid.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_valid = ds_valid.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

# Test pipeline: constant 1.0 placeholders stand in for the labels so the
# elements keep the same (ids, mask, label) structure.
n_test = test0.shape[0]; test_targets = np.array([1.0 for i in range(n_test)])
ds_test = tf.data.Dataset.from_generator(lambda: encode_text(test0.excerpt, test_targets, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_test = ds_test.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

class text_model(tf.keras.Model):
    """Transformer encoder followed by a 256 -> 64 -> 1 dense regression head.

    `base` is the encoder; its pooler_output is fed through two ReLU layers
    and a final linear unit that produces the prediction.
    """

    def __init__(self, base):
        super().__init__(name="text_model")
        self.bert = base
        self.dense1 = tf.keras.layers.Dense(256, activation='relu', name="dense1")
        self.dense2 = tf.keras.layers.Dense(64, activation='relu', name="dense2")
        self.dense3 = tf.keras.layers.Dense(1, activation='linear', name="dense3")

    def call(self, x):
        """Map a dict of input_ids / attention_mask to one value per example."""
        pooled = self.bert(x).pooler_output
        hidden = self.dense2(self.dense1(pooled))
        return self.dense3(hidden)


if not from_saved:
    # Fine-tune, keeping only the weights of the epoch with the best val_loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="auto",
        save_freq="epoch",
        options=None
    )
    n = df_train.shape[0]
    # Keras expects an integer number of steps; round up so the final partial
    # batch of each pass over df_train is included.
    # NOTE(review): batch_size already contains the ngpus factor, so the extra
    # division by ngpus only matters when ngpus > 1 - confirm it is intended.
    steps_per_epoch = int(np.ceil(n/batch_size/ngpus))
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=5e-5, decay_steps=n/batch_size/ngpus, decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    model.fit(ds, batch_size = batch_size, steps_per_epoch = steps_per_epoch,
              epochs = epoch, validation_data = ds_valid, callbacks=[checkpoint])
else:
    # Inference only: build the model once and reuse the module-level
    # root_mean_squared_error defined earlier in this cell.
    optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5, epsilon = 1e-8)
    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    n = df_train.shape[0]
    # A single training step so the subclassed model has been called (and its
    # variables exist) before load_weights is used below.
    model.fit(ds, batch_size = batch_size, steps_per_epoch = 1, epochs = 1)

#this is 128x64x0 model.load_weights("../input/model-weights-04934/model.h5")
# Test predictions from two saved checkpoints; [:,0] drops the size-1 output axis.
model.load_weights("../input/model-256-64-1-05134/model_256_64_1_05134.h5")
p_roberta1 = model.predict(ds_test)[:,0]
model.load_weights("../input/model-256-64-04983/model_256_64_1_04983.h5")
p_roberta2 = model.predict(ds_test)[:,0]
#print("rmse = ", root_mean_squared_error(p[:,0], df_valid.target))

#p = model.predict(ds_test)

In [None]:
# base 2
# Run-mode switches read further down in this cell:
#   from_saved  - True: skip fine-tuning; only build the model and load saved weights.
#   kaggle      - True: read the CSVs from ../input/commonlitreadabilityprize,
#                 False: read a local train.csv and use its first rows as a stand-in test set.
#   hasInternet - True: download roberta-base and save local copies,
#                 False: load tokenizer and encoder from ../input/roberta.
from_saved = True
kaggle = True
hasInternet = False

from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
import pandas as pd
import numpy as np

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)

class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up followed by linear decay to zero.

    With total_step = nsamples / batch_size * epoch:

    * for the first 10% of total_step the rate rises linearly from
      learning_rate_scaling * initial_learning_rate to initial_learning_rate;
    * afterwards it falls linearly and reaches 0 at total_step.

    Parameters
    ----------
    initial_learning_rate : float
        Peak learning rate, reached at the end of warm-up.
    learning_rate_scaling : float
        Fraction of the peak rate used at step 0.
    epoch, batch_size, nsamples : numbers
        Used only to derive total_step.
    """

    def __init__(self, initial_learning_rate, learning_rate_scaling, epoch, batch_size, nsamples):
        super().__init__()
        self.total_step = tf.cast(nsamples/batch_size * epoch, tf.float32)
        self.initial_learning_rate = tf.cast(initial_learning_rate, tf.float32)
        self.learning_rate_scaling = tf.cast(learning_rate_scaling, tf.float32)

    def __call__(self, step):
        """Return the learning rate for optimizer iteration `step`."""
        # The optimizer may hand over its iteration counter as an integer
        # tensor; cast it so it can be combined with the float32 attributes.
        step = tf.cast(step, tf.float32)
        # Warm-up branch: line from (0, scaling * lr) to (0.1 * total, lr).
        r1 = (1 - self.learning_rate_scaling) * 10 * self.initial_learning_rate / self.total_step * step + self.learning_rate_scaling * self.initial_learning_rate
        # Decay branch: line from (0.1 * total, lr) to (total, 0).
        r2 = -self.initial_learning_rate / (0.9 * self.total_step) * step + 10/9 * self.initial_learning_rate
        return tf.cond(step < 0.1 * self.total_step, lambda: r1, lambda: r2)

def root_mean_squared_error(y_true, y_pred):
    """Square root of the mean squared difference between y_pred and y_true.

    Built from Keras backend ops so it can serve as both loss and metric.
    """
    K = tf.keras.backend
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Tokenizer and encoder: download roberta-base and keep local copies when
# online, otherwise load the copies stored under ../input/roberta.
if hasInternet:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta = TFRobertaModel.from_pretrained('roberta-base')
    tokenizer.save_pretrained('tokenizer')
    roberta.save_pretrained('roberta')
else:
    tokenizer = RobertaTokenizer.from_pretrained('../input/roberta')
    roberta = TFRobertaModel.from_pretrained('../input/roberta')

#inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
#outputs = roberta(inputs)
#last_hidden_states = outputs.last_hidden_state

tf.random.set_seed(1)
if kaggle:
    train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    test0 = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
else:
    train = pd.read_csv("train.csv")
    # Offline stand-in for the test set: the first 7 training rows.
    test0 = train[:7]

n = train.shape[0]
from sklearn.model_selection import train_test_split
# tf.random.set_seed does not control scikit-learn's shuffling, so the split
# gets its own fixed seed to make the train/validation partition repeatable.
df_train, df_valid = train_test_split(train, test_size = 0.15, random_state = 1)


# Training hyperparameters; batch_size scales with the number of GPUs.
epoch=50
ngpus = 1
batch_size = 16*ngpus
# Every excerpt is padded or truncated to this many tokens.
MAX_LEN = 256
def encode_text(txt, labels, tokenizer):
    """Yield one (input_ids, attention_mask, label) triple per text.

    Each text is tokenized on its own, with special tokens added, and padded
    or truncated to MAX_LEN tokens. The tokenizer returns tensors with a
    leading batch axis of size 1, which is stripped with [0].
    """
    for excerpt, label in zip(txt, labels):
        encoding = tokenizer(
            excerpt,
            max_length=MAX_LEN,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf',
        )
        yield (encoding.input_ids[0], encoding.attention_mask[0], label)
# Training pipeline: excerpts are tokenized lazily by encode_text, regrouped as
# ({"input_ids", "attention_mask"}, target), batched, and the batched dataset
# is repeated `epoch` times.
# The lambdas look up df_train / df_valid / test0 / tokenizer when a generator
# is started, not when the dataset object is defined.
ds = tf.data.Dataset.from_generator(lambda: encode_text(df_train.excerpt, df_train.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds = ds.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size).repeat(epoch)

# Validation pipeline: same element structure, batched but not repeated.
ds_valid = tf.data.Dataset.from_generator(lambda: encode_text(df_valid.excerpt, df_valid.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_valid = ds_valid.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

# Test pipeline: constant 1.0 placeholders stand in for the labels so the
# elements keep the same (ids, mask, label) structure.
n_test = test0.shape[0]; test_targets = np.array([1.0 for i in range(n_test)])
ds_test = tf.data.Dataset.from_generator(lambda: encode_text(test0.excerpt, test_targets, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_test = ds_test.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

class text_model(tf.keras.Model):
    """Transformer encoder followed by a 256 -> 1 dense regression head.

    `base` is the encoder; its pooler_output goes through one ReLU layer and
    a final linear unit. A 64-unit `dense2` layer is also constructed, but
    call() does not apply it.
    """

    def __init__(self, base):
        super().__init__(name="text_model")
        self.bert = base
        self.dense1 = tf.keras.layers.Dense(256, activation='relu', name="dense1")
        self.dense2 = tf.keras.layers.Dense(64, activation='relu', name="dense2")
        self.dense3 = tf.keras.layers.Dense(1, activation='linear', name="dense3")

    def call(self, x):
        """Map a dict of input_ids / attention_mask to one value per example."""
        pooled = self.bert(x).pooler_output
        hidden = self.dense1(pooled)
        return self.dense3(hidden)


if not from_saved:
    # Fine-tune, keeping only the weights of the epoch with the best val_loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="auto",
        save_freq="epoch",
        options=None
    )
    n = df_train.shape[0]
    # Keras expects an integer number of steps; round up so the final partial
    # batch of each pass over df_train is included.
    # NOTE(review): batch_size already contains the ngpus factor, so the extra
    # division by ngpus only matters when ngpus > 1 - confirm it is intended.
    steps_per_epoch = int(np.ceil(n/batch_size/ngpus))
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=5e-5, decay_steps=n/batch_size/ngpus, decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    model.fit(ds, batch_size = batch_size, steps_per_epoch = steps_per_epoch,
              epochs = epoch, validation_data = ds_valid, callbacks=[checkpoint])
else:
    # Inference only: build the model once and reuse the module-level
    # root_mean_squared_error defined earlier in this cell.
    optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5, epsilon = 1e-8)
    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    n = df_train.shape[0]
    # A single training step so the subclassed model has been called (and its
    # variables exist) before load_weights is used below.
    model.fit(ds, batch_size = batch_size, steps_per_epoch = 1, epochs = 1)

#this is 128x64x0 model.load_weights("../input/model-weights-04934/model.h5")
# One test prediction vector per saved fold checkpoint (files are numbered 1-5);
# [:,0] drops the size-1 output axis.
p_folds = []
for fold in range(0,5):
    model.load_weights("../input/5fold-2561step5e5save10pass/model_fold" + str(fold + 1) + ".h5")
    p = model.predict(ds_test)
    p_folds.append(p[:,0])
#print("rmse = ", root_mean_squared_error(p[:,0], df_valid.target))

#p = model.predict(ds_test)

In [None]:
# Run-mode switches read further down in this cell:
#   from_saved  - True: skip fine-tuning; only build the model and load saved weights.
#   kaggle      - True: read the CSVs from ../input/commonlitreadabilityprize,
#                 False: read a local train.csv and use its first rows as a stand-in test set.
#   hasInternet - True: download the pretrained tokenizer/encoder and save local copies,
#                 False: load them from ../input/tokenizerlarge and ../input/roberta-large.
from_saved = True
kaggle = True
hasInternet = False

from transformers import RobertaTokenizer, TFRobertaModel
import tensorflow as tf
import pandas as pd
import numpy as np

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)

def root_mean_squared_error(y_true, y_pred):
    """Square root of the mean squared difference between y_pred and y_true.

    Built from Keras backend ops so it can serve as both loss and metric.
    """
    K = tf.keras.backend
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

# Tokenizer and encoder for the large model. The offline branch loads
# roberta-large and the checkpoints loaded at the end of this cell are
# "model_large_*" files, so the online branch must fetch roberta-large as well
# (roberta-base would not match those weights).
if hasInternet:
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
    roberta = TFRobertaModel.from_pretrained('roberta-large')
    tokenizer.save_pretrained('tokenizer')
    roberta.save_pretrained('roberta')
else:
    tokenizer = RobertaTokenizer.from_pretrained('../input/tokenizerlarge')
    roberta = TFRobertaModel.from_pretrained('../input/roberta-large')

#inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
#outputs = roberta(inputs)
#last_hidden_states = outputs.last_hidden_state

tf.random.set_seed(1)
if kaggle:
    train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
    test0 = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
else:
    train = pd.read_csv("train.csv")
    # Offline stand-in for the test set: the first 7 training rows.
    test0 = train[:7]

n = train.shape[0]
from sklearn.model_selection import train_test_split
# tf.random.set_seed does not control scikit-learn's shuffling, so the split
# gets its own fixed seed to make the train/validation partition repeatable.
df_train, df_valid = train_test_split(train, test_size = 0.15, random_state = 1)


# Training hyperparameters; batch_size scales with the number of GPUs.
epoch=50
ngpus = 1
batch_size = 4*ngpus
# Every excerpt is padded or truncated to this many tokens.
MAX_LEN = 256
def encode_text(txt, labels, tokenizer):
    """Yield one (input_ids, attention_mask, label) triple per text.

    Each text is tokenized on its own, with special tokens added, and padded
    or truncated to MAX_LEN tokens. The tokenizer returns tensors with a
    leading batch axis of size 1, which is stripped with [0].
    """
    for excerpt, label in zip(txt, labels):
        encoding = tokenizer(
            excerpt,
            max_length=MAX_LEN,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf',
        )
        yield (encoding.input_ids[0], encoding.attention_mask[0], label)
# Training pipeline: excerpts are tokenized lazily by encode_text, regrouped as
# ({"input_ids", "attention_mask"}, target), batched, and the batched dataset
# is repeated `epoch` times.
# The lambdas look up df_train / df_valid / test0 / tokenizer when a generator
# is started, not when the dataset object is defined.
ds = tf.data.Dataset.from_generator(lambda: encode_text(df_train.excerpt, df_train.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds = ds.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size).repeat(epoch)

# Validation pipeline: same element structure, batched but not repeated.
ds_valid = tf.data.Dataset.from_generator(lambda: encode_text(df_valid.excerpt, df_valid.target, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_valid = ds_valid.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

# Test pipeline: constant 1.0 placeholders stand in for the labels so the
# elements keep the same (ids, mask, label) structure.
n_test = test0.shape[0]; test_targets = np.array([1.0 for i in range(n_test)])
ds_test = tf.data.Dataset.from_generator(lambda: encode_text(test0.excerpt, test_targets, tokenizer), 
                                    output_types = (tf.int32, tf.int32, tf.float64),
                                    output_shapes = ((MAX_LEN,),(MAX_LEN,),()))
ds_test = ds_test.map(lambda a,b,c: ({"input_ids":a, "attention_mask":b}, c)).batch(batch_size)

class text_model(tf.keras.Model):
    """Transformer encoder followed by a 256 -> 64 -> 1 dense regression head.

    `base` is the encoder; its pooler_output is fed through two ReLU layers
    and a final linear unit that produces the prediction.
    """

    def __init__(self, base):
        super().__init__(name="text_model")
        self.bert = base
        self.dense1 = tf.keras.layers.Dense(256, activation='relu', name="dense1")
        self.dense2 = tf.keras.layers.Dense(64, activation='relu', name="dense2")
        self.dense3 = tf.keras.layers.Dense(1, activation='linear', name="dense3")

    def call(self, x):
        """Map a dict of input_ids / attention_mask to one value per example."""
        pooled = self.bert(x).pooler_output
        hidden = self.dense2(self.dense1(pooled))
        return self.dense3(hidden)


if not from_saved:
    # Fine-tune, keeping only the weights of the epoch with the best val_loss.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        "model.h5",
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="auto",
        save_freq="epoch",
        options=None
    )
    # Use the size of the training split (not of the full train frame) for
    # both the decay schedule and the step count, so the two agree.
    n = df_train.shape[0]
    # Keras expects an integer number of steps; round up so the final partial
    # batch of each pass over df_train is included.
    # NOTE(review): batch_size already contains the ngpus factor, so the extra
    # division by ngpus only matters when ngpus > 1 - confirm it is intended.
    steps_per_epoch = int(np.ceil(n/batch_size/ngpus))
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=5e-5, decay_steps=n/batch_size/ngpus, decay_rate=0.9)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    model.fit(ds, batch_size = batch_size, steps_per_epoch = steps_per_epoch,
          epochs = epoch, validation_data = ds_valid, callbacks=[checkpoint])
else:
    # Inference only: build the model once and reuse the module-level
    # root_mean_squared_error defined earlier in this cell.
    optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5, epsilon = 1e-8)
    loss_fn = root_mean_squared_error #tf.keras.losses.MeanSquaredError()
    model = text_model(roberta)
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[root_mean_squared_error])
    n = df_train.shape[0]
    # A single training step so the subclassed model has been called (and its
    # variables exist) before load_weights is used below.
    model.fit(ds, batch_size = 1, steps_per_epoch = 1, epochs = 1)

#this is 128x64x0 model.load_weights("../input/model-weights-04934/model.h5")
# Test predictions from two saved checkpoints; [:,0] drops the size-1 output axis.
model.load_weights("../input/model-large-256-64-1-048277/model_large_256_64_1_048277.h5")
p_roberta3 = model.predict(ds_test)[:,0]
model.load_weights("../input/model-large-256-64-1-046656/model_large_256_64_1_046656.h5")
p_roberta4 = model.predict(ds_test)[:,0]
#print("rmse = ", root_mean_squared_error(p[:,0], df_valid.target))

#p = model.predict(ds_test)

In [None]:
# Weighted blend of the four individually saved checkpoints.
checkpoint_blend = .3 * p_roberta4 + .3 * p_roberta3 + .3 * p_roberta2 + .1 * p_roberta1
# Equal-weight average of the five fold models.
p_f = sum(0.2 * fold_pred for fold_pred in p_folds[:5])
# Final RoBERTa prediction: 80% checkpoint blend, 20% fold average.
p_roberta = .8 * checkpoint_blend + .2 * p_f

In [None]:
# Test features for the classical models: the five statistics followed by the
# spaCy document vector, assembled once and shared by both models.
embeddingsTest = np.array([nlp(text).vector for text in test["excerpt"]])
X_stats_test = test[["wordCount", "wordLength", "sentenceLength", "sentenceCount", "syllables"]]
X_combo_test = np.concatenate((X_stats_test, embeddingsTest), axis = 1)
p_combo_lr_test = model_combo_lr.predict(X_combo_test)
p_combo_xgb_test = model_combo_xgb.predict(X_combo_test)

# Final submission: the RoBERTa blend carries 95% of the weight, the two
# classical models the remaining 5%.
test["target"] = 0.03 * p_combo_lr_test + 0.02 * p_combo_xgb_test + .95 * p_roberta
test[["id", "target"]].to_csv("submission.csv", index=False)

In [None]:
!cat submission.csv

In [None]:
# IDEAS TO TRY:
# parts of speech (hypothesis: more adjectives/adverbs = easier readability)
# Tfidf
# word2vec
