In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import collections

import warnings
warnings.filterwarnings("ignore")

# Print the full path of every file found beneath the Kaggle input directory.
for filepath in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(filepath)

In [None]:
# Load the train table, the test table and the sample submission from the
# Kaggle input directory.
train = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/test.csv')
ss = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv')
# Show which columns the training table provides.
print(train.columns)

In [None]:
def make_l(v):
    """Return the length of ``v`` as reported by the built-in ``len``."""
    size = len(v)
    return size
# For each listed column, store the per-row length in a sibling "<column>_L" column.
for col in ("full_text",):
    train[f"{col}_L"] = train[col].apply(make_l)

In [None]:
# Summary statistics of the full_text lengths (len() of each entry, so
# presumably character counts rather than token counts) at every decile.
train.full_text_L.describe(percentiles=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])

In [None]:
# Fixed token sequence length used when padding/truncating in the tokenizer and
# when parsing TFRecords: six windows of 512 (MyModel.call slices its input in
# 512-wide pieces).
max_seq_length = 512*6
max_seq_length

In [None]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

In [None]:
# Load the tokenizer and encoder from the DistilBERT checkpoint. The classes are
# DistilBERT classes, so the checkpoint must be a DistilBERT one: pointing them at
# a BERT checkpoint does not line up with the DistilBERT architecture, so the
# encoder (which MyModel freezes) would not get usable pretrained weights.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = TFDistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# model

In [None]:
class MyModel(tf.keras.Model):
    """Frozen DistilBERT encoder followed by a BiLSTM and dense classification head.

    The input is a dict with ``"input_ids"`` and ``"input_mask"`` tensors whose
    second axis is sliced in 512-wide windows. Four of the six windows
    (indices 0, 2, 3 and 5) are encoded separately by the wrapped model's
    ``distilbert`` layer, concatenated along the sequence axis, fed through a
    bidirectional LSTM, max-pooled over the sequence axis and passed through
    the dense head. The 66 outputs are reshaped to ``(-1, 6, 11)`` and a
    softmax is applied over the last axis.

    Args:
        model: a pretrained model exposing a ``distilbert`` main layer; it is
            marked non-trainable here.
    """

    # Width of one encoder window and the indices of the windows that are used.
    _WINDOW = 512
    _WINDOW_INDICES = (0, 2, 3, 5)

    def __init__(self,model):
        super().__init__()
        self.model = model
        # The encoder is used purely as a frozen feature extractor.
        self.model.trainable = False
        self.Bidirectional = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(20, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
        )
        self.layersG = tf.keras.layers.GlobalMaxPool1D()
        self.layers1 = tf.keras.layers.Dense(200, activation="relu")
        self.layersD2 = tf.keras.layers.Dropout(0.2)
        self.layers3 = tf.keras.layers.Dense(100, activation="relu")
        self.layersD4 = tf.keras.layers.Dropout(0.2)
        self.layers5 = tf.keras.layers.Dense(66)
        # NOTE(review): this ReLU feeds the softmax below, so the pre-softmax
        # scores can never be negative - confirm this is intended.
        self.layers6 = tf.keras.layers.Dense(66, activation="relu")
        self.softmax = tf.keras.layers.Softmax(-1)

    def _encode_window(self, input_ids, input_mask, index):
        """Encode the ``index``-th 512-wide window and return the encoder's first output."""
        start = index * self._WINDOW
        stop = start + self._WINDOW
        return self.model.distilbert(input_ids[:, start:stop], input_mask[:, start:stop])[0]

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: dict with ``"input_ids"`` and ``"input_mask"`` tensors.

        Returns:
            Tensor of shape ``(-1, 6, 11)`` holding a softmax distribution
            along the last axis.
        """
        input_ids = inputs["input_ids"]
        input_mask = inputs["input_mask"]
        # Encode each selected window independently, then stitch them back
        # together along the sequence axis.
        embedding_layer = tf.concat(
            [self._encode_window(input_ids, input_mask, index) for index in self._WINDOW_INDICES],
            axis=1,
        )
        X = self.Bidirectional(embedding_layer)
        X = self.layersG(X)
        X = self.layers1(X)
        X = self.layersD2(X)
        X = self.layers3(X)
        X = self.layersD4(X)
        X = self.layers5(X)
        X = self.layers6(X)
        # 66 outputs -> 6 groups of 11 scores, normalised per group.
        X = tf.reshape(X, (-1, 6, 11))
        return self.softmax(X)

In [None]:
def create_tf_data_file(dt,output_file):
    """Tokenize every row of ``dt`` and write it as a ``tf.train.Example`` to a TFRecord file.

    Each record holds ``input_ids``, ``input_mask`` and ``segment_ids``
    (all of length ``max_seq_length``) plus ``label_ids``: the six score
    columns divided by 0.5 and cast to int64.

    Args:
        dt: DataFrame with a ``full_text`` column and the six score columns
            ``cohesion``, ``syntax``, ``vocabulary``, ``phraseology``,
            ``grammar`` and ``conventions``.
        output_file: path of the TFRecord file to create.
    """
    label_columns = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
    # Convert all labels once instead of rebuilding the array for every row.
    label_matrix = (dt[label_columns].values / 0.5).astype(np.int64)
    # Positional access, so a frame with a non-default index is handled correctly.
    texts = dt["full_text"].values
    # Every record carries the same all-zero segment ids.
    segment_ids = [0] * max_seq_length

    # The context manager closes the writer even if tokenization or writing fails.
    with tf.io.TFRecordWriter(output_file) as tf_record_writer:
        for text, labels in zip(texts, label_matrix):
            encode_plus_tokens = tokenizer.encode_plus(
                text, padding='max_length', max_length=max_seq_length, truncation=True
            )
            all_features = collections.OrderedDict()
            all_features["input_ids"] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=encode_plus_tokens["input_ids"]))
            all_features["input_mask"] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=encode_plus_tokens["attention_mask"]))
            all_features["segment_ids"] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=segment_ids))
            all_features["label_ids"] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=labels))
            tf_record = tf.train.Example(features=tf.train.Features(feature=all_features))
            tf_record_writer.write(tf_record.SerializeToString())

In [None]:
# Serialize the training table into a TFRecord file in the working directory.
create_tf_data_file(train,"train_data.tfrecord")

In [None]:
def select_data_and_label_from_record(record):
    """Split a parsed record into ``(features, labels)``.

    The features dict keeps only ``input_ids`` and ``input_mask``; the labels
    are the record's ``label_ids`` entry.
    """
    feature_keys = ("input_ids", "input_mask")
    features = {key: record[key] for key in feature_keys}
    return (features, record["label_ids"])
def _decode_record(record, name_to_features):
    """Parse one serialized example according to the ``name_to_features`` spec."""
    parsed = tf.io.parse_single_example(record, name_to_features)
    return parsed

In [None]:
def create_train_test_data(file_name,isTrain = False, batch_size=100, repeat_count=40):
    """Build a batched ``tf.data`` pipeline of ``(features, labels)`` from a TFRecord file.

    Args:
        file_name: path of the TFRecord file to read.
        isTrain: when true, the record stream is repeated ``repeat_count`` times.
        batch_size: number of records per batch (the last batch may be smaller).
        repeat_count: how many passes over the file a training dataset yields.

    Returns:
        A dataset whose elements are ``(x, y)`` where ``x`` is a dict with
        ``input_ids`` and ``input_mask`` and ``y`` is ``label_ids``.
    """
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "label_ids": tf.io.FixedLenFeature([6], tf.int64),
    }

    dataset = tf.data.TFRecordDataset(file_name)
    if isTrain:
        dataset = dataset.repeat(repeat_count)

    # map + batch replaces the deprecated experimental map_and_batch transformation.
    dataset = dataset.map(
        lambda record: _decode_record(record, name_to_features),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    dataset = dataset.batch(batch_size, drop_remainder=False)
    dataset = dataset.map(select_data_and_label_from_record)

    # No cache() here: placed after repeat() it would keep every repeated pass
    # in memory. Prefetch goes last so it overlaps the whole pipeline with training.
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Build the training pipeline; passing True enables the repeat of the record stream.
train_dataset = create_train_test_data("train_data.tfrecord",True)

In [None]:
# Display the dataset object to inspect its element structure.
train_dataset

In [None]:
# Optimizer hyper-parameters.
learning_rate, epsilon = 1e-3, 1e-6

# Labels are integer class ids, hence the sparse loss and accuracy metric.
optimizer = tf.keras.optimizers.Adam(epsilon=epsilon, learning_rate=learning_rate)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")
loss = tf.keras.losses.SparseCategoricalCrossentropy()

# cpu or gpu
# Alternative: resume from a previously saved model instead of building a new one.
#mode_text = tf.keras.models.load_model("../input/english-language-learning-model/model_text_kaggle_001.ml")
mode_text = MyModel(model)
mode_text.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Train for 4 epochs of 100 steps each; the batch size comes from the dataset itself.
history = mode_text.fit(train_dataset, epochs=4, steps_per_epoch=100)

In [None]:
# Persist the trained model to the working directory.
mode_text.save("model_text_kaggle_002.ml")