In [21]:
import pandas as pd
import numpy as np

from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU, Rescaling
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from keras.preprocessing.text import Tokenizer

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split as tts

import lightgbm as lgb

# Data Processing

In [82]:
seed = 0

train_df = pd.read_csv("../input/train.csv", index_col="text_id")
X_train = train_df.full_text
cols = [col for col in train_df.columns if col != "full_text"]
y_train = train_df[cols]
X_test = pd.read_csv("../input/test.csv", index_col="text_id").full_text
X_test_idx = X_test.index

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_matrix(X_train, "tfidf")
X_test = tokenizer.texts_to_matrix(X_test, "tfidf")

pca = PCA(n_components=100, whiten=True, random_state=seed)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train, X_val, y_train, y_val = tts(X_train, y_train, test_size=.1, random_state=seed)

lgb_trains = {}
lgb_vals = {}
for col in cols:
    exec(f"lgb_trains['{col}'] = lgb.Dataset(X_train, y_train.{col})")
    exec(f"lgb_vals['{col}'] = lgb.Dataset(X_val, y_val.{col})")

# Custom Loss Function - MCRMSE

In [56]:
@tf.autograph.experimental.do_not_convert
def MCRMSE_keras(y_true, y_pred):
    return tf.reduce_mean(tf.reduce_mean(tf.square(y_true - y_pred), axis=1))

def MCRMSE_lgb(preds, eval_data):
    diff = eval_data - preds
    sq = np.square(diff)
    rmse = np.sum(sq, axis=0) / eval_data.shape[0]
    return "MCRMSE", np.sum(rmse) / eval_data.shape[1], False

# Build Models

## keras 1

In [33]:
keras1_model = Sequential()
keras1_model.add(Dense(500, input_dim=X_train.shape[1], activation="relu"))
keras1_model.add(BatchNormalization())
keras1_model.add(Dense(500, activation="relu"))
keras1_model.add(Dropout(.3))
keras1_model.add(Dense(500, activation=LeakyReLU(.1)))
keras1_model.add(Dropout(.2))
keras1_model.add(Dense(500, activation="relu"))
keras1_model.add(Dense(y_train.shape[1], activation="sigmoid"))
keras1_model.add(Rescaling(4, offset=1))

optimizer = optimizers.Adam(amsgrad=True)
keras1_model.compile(loss=MCRMSE_keras, optimizer=optimizer, metrics=[MCRMSE_keras])
keras1_model.fit(X_train, y_train, batch_size=2**3, epochs=30, verbose=1,
          validation_data=(X_val, y_val), workers=30, use_multiprocessing=True,
          callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)])

keras1_pred = pd.DataFrame(keras1_model.predict(X_test), columns=cols, index=X_test_idx)
keras1_pred

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f86940b2fa0>



Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,3.038789,2.52582,3.111001,2.971141,2.554483,2.692028
000BAD50D026,2.906303,2.725901,2.728677,2.433096,2.416244,2.971677
00367BB2546B,4.091075,3.59216,3.719064,3.751686,3.624677,3.804587


## keras 2

In [31]:
keras2_model = Sequential()
keras2_model.add(Dense(2000, input_dim=X_train.shape[1], activation="relu"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(2000, activation="relu"))
keras2_model.add(Dropout(.3))
keras2_model.add(Dense(3000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.2))
keras2_model.add(Dense(2000, activation="relu"))
keras2_model.add(Dense(500, activation="relu"))
keras2_model.add(Dense(3000, activation="softplus"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(1000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.3))
keras2_model.add(Dense(3000, activation="softsign"))
keras2_model.add(Dense(1000, activation=LeakyReLU(.1)))
keras2_model.add(Dropout(.1))
keras2_model.add(Dense(3000, activation="softplus"))
keras2_model.add(Dropout(.4))
keras2_model.add(Dense(3000, activation="relu"))
keras2_model.add(Dense(1000, activation="relu"))
keras2_model.add(BatchNormalization())
keras2_model.add(Dense(y_train.shape[1], activation="sigmoid"))
keras2_model.add(Rescaling(4, offset=1))

optimizer = optimizers.Adam(amsgrad=True)
keras2_model.compile(loss=MCRMSE_keras, optimizer=optimizer, metrics=[MCRMSE_keras])
keras2_model.fit(X_train, y_train, batch_size=2**3, epochs=50, verbose=1,
          validation_data=(X_val, y_val), workers=30, use_multiprocessing=True,
          callbacks=[EarlyStopping(monitor="loss", patience=3, restore_best_weights=True)])

keras2_pred = pd.DataFrame(keras2_model.predict(X_test), columns=cols, index=X_test_idx)
keras2_pred

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f85621b51f0>



Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,2.991902,2.925776,3.117795,2.986269,2.844204,2.960994
000BAD50D026,3.017218,2.951318,3.138716,3.01341,2.8785,2.985172
00367BB2546B,3.739797,3.693804,3.748791,3.783935,3.856252,3.687125


## lgb 1

In [98]:
lgb1_models = {}
lgb1_preds = {}

for score in cols:
    lgb1_params = {'objective': 'regression',
                   'metric': 'rmse',
                   'verbosity': 0,
                   'early_stopping_round': 50,
                   'random_state': seed,
                   'device': 'gpu'}
    
    train_set=lgb_trains[score]
    valid_sets=lgb_vals[score]

    lgb1_model = lgb.train(
        params=lgb1_params,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=(train_set, valid_sets),
        callbacks=None,
        verbose_eval=100
    )
    
    lgb1_models[score] = lgb1_model
    lgb1_preds[score] = lgb1_model.predict(X_test)
    
lgb1_pred = pd.DataFrame(lgb1_preds, index=X_test_idx)
lgb1_pred



Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.229601	valid_1's rmse: 0.604431
Early stopping, best iteration is:
[71]	training's rmse: 0.288907	valid_1's rmse: 0.598802
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.228208	valid_1's rmse: 0.555123
Early stopping, best iteration is:
[71]	training's rmse: 0.286076	valid_1's rmse: 0.548253
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[41]	training's rmse: 0.322176	valid_1's rmse: 0.498272
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.229137	valid_1's rmse: 0.556112
Early stopping, best iteration is:
[72]	training's rmse: 0.283216	valid_1's rmse: 0.55231
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.249628	valid_1's rmse: 0.618543
Early stopping, best iteration is:
[58]	training's rmse: 0.346478	valid_1's rmse: 0.612825
Training unti

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,3.011834,2.909469,3.16696,2.825631,2.714723,2.622996
000BAD50D026,3.220171,2.851266,3.033757,3.00667,2.572265,3.079148
00367BB2546B,3.847193,3.421739,3.666559,3.480376,3.541353,3.499535


In [110]:
pred = pd.DataFrame(np.mean(np.array([keras1_pred, keras2_pred, lgb1_pred]), axis=0), columns=cols, index=X_test_idx)

In [60]:
pred.to_csv("submission.csv", index=True)