# Load libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import tensorflow as tf
import tensorflow_addons as tfa

import transformers

from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

from datasets import Dataset

import os
import gc
import sys
from tqdm.notebook import tqdm

# Set Configs

In [2]:
CONFIG = {
        'folds': 5,
        'seed': 101,
        'robertabase': '../input/huggingface-roberta-variants/roberta-base/roberta-base',
        'robertalarge': '../input/huggingface-roberta-variants/roberta-large/roberta-large',
        #'debertav3base': '../input/debertav3base',
        #'debertav3large': '../input/deberta-v3-large/deberta-v3-large/',
        'xlmrobertabase': '../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base',
        'distilrobertabase': '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base',
        #'debertav3large_npy': '../input/fb3-save-pretrained-embeddings/debertav3large_FB3.npy',
        #'distilrobertabase_npy': '../input/fb3-save-pretrained-embeddings/distilrobertabase_FB3.npy',

        'batch_size': 4,
        'max_len': 512
        }

# Read in data

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
       os.path.join(dirname, filename)

In [4]:
#df = pd.read_csv("/kaggle/input/large580/train_20000.csv")
#msk = np.random.rand(len(df)) <= 0.9
#tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = df[msk].dropna()
#test = df[~msk].dropna()
#test = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")

In [5]:

train = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
test = pd.read_csv("/kaggle/input/580data/test.csv")
tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = train[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
#test = test[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
print(train.shape)
print(test.shape)


(3911, 8)
(9, 3)


In [6]:
test

Unnamed: 0.1,Unnamed: 0,text_id,full_text
0,0,0000C359D63E,when a person has no experience on a job their...
1,1,000BAD50D026,Do you think students would benefit from being...
2,2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."
3,3,hp,Dumbledore and Professor McGonagall bent forwa...
4,4,tkm,"Being Southerners, it was a source of shame to..."
5,5,va,To improve the compositionality of the semanti...
6,6,ll,The first Wednesday in every month was a Perfe...
7,7,sp,Thou see'st these lovers seek a place to fight...
8,8,bp,"Fifteen years later, in 1980, Jintong emerges ..."


In [7]:
my_list = train.columns.values.tolist()
my_list

['text_id',
 'full_text',
 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions']

In [8]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


# Create folds

In [9]:
train.loc[:, 'kfold'] = -1 # Create a new column `fold` containing `-1`s.
train = train.sample(frac=1).reset_index(drop=True) # Shuffle the rows.
data_labels = train[tgtCols].values

In [10]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [11]:
mskf = MultilabelStratifiedKFold(n_splits=5)
for f, (t, v) in enumerate(mskf.split(X=train, y=data_labels)):
    train.loc[v, 'kfold'] = f + 1

In [12]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,kfold
0,94EC7DF6EA74,People these days dont go outdoors as much as ...,2.5,2.5,3.0,3.0,3.0,3.0,1
1,B204869FA146,Is technology a positive or negative effect on...,3.5,3.5,3.5,3.5,3.5,3.5,2
2,C47E98E69AFB,I'm a gree because if we are enthusiasm is pro...,2.5,2.0,2.5,2.5,2.5,2.5,3
3,405CEC6E7F5D,Do you ever wonder if we should choose our own...,3.0,3.5,4.0,3.5,3.5,3.5,4
4,0E03218FDD2F,Educators say that students should have shorte...,4.0,4.5,4.0,5.0,5.0,5.0,5
...,...,...,...,...,...,...,...,...,...
3906,3651697D855B,FAILURE PLAYS IN THE PURSUIT OF SUCCESS.\n\nI ...,3.5,3.0,4.0,3.5,3.0,3.5,2
3907,1928846BB272,I believe that is good idea for the students t...,3.0,2.5,3.0,2.0,2.5,3.0,5
3908,2157AE4AFAA6,I agree that people make their own decisions t...,4.0,4.0,3.5,3.5,3.0,3.0,4
3909,30B9B18CCCC2,I believe technology is good thing because you...,2.5,2.5,2.0,3.0,2.5,2.0,3


In [13]:
train['kfold'].value_counts().sort_index()

1    782
2    782
3    782
4    783
5    782
Name: kfold, dtype: int64

## Data process functions

In [14]:
def hf_encode(texts, chkpt):
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(CONFIG[chkpt])
    tokenizer.save_pretrained('./tokenizer/')

    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=CONFIG['max_len'], 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

In [15]:
def pickle_dump(path, saveobj):
    import pickle
    filehandler = open(path,"wb")
    pickle.dump(saveobj,filehandler)
#     print("File pickled")
    filehandler.close()

In [16]:
def pickle_load(path):
    import pickle
    file = open(path,'rb')
    loadobj = pickle.load(file)
    file.close()
    return loadobj

## Transformer embeddings

In [17]:
def pretrain_embeddings(chkpt, df):
    cfg = transformers.AutoConfig.from_pretrained(CONFIG[chkpt], output_hidden_states=True)
    cfg.hidden_dropout_prob = 0
    cfg.attention_probs_dropout_prob = 0
    cfg.save_pretrained('./tokenizer/')
    
    input_ids = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="attention_masks"
    )
    
    try:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg)
    except:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg, from_pt=True)
        
    output = model(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = output.hidden_states
    mean_pool = []
    for hidden_s in hidden_states[-1:]:
        #def call(self, inputs, mask=None):
        broadcast_mask = tf.expand_dims(tf.cast(attention_masks, "float32"), -1)
        embedding_sum = tf.reduce_sum(hidden_s * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        tmp = embedding_sum / mask_sum
        mean_pool.append(tmp)
    output = tf.stack(mean_pool,axis=2)
   
    #output = tf.stack(
    #    [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidd20000en_states[-1:]], 
    #    axis=2)
    
    output = tf.squeeze(output, axis=-1)
    
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    model.compile(optimizer="adam",
                 loss='huber_loss',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    print(model.summary())
    dataset = hf_encode(df['full_text'], chkpt)
    preds = model.predict(dataset, batch_size=CONFIG['batch_size'])
    
    del model, dataset
    _ = gc.collect()
    
    return preds

# Model training

In [18]:
#train_dataset = pretrain_embeddings('debertav3large', train)
#train_dataset = np.concatenate([train_dataset, pretrain_embeddings('distilrobertabase', train)], axis=1)
#train_dataset = np.concatenate([train_dataset, pretrain_embeddings('robertabase', train)], axis=1)
#train_dataset = np.concatenate([train_dataset, pretrain_embeddings('robertalarge', train)], axis=1)
#train_dataset = np.concatenate([train_dataset, pretrain_embeddings('debertav3base', train)], axis=1)
#train_dataset.shape

#train_dataset = np.load(CONFIG['debertav3large_npy'])
#train_dataset = np.concatenate([train_dataset, np.load(CONFIG['distilrobertabase_npy'])], axis=1)

#train_dataset.shape

train_dataset = pretrain_embeddings('distilrobertabase', train)

#test_dataset = np.concatenate([test_dataset, pretrain_embeddings('bertbasecased', test)], axis=1)
train_dataset = np.concatenate([train_dataset, pretrain_embeddings('robertabase', train)], axis=1)
train_dataset = np.concatenate([train_dataset, pretrain_embeddings('robertalarge', train)], axis=1)

train_dataset.shape

2022-11-29 18:01:21.525088: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 18:01:21.526165: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 18:01:21.527375: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 18:01:21.528234: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 18:01:21.529050: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast (TFOpLambda)            (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]        

2022-11-29 18:01:48.293260: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_1 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 124645632   input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(3911, 2560)

In [19]:
scores = []
rmse_scores = []

for fold in range(1,CONFIG['folds']):

    print('#'*25)
    print(f'## Fold {fold}')
    print('#'*25)

    trn_idx = train[train['kfold']==fold].index.values
    val_idx = train[train['kfold']!=fold].index.values
    print(f"trn_idx len is {len(trn_idx)}")

    X_train = train_dataset[trn_idx,:]
    X_valid = train_dataset[val_idx,:]

    y_train = train[train['kfold']==fold][tgtCols].copy()
    y_valid = train[train['kfold']!=fold][tgtCols].copy()

    val_preds = np.zeros((len(val_idx),6))

    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        clf = SVR(C=1)
        clf.fit(X_train, y_train[tgt].values)
        pickle_dump(f"./SVR_tgt{tgt}_fold{fold}.pkl", clf)
        val_preds[:,i] = clf.predict(X_valid)
   
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('#'*25)
    print('Overall CV RMSE =',np.mean(scores))


#########################
## Fold 1
#########################
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.49369876658596223
#########################
Overall CV RMSE = 0.5003134915992332
#########################
## Fold 2
#########################
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.4951160696311996
#########################
Overall CV RMSE = 0.49792241248174846
#########################
## Fold 3
#########################
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4961517005707978
#########################
Overall CV RMSE = 0.4973920479731967
#########################
## Fold 4
#########################
trn_idx len is 783
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.4962169228261981
#########################
Overall CV RMSE

In [20]:
del train_dataset
_ = gc.collect()

# Model inference

In [21]:
test_dataset = pretrain_embeddings('distilrobertabase', test)

#test_dataset = np.concatenate([test_dataset, pretrain_embeddings('bertbasecased', test)], axis=1)
test_dataset = np.concatenate([test_dataset, pretrain_embeddings('robertabase', test)], axis=1)
test_dataset = np.concatenate([test_dataset, pretrain_embeddings('robertalarge', test)], axis=1)

test_dataset.shape

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_3 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_3 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(9, 2560)

In [22]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('#'*25)
    print(f'## Fold {fold}')
    print('#'*25)
    
    test_preds = np.zeros((len(test_dataset),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_dataset)
    
    fold_preds.append(test_preds)
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('#'*25)
    print('Overall CV RMSE =',np.mean(scores))
    
    del model
    _ = gc.collect()

#########################
## Fold 1
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.4962560561794383
#########################
Overall CV RMSE = 0.49693927187921205
#########################
## Fold 2
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.4962821450815984
#########################
Overall CV RMSE = 0.49683161164700007
#########################
## Fold 3
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4963007800117128
#########################
Overall CV RMSE = 0.4967568860930437
#########################
## Fold 4
#########################
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.4963147562092985
#########################
Overall CV RMSE = 0.4967023280817002


In [23]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [24]:
output_df = test[['text_id']].reset_index()
output_df

Unnamed: 0,index,text_id
0,0,0000C359D63E
1,1,000BAD50D026
2,2,00367BB2546B
3,3,hp
4,4,tkm
5,5,va
6,6,ll
7,7,sp
8,8,bp


In [25]:
preds_df = pd.DataFrame(preds, columns = tgtCols)
preds_df

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2.868558,2.723777,3.030499,2.867313,2.655197,2.683695
1,2.809209,2.584355,2.831141,2.611344,2.397681,2.666667
2,3.354055,3.251415,3.45073,3.396637,3.267125,3.292837
3,3.310319,3.251588,3.468873,3.384124,3.322744,3.133408
4,3.539252,3.475078,3.670111,3.58357,3.504443,3.409015
5,3.620723,3.510035,3.749914,3.576289,3.472366,3.49709
6,3.44924,3.376074,3.614669,3.495952,3.402741,3.243361
7,2.987999,2.87871,3.231297,2.989895,2.841649,2.683772
8,3.516936,3.425442,3.695985,3.536321,3.416549,3.346942


In [26]:
preds_df['text_id'] = output_df['text_id']
preds_df = preds_df.reindex(['text_id', *preds_df.columns], axis=1).iloc[: , :-1]
preds_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.868558,2.723777,3.030499,2.867313,2.655197,2.683695
1,000BAD50D026,2.809209,2.584355,2.831141,2.611344,2.397681,2.666667
2,00367BB2546B,3.354055,3.251415,3.45073,3.396637,3.267125,3.292837
3,hp,3.310319,3.251588,3.468873,3.384124,3.322744,3.133408
4,tkm,3.539252,3.475078,3.670111,3.58357,3.504443,3.409015
5,va,3.620723,3.510035,3.749914,3.576289,3.472366,3.49709
6,ll,3.44924,3.376074,3.614669,3.495952,3.402741,3.243361
7,sp,2.987999,2.87871,3.231297,2.989895,2.841649,2.683772
8,bp,3.516936,3.425442,3.695985,3.536321,3.416549,3.346942


## Running SVR after TF-IDF

In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
import math
from sklearn.svm import SVR

In [28]:
# Running for the train full_text with training all
# fit the six test as test
full_df = np.concatenate((train.full_text.values,test.full_text.values))

In [29]:
tfidf_featurizer = TfidfVectorizer(max_features=10000, max_df=0.95, stop_words='english')
X_tfidf = tfidf_featurizer.fit_transform(full_df)

In [30]:
# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X_tfidf[0:len(train.full_text)], 
                                                    train[tgtCols].values,
                                                    test_size=0.10,
                                                    random_state=42)

In [31]:
print(X_train.shape)
print(X_test.shape)

(3519, 10000)
(392, 10000)


In [32]:
best_params = {'C' : 5, 
                'epsilon': 0.1, 
                'gamma' : 1, 
                'kernel' : 'rbf'} 

In [33]:
data_test = X_tfidf[len(train.full_text):]

In [34]:
#  
df_sum = pd.DataFrame([],index=test.text_id,columns= tgtCols)

In [35]:
svr_clf = SVR(**best_params)
rerror = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(X_test)
  rerror.append(mse(rf_preds,y_test[:,k]))
  MSE = np.mean(rerror)
  RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)


Root Mean Square Error:

0.5395478005692772


In [36]:
svr_clf = SVR(**best_params)
error = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(data_test)
  df_sum[tgtCols[k]] = rf_preds
  #error.append(rmse(rf_preds,y_test[:,k],squared=False))



In [37]:
df_sum

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,2.884783,2.87827,3.206788,3.080244,2.6903,2.805317
000BAD50D026,3.011864,2.715047,2.910967,2.60153,2.606945,2.941732
00367BB2546B,3.379,3.427497,3.500803,3.390947,3.323504,3.313027
hp,2.739642,2.697657,2.989489,2.872468,2.830054,2.769279
tkm,2.824272,2.766787,2.995233,2.877292,2.892813,2.75463
va,3.124648,2.946333,3.352755,3.092867,3.102044,3.007873
ll,2.965974,2.968808,3.134421,3.019536,3.063529,2.894689
sp,2.74159,2.750508,2.991412,2.947629,2.959438,2.751547
bp,2.837431,2.776432,3.090797,2.986388,2.916751,2.871064
