# Load libraries

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

import tensorflow as tf
import tensorflow_addons as tfa

import transformers

from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding
from transformers import logging as hf_logging
hf_logging.set_verbosity_error()

from datasets import Dataset

import os
import gc
import sys
from tqdm.notebook import tqdm

# Set Configs

In [2]:
CONFIG = {
        'folds': 5,
        'seed': 101,
        'robertabase': '../input/huggingface-roberta-variants/roberta-base/roberta-base',
        'robertalarge': '../input/huggingface-roberta-variants/roberta-large/roberta-large',
        #'debertav3base': '../input/debertav3base',
        #'debertav3large': '../input/deberta-v3-large/deberta-v3-large/',
        'xlmrobertabase': '../input/huggingface-roberta-variants/tf-xlm-roberta-base/tf-xlm-roberta-base',
        'distilrobertabase': '../input/huggingface-roberta-variants/distilroberta-base/distilroberta-base',
        #'debertav3large_npy': '../input/fb3-save-pretrained-embeddings/debertav3large_FB3.npy',
        #'distilrobertabase_npy': '../input/fb3-save-pretrained-embeddings/distilrobertabase_FB3.npy',

        'batch_size': 4,
        'max_len': 512
        }

# Read in data

In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
       os.path.join(dirname, filename)

In [4]:
#df = pd.read_csv("/kaggle/input/large580/train_20000.csv")
#msk = np.random.rand(len(df)) <= 0.9
#tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = df[msk].dropna()
#test = df[~msk].dropna()
#test = pd.read_csv("../input/feedback-prize-english-language-learning/test.csv")

In [5]:

train = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")
#test = pd.read_csv("/kaggle/input/580data/test.csv")
test = pd.read_csv("/kaggle/input/580data/test_balanced.csv")
tgtCols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
#train = train[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
#test = test[['text_id','full_text','cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']]
print(train.shape)
print(test.shape)


(3911, 8)
(783, 11)


In [6]:
test

Unnamed: 0.1,Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,average,bin
0,272,13C400DD9794,The year book is for to not forget anything an...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0
1,3051,D9BC7F4F22F0,Well what i think about praising for a student...,2.5,2.5,2.5,3.0,2.5,2.0,2.5,0
2,800,3E170458E9A1,I\n\ndisagree that first impressions are almos...,2.0,2.0,2.0,2.0,2.0,2.5,2.1,0
3,3206,E0BFF1488787,I disagree with schools having a program with ...,2.5,2.0,2.5,2.0,2.0,2.5,2.3,0
4,2664,C50BE3C76571,I dont like becuase the student forget all inf...,3.0,2.5,2.0,2.0,2.0,2.5,2.4,0
...,...,...,...,...,...,...,...,...,...,...,...
778,2207,A4A90A401002,People who value self-reliance define it as th...,3.0,3.5,3.5,4.0,3.5,4.0,3.6,2
779,2747,CA11FD3CAC43,Many people have been told about the fact that...,4.5,4.0,3.5,4.0,3.5,3.5,3.9,2
780,155,0BB9FAE6E27B,Setting A Good Example\n\nHave you thought of ...,3.5,4.0,4.5,4.0,3.5,3.5,3.9,2
781,3464,ED0A8E614649,Techonology has becoming powerful that let stu...,4.5,3.5,4.0,4.0,4.0,3.5,3.9,2


In [7]:
my_list = train.columns.values.tolist()
my_list

['text_id',
 'full_text',
 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions']

In [8]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5
...,...,...,...,...,...,...,...,...
3906,FFD29828A873,I believe using cellphones in class for educat...,2.5,3.0,3.0,3.5,2.5,2.5
3907,FFD9A83B0849,"Working alone, students do not have to argue w...",4.0,4.0,4.0,4.0,3.5,3.0
3908,FFDC4011AC9C,"""A problem is a chance for you to do your best...",2.5,3.0,3.0,3.0,3.5,3.0
3909,FFE16D704B16,Many people disagree with Albert Schweitzer's ...,4.0,4.5,4.5,4.0,4.5,4.5


# Create folds

In [9]:
train.loc[:, 'kfold'] = -1 # Create a new column `fold` containing `-1`s.
train = train.sample(frac=1).reset_index(drop=True) # Shuffle the rows.
data_labels = train[tgtCols].values

In [10]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [11]:
mskf = MultilabelStratifiedKFold(n_splits=5)
for f, (t, v) in enumerate(mskf.split(X=train, y=data_labels)):
    train.loc[v, 'kfold'] = f + 1

In [12]:
train

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,kfold
0,FE3F2F729D98,To whom read it.\n\nFirst impressions are poss...,2.5,2.5,2.5,2.0,2.5,2.5,5
1,78018FB48420,Students should not be able to graduate early....,3.5,4.5,4.0,4.5,4.5,4.5,1
2,6792C2E73B6A,Some people agree with the statement ''success...,2.0,2.0,2.5,2.5,2.0,2.5,3
3,CEF302996231,"Dear Dr. Generic_Name,\n\nThank you for taking...",3.0,3.0,3.0,2.5,3.0,3.0,4
4,D581D89A822A,"Should people make their own decisions, or sho...",3.5,3.0,3.5,3.0,3.5,4.0,2
...,...,...,...,...,...,...,...,...,...
3906,FDE0B653AD74,"When you play activities or an instrument, don...",4.0,3.5,3.5,4.0,4.0,3.5,4
3907,00ED2563D0B1,"Philosopher, physician, and humanitarian Alber...",3.5,3.0,3.5,3.5,3.5,4.0,3
3908,B9F6B348FC3A,"In my experience I don't disagree, that Emerso...",2.0,2.0,3.0,3.0,2.0,2.5,5
3909,E65D5A2616D2,Imagine an old proverb said honestly was the b...,3.0,3.0,3.5,4.0,3.5,3.5,1


In [13]:
train['kfold'].value_counts().sort_index()

1    783
2    782
3    782
4    782
5    782
Name: kfold, dtype: int64

## Data process functions

In [14]:
def self_encode(texts, chkpt):
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(CONFIG[chkpt])
    tokenizer.save_pretrained('./tokenizer/')

    input_ids = []
    attention_mask = []
    
    for text in texts.tolist():
        token = tokenizer(text, 
                          add_special_tokens=True, 
                          max_length=CONFIG['max_len'], 
                          return_attention_mask=True, 
                          return_tensors="np", 
                          truncation=True, 
                          padding='max_length')
        input_ids.append(token['input_ids'][0])
        attention_mask.append(token['attention_mask'][0])
    return np.array(input_ids, dtype="int32"), np.array(attention_mask, dtype="int32")

In [15]:
def pickle_dump(path, saveobj):
    import pickle
    handler = open(path,"wb")
    pickle.dump(saveobj,handler)
#     print("File pickled")
    handler.close()

In [16]:
def pickle_load(path):
    import pickle
    file = open(path,'rb')
    loader = pickle.load(file)
    file.close()
    return loader

## Transformer embeddings

In [17]:
def pretrain_embeddings(chkpt, df):
    cfg = transformers.AutoConfig.from_pretrained(CONFIG[chkpt], output_hidden_states=True)
    cfg.hidden_dropout_prob = 0
    cfg.attention_probs_dropout_prob = 0
    cfg.save_pretrained('./tokenizer/')
    
    input_ids = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="input_ids"
    )
    
    attention_masks = tf.keras.layers.Input(
        shape=(CONFIG['max_len'],), dtype=tf.int32, name="attention_masks"
    )
    
    try:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg)
    except:
        model = transformers.TFAutoModel.from_pretrained(CONFIG[chkpt], config=cfg, from_pt=True)
        
    output = model(
        input_ids, attention_mask=attention_masks
    )
    hidden_states = output.hidden_states
    mean_pool = []
    for hidden_s in hidden_states[-1:]:
        #def call(self, inputs, mask=None):
        broadcast_mask = tf.expand_dims(tf.cast(attention_masks, "float32"), -1)
        embedding_sum = tf.reduce_sum(hidden_s * broadcast_mask, axis=1)
        mask_sum = tf.reduce_sum(broadcast_mask, axis=1)
        mask_sum = tf.math.maximum(mask_sum, tf.constant([1e-9]))
        tmp = embedding_sum / mask_sum
        mean_pool.append(tmp)
    output = tf.stack(mean_pool,axis=2)
   
    #output = tf.stack(
    #    [MeanPool()(hidden_s, mask=attention_masks) for hidden_s in hidd20000en_states[-1:]], 
    #    axis=2)
    
    output = tf.squeeze(output, axis=-1)
    
    model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=output)

    model.compile(optimizer="adam",
                 loss='huber_loss',
                 metrics=[tf.keras.metrics.RootMeanSquaredError()],
                 )
    print(model.summary())
    dataset = self_encode(df['full_text'], chkpt)
    preds = model.predict(dataset, batch_size=CONFIG['batch_size'])
    
    del model, dataset
    _ = gc.collect()
    
    return preds

# Model training

In [18]:

train_data = pretrain_embeddings('distilrobertabase', train)

#train_data = np.concatenate([train_data, pretrain_embeddings('bertbasecased', train)], axis=1)
train_data = np.concatenate([train_data, pretrain_embeddings('robertabase', train)], axis=1)
train_data = np.concatenate([train_data, pretrain_embeddings('robertalarge', train)], axis=1)

train_data.shape

2022-11-29 22:06:46.717990: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 22:06:46.718969: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 22:06:46.720020: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 22:06:46.720783: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 22:06:46.721526: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast (TFOpLambda)            (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]        

2022-11-29 22:07:19.123540: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_1 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_1 (TFRobertaMo TFBaseModelOutputWit 124645632   input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(3911, 2560)

In [19]:
scores = []
rmse_scores = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)

    trn_idx = train[train['kfold']==fold].index.values
    val_idx = train[train['kfold']!=fold].index.values
    print(f"trn_idx len is {len(trn_idx)}")

    X_train = train_data[trn_idx,:]
    X_valid = train_data[val_idx,:]

    y_train = train[train['kfold']==fold][tgtCols].copy()
    y_valid = train[train['kfold']!=fold][tgtCols].copy()

    val_preds = np.zeros((len(val_idx),6))

    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        clf = SVR(C=10)
        clf.fit(X_train, y_train[tgt].values)
        pickle_dump(f"./SVR_tgt{tgt}_fold{fold}.pkl", clf)
        val_preds[:,i] = clf.predict(X_valid)
   
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))


-----------------------------------
## Fold 1
-----------------------------------
trn_idx len is 783
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.4682714783046971
-----------------------------------
Overall CV RMSE = 0.4751344197501293
-----------------------------------
## Fold 2
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.46985900406714975
-----------------------------------
Overall CV RMSE = 0.4727904779900445
-----------------------------------
## Fold 3
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4717039325208523
-----------------------------------
Overall CV RMSE = 0.47230438934714375
-----------------------------------
## Fold 4
-----------------------------------
trn_idx len is 782
cohesion , syntax , vocabulary , phraseolo

In [20]:
del train_data
_ = gc.collect()

# Model inference on Balanced Test

In [21]:
test_data = pretrain_embeddings('distilrobertabase', test)

#test_data = np.concatenate([test_data, pretrain_embeddings('bertbasecased', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertabase', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertalarge', test)], axis=1)

test_data.shape

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_3 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_3 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(783, 2560)

In [22]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)
    
    test_preds = np.zeros((len(test_data),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_data)
    
    fold_preds.append(test_preds)
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))
    
    del model
    _ = gc.collect()

-----------------------------------
## Fold 1
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.47207027621653386
-----------------------------------
Overall CV RMSE = 0.47228764919280547
-----------------------------------
## Fold 2
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.47216186214045425
-----------------------------------
Overall CV RMSE = 0.4722990099788883
-----------------------------------
## Fold 3
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4722272806575403
-----------------------------------
Overall CV RMSE = 0.4723125566530724
-----------------------------------
## Fold 4
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.47227634454535466
-----

In [23]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [24]:
output_df = test[['text_id']].reset_index()
output_df

Unnamed: 0,index,text_id
0,0,13C400DD9794
1,1,D9BC7F4F22F0
2,2,3E170458E9A1
3,3,E0BFF1488787
4,4,C50BE3C76571
...,...,...
778,778,A4A90A401002
779,779,CA11FD3CAC43
780,780,0BB9FAE6E27B
781,781,ED0A8E614649


In [25]:
preds_df = pd.DataFrame(preds, columns = tgtCols)
preds_df

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,1.816185,1.748447,2.234873,1.884909,1.797422,1.916126
1,2.741063,2.605668,2.957189,2.761612,2.505799,2.403018
2,1.935736,1.798175,2.323907,1.925007,1.722809,2.008836
3,2.612268,2.473251,2.660657,2.777010,2.856524,2.679932
4,2.256999,2.153023,2.318002,2.013254,1.997944,2.102921
...,...,...,...,...,...,...
778,3.622394,3.563503,3.730496,3.708827,3.590345,3.687552
779,3.798472,3.681177,3.750709,3.684108,3.553883,3.786724
780,3.518054,3.622119,3.810292,3.599986,3.503747,3.647713
781,3.781951,3.593794,3.808436,3.634491,3.458773,3.511313


In [26]:
preds_df['text_id'] = output_df['text_id']
preds_df = preds_df.reindex(['text_id', *preds_df.columns], axis=1).iloc[: , :-1]
preds_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,13C400DD9794,1.816185,1.748447,2.234873,1.884909,1.797422,1.916126
1,D9BC7F4F22F0,2.741063,2.605668,2.957189,2.761612,2.505799,2.403018
2,3E170458E9A1,1.935736,1.798175,2.323907,1.925007,1.722809,2.008836
3,E0BFF1488787,2.612268,2.473251,2.660657,2.777010,2.856524,2.679932
4,C50BE3C76571,2.256999,2.153023,2.318002,2.013254,1.997944,2.102921
...,...,...,...,...,...,...,...
778,A4A90A401002,3.622394,3.563503,3.730496,3.708827,3.590345,3.687552
779,CA11FD3CAC43,3.798472,3.681177,3.750709,3.684108,3.553883,3.786724
780,0BB9FAE6E27B,3.518054,3.622119,3.810292,3.599986,3.503747,3.647713
781,ED0A8E614649,3.781951,3.593794,3.808436,3.634491,3.458773,3.511313


## Running the final text dataset

In [27]:
test = pd.read_csv("/kaggle/input/580data/test.csv")

In [28]:
test_data = pretrain_embeddings('distilrobertabase', test)

#test_data = np.concatenate([test_data, pretrain_embeddings('bertbasecased', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertabase', test)], axis=1)
test_data = np.concatenate([test_data, pretrain_embeddings('robertalarge', test)], axis=1)

test_data.shape

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_masks (InputLayer)    [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf.cast_6 (TFOpLambda)          (None, 512)          0           attention_masks[0][0]            
__________________________________________________________________________________________________
tf_roberta_model_6 (TFRobertaMo TFBaseModelOutputWit 82118400    input_ids[0][0]                  
                                                                 attention_masks[0][0]      

(8, 2560)

In [29]:
fold_preds = []

for fold in range(1,CONFIG['folds']):

    print('-'*35)
    print(f'## Fold {fold}')
    print('-'*35)
    
    test_preds = np.zeros((len(test_data),6))
    for i, tgt in enumerate(tgtCols):

        print(tgt,', ',end='')
        model = pickle_load(f"./SVR_tgt{tgt}_fold{fold}.pkl")
        test_preds[:,i] = model.predict(test_data)
    
    fold_preds.append(test_preds)
    
    for i in range(len(tgtCols)):
        rmse_scores.append(np.sqrt(mean_squared_error(y_valid[tgtCols].values[:,i],val_preds[:,i])))
        score = np.mean(rmse_scores)
    #score = mcrmse(y_valid[tgtCols].values, val_preds)
        scores.append(score)
    print("Fold : {} RMSE score: {}".format(fold,score))

    print('-'*35)
    print('Overall CV RMSE =',np.mean(scores))
    
    del model
    _ = gc.collect()

-----------------------------------
## Fold 1
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 1 RMSE score: 0.47231450534698816
-----------------------------------
Overall CV RMSE = 0.472339402591171
-----------------------------------
## Fold 2
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 2 RMSE score: 0.472345033988295
-----------------------------------
Overall CV RMSE = 0.47235167059599165
-----------------------------------
## Fold 3
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 3 RMSE score: 0.4723700119675461
-----------------------------------
Overall CV RMSE = 0.47236302030472926
-----------------------------------
## Fold 4
-----------------------------------
cohesion , syntax , vocabulary , phraseology , grammar , conventions , Fold : 4 RMSE score: 0.4723908269502552
--------

In [30]:
preds = np.mean(fold_preds, axis=0)
preds = np.clip(preds, 1, 5)

In [31]:
preds_df = pd.DataFrame(preds, columns = tgtCols)
preds_df

Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,2.847516,2.767603,2.990523,2.88913,2.576317,2.593019
1,2.724639,2.471339,2.718179,2.438099,2.157643,2.610832
2,3.447244,3.366904,3.545405,3.535074,3.348391,3.309278
3,3.332546,3.299008,3.44146,3.325426,3.447684,3.073018
4,3.748011,3.749285,3.879655,3.733077,3.678509,3.407902
5,3.690385,3.607072,3.933969,3.57578,3.568065,3.728966
6,3.866299,3.822295,3.981464,3.891308,3.855148,3.776293
7,4.067658,3.877154,4.020447,3.846368,3.737488,3.865833


In [32]:
output_df = test[['text_id']].reset_index()
output_df

Unnamed: 0,index,text_id
0,0,0000C359D63E
1,1,000BAD50D026
2,2,00367BB2546B
3,3,hp
4,4,tkm
5,5,high school
6,6,college1
7,7,college2


In [33]:
preds_df['text_id'] = output_df['text_id']
preds_df = preds_df.reindex(['text_id', *preds_df.columns], axis=1).iloc[: , :-1]
preds_df

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.847516,2.767603,2.990523,2.88913,2.576317,2.593019
1,000BAD50D026,2.724639,2.471339,2.718179,2.438099,2.157643,2.610832
2,00367BB2546B,3.447244,3.366904,3.545405,3.535074,3.348391,3.309278
3,hp,3.332546,3.299008,3.44146,3.325426,3.447684,3.073018
4,tkm,3.748011,3.749285,3.879655,3.733077,3.678509,3.407902
5,high school,3.690385,3.607072,3.933969,3.57578,3.568065,3.728966
6,college1,3.866299,3.822295,3.981464,3.891308,3.855148,3.776293
7,college2,4.067658,3.877154,4.020447,3.846368,3.737488,3.865833


## Running SVR after TF-IDF

In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
import math
from sklearn.svm import SVR

In [35]:
# Running for the train full_text with training all
# fit the six test as test
full_df = np.concatenate((train.full_text.values,test.full_text.values))

In [36]:
tfidf_featurizer = TfidfVectorizer(max_features=10000, max_df=0.95, stop_words='english')
X_tfidf = tfidf_featurizer.fit_transform(full_df)

In [37]:
# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X_tfidf[0:len(train.full_text)], 
                                                    train[tgtCols].values,
                                                    test_size=0.10,
                                                    random_state=42)

In [38]:
print(X_train.shape)
print(X_test.shape)

(3519, 10000)
(392, 10000)


In [39]:
best_params = {'C' : 10, 
                'epsilon': 0.1, 
                'gamma' : 1, 
                'kernel' : 'rbf'} 

In [40]:
data_test = X_tfidf[len(train.full_text):]

In [41]:
#  
df_sum = pd.DataFrame([],index=test.text_id,columns= tgtCols)

In [42]:
svr_clf = SVR(**best_params)
rerror = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(X_test)
  rerror.append(mse(rf_preds,y_test[:,k]))
  MSE = np.mean(rerror)
  RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)


Root Mean Square Error:

0.5608407952277368


In [43]:
svr_clf = SVR(**best_params)
error = []
for k in range(0,y_train.shape[1]):
  svr_clf.fit(X_train, y_train[:,k])
  rf_preds = svr_clf.predict(data_test)
  df_sum[tgtCols[k]] = rf_preds
  #error.append(rmse(rf_preds,y_test[:,k],squared=False))



In [44]:
df_sum

Unnamed: 0_level_0,cohesion,syntax,vocabulary,phraseology,grammar,conventions
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000C359D63E,2.903885,2.846374,3.243762,3.12912,2.709364,2.811444
000BAD50D026,3.051334,2.783242,2.934718,2.746411,2.696641,2.949309
00367BB2546B,3.452976,3.466839,3.510834,3.404067,3.320165,3.38293
hp,2.699314,2.64864,2.981226,2.842798,2.789414,2.725232
tkm,2.816422,2.771121,3.019191,2.854721,2.870556,2.78069
high school,2.934556,2.777547,3.218054,2.970385,2.847546,2.85367
college1,3.00236,3.00476,3.29219,3.087244,3.038965,2.894806
college2,3.086213,2.954979,3.282283,3.00938,2.999821,2.996158
