# Readability Metrics


In [3]:
import os
proxy = 'http://proxy.rockwellcollins.com:9090'
os.environ['http_proxy'] = proxy
os.environ['https_proxy'] = proxy
os.environ['HTTP_PROXY'] = proxy
os.environ['HTTPS_PROXY'] = proxy

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt')

https://pypi.org/project/py-readability-metrics/


In [5]:
df = pd.read_csv("/home/jjschued/train.csv", nrows=2400)
#limit is to match batch size in pytorch to make it evenly divisible.

# Model Building

## Bert Transformer Model

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2400 non-null   object 
 1   url_legal       770 non-null    object 
 2   license         770 non-null    object 
 3   excerpt         2400 non-null   object 
 4   target          2400 non-null   float64
 5   standard_error  2400 non-null   float64
dtypes: float64(2), object(4)
memory usage: 112.6+ KB


In [9]:
test_doc = df.iloc[0,3]
print(test_doc)

When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.
The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.
At each end of the room, on the wall, hung a beautiful bear-skin rug.
These rugs were for prizes, one for the girls and one for the boys. And this was the game.
The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.
This would have been an easy matter, but each traveller was obliged to wear snowshoes.


In [10]:
from readability import Readability
r = Readability(test_doc)

In [11]:
# apply Flesch-Kindcaid and Dale–Chall to dataframe
for index, row in df.iterrows():
    r = Readability(row['excerpt'])
    fk = r.flesch_kincaid()
    df.at[index, 'fkscore'] = fk.score
    df.at[index, 'fkgrade_level'] = fk.grade_level
    dc = r.dale_chall()
    df.at[index, 'dcscore'] = dc.score
    gf = r.gunning_fog()
    df.at[index, 'gfscore'] = gf.score
    cl = r.coleman_liau()
    df.at[index, 'clscore'] = cl.score
    ari = r.ari()
    df.at[index, 'ariscore'] = ari.score

In [12]:
!pip install multimodal_transformers
#https://medium.com/georgian-impact-blog/how-to-incorporate-tabular-data-with-huggingface-transformers-b70ac45fcfb4
#https://multimodal-toolkit.readthedocs.io/en/latest/notes/introduction.html#how-to-initialize-transformer-with-tabular-models
#https://github.com/georgian-io/Multimodal-Toolkit/blob/master/main.py

Defaulting to user installation because normal site-packages is not writeable


In [27]:
from transformers import AutoTokenizer
from multimodal_transformers.data import load_data
text_cols = ['excerpt']
label_col = 'target' 
numerical_cols = ['fkscore', 'fkgrade_level', 'dcscore', 'gfscore', 'clscore',
       'ariscore']

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')



# create train test split

In [28]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [29]:
train.columns

Index(['id', 'url_legal', 'license', 'excerpt', 'target', 'standard_error',
       'fkscore', 'fkgrade_level', 'dcscore', 'gfscore', 'clscore',
       'ariscore'],
      dtype='object')

In [30]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1920 entries, 120 to 860
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1920 non-null   object 
 1   url_legal       609 non-null    object 
 2   license         609 non-null    object 
 3   excerpt         1920 non-null   object 
 4   target          1920 non-null   float64
 5   standard_error  1920 non-null   float64
 6   fkscore         1920 non-null   float64
 7   fkgrade_level   1920 non-null   object 
 8   dcscore         1920 non-null   float64
 9   gfscore         1920 non-null   float64
 10  clscore         1920 non-null   float64
 11  ariscore        1920 non-null   float64
dtypes: float64(7), object(5)
memory usage: 195.0+ KB


In [46]:
train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,fkscore,fkgrade_level,dcscore,gfscore,clscore,ariscore
120,833efe648,,,In those days there were none of the thousand ...,-2.315612,0.51309,14.172233,14.0,7.578368,17.211779,10.022577,16.281718
283,fed2fbd39,https://simple.wikipedia.org/wiki/Central_Powers,CC BY-SA 3.0 and GFDL,The Central Powers were a group of nations fig...,-0.755034,0.458004,13.067983,13.0,10.164176,11.740552,11.644841,14.543333
1727,d213a48d8,,,When it becomes a question of practical lighti...,-2.050123,0.520817,16.528667,17.0,8.960702,20.261695,10.63774,18.116949
1984,2516a9594,,,The other children laughed; but a motion of th...,-0.816672,0.444304,10.703601,11.0,7.377768,13.322282,4.474866,11.15467
647,11c3536a9,https://simple.wikipedia.org/wiki/Cave_painting,CC BY-SA 3.0 and GFDL,Cave paintings are paintings on cave walls and...,-0.140723,0.47676,5.127427,5.0,7.151845,6.538012,9.557895,6.406199


In [43]:
#numerical_data = np.stack([train[col].values.astype(np.float32) for col in numerical_cols], 1)
#numerical_data = torch.tensor(numerical_data, dtype=torch.float)

In [45]:
#this was necassary because one metric was not numeric and pytorch didn't like that
train['fkgrade_level'] = train['fkgrade_level'].values.astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


# use train set to train model from 80/20 split

In [47]:
torch_dataset = load_data(
    train,
    text_cols,
    tokenizer,
    numerical_cols=numerical_cols,
    sep_text_token_str=tokenizer.sep_token,
    label_col = 'target'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [48]:
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig
from transformers import AutoConfig

config = AutoConfig.from_pretrained('bert-base-uncased')
tabular_config = TabularConfig(
    num_labels=1, #1 for regression
    numerical_feat_dim=torch_dataset.numerical_feats.shape[1],
    combine_feat_method='weighted_feature_sum_on_transformer_cat_and_numerical_feats',
)
config.tabular_config = tabular_config

model = AutoModelWithTabular.from_pretrained('bert-base-uncased', config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertWithTabular: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifi

In [49]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [50]:

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./logs/model_name",
    logging_dir="./logs/runs",
    overwrite_output_dir=True,
    do_train=True,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    evaluate_during_training=True,
    logging_steps=25,
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=torch_dataset
)

trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=240.0, style=ProgressStyle(description_wi…



{'loss': 0.9905508422851562, 'learning_rate': 4.4791666666666673e-05, 'epoch': 0.10416666666666667, 'step': 25}
{'loss': 0.4689141845703125, 'learning_rate': 3.958333333333333e-05, 'epoch': 0.20833333333333334, 'step': 50}
{'loss': 0.47462249755859376, 'learning_rate': 3.4375e-05, 'epoch': 0.3125, 'step': 75}
{'loss': 0.38530029296875, 'learning_rate': 2.916666666666667e-05, 'epoch': 0.4166666666666667, 'step': 100}
{'loss': 0.4318911743164062, 'learning_rate': 2.3958333333333334e-05, 'epoch': 0.5208333333333334, 'step': 125}
{'loss': 0.45788330078125, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.625, 'step': 150}
{'loss': 0.3202679443359375, 'learning_rate': 1.3541666666666666e-05, 'epoch': 0.7291666666666666, 'step': 175}
{'loss': 0.301771240234375, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.8333333333333334, 'step': 200}
{'loss': 0.3229257202148437, 'learning_rate': 3.125e-06, 'epoch': 0.9375, 'step': 225}




TrainOutput(global_step=240, training_loss=0.4508165041605631)

In [83]:
#save model
torch.save(model, '/home/jjschued/trainer.pth')

In [90]:
model1 = torch.load('/home/jjschued/trainer.pth')
model.train()

BertWithTabular(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

# to do: evaluate test set from 80/20 split of train data

In [51]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 2037 to 892
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              480 non-null    object 
 1   url_legal       161 non-null    object 
 2   license         161 non-null    object 
 3   excerpt         480 non-null    object 
 4   target          480 non-null    float64
 5   standard_error  480 non-null    float64
 6   fkscore         480 non-null    float64
 7   fkgrade_level   480 non-null    object 
 8   dcscore         480 non-null    float64
 9   gfscore         480 non-null    float64
 10  clscore         480 non-null    float64
 11  ariscore        480 non-null    float64
dtypes: float64(7), object(5)
memory usage: 48.8+ KB


In [52]:
#this was necassary because one metric was not numeric and pytorch didn't like that
test['fkgrade_level'] = test['fkgrade_level'].values.astype(np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [53]:
torch_dataset2 = load_data(
    test,
    text_cols,
    tokenizer,
    numerical_cols=numerical_cols,
    sep_text_token_str=tokenizer.sep_token,
    label_col = 'target'
)

In [54]:
test.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,fkscore,fkgrade_level,dcscore,gfscore,clscore,ariscore
2037,b6174e8be,,,"When Willy was two years old, he lived in a re...",0.573393,0.497497,5.509572,6.0,5.408524,8.083422,4.941176,5.078824
1978,bbbfe2852,,,An old duck wandered into the barn and caught ...,0.004933,0.495994,2.72167,3.0,6.572428,4.94555,2.242871,0.516627
855,e33c8a8ce,https://simple.wikipedia.org/wiki/Eclipse,CC BY-SA 3.0 and GFDL,An eclipse is an astronomical event. It is a p...,0.031525,0.514281,7.748571,8.0,8.226429,9.0,7.708571,7.505714
1719,87b87e466,,,Among the attractions of the fête was an appar...,-2.407964,0.514421,14.803274,15.0,9.5183,19.077318,12.235976,15.952505
2019,95c02c078,,,"One night Hettie went to bed, and forgot to pu...",0.493527,0.486813,6.627658,7.0,6.614834,8.912658,4.732658,6.092595


In [55]:
predictions = trainer.predict(test_dataset=torch_dataset2).predictions

HBox(children=(FloatProgress(value=0.0, description='Prediction', max=30.0, style=ProgressStyle(description_wi…






In [88]:
model1

BertWithTabular(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [91]:
predictions1 = model1.predict(test_dataset=torch_dataset2).predictions

AttributeError: 'BertWithTabular' object has no attribute 'predict'

In [56]:
from sklearn.metrics import mean_squared_error
mean_squared_error(test['target'], predictions)

0.35841765008013

Before extra metrics: 0.4649954950754039

In [57]:
predictions

array([[ 6.82742774e-01],
       [ 5.73127508e-01],
       [-2.19217867e-01],
       [-1.87657297e+00],
       [ 5.02753675e-01],
       [-3.11743110e-01],
       [ 1.21130206e-01],
       [-1.77217937e+00],
       [-1.10898149e+00],
       [ 2.38459453e-01],
       [-2.88262558e+00],
       [-1.25054348e+00],
       [-2.29188585e+00],
       [-2.20946217e+00],
       [-5.95844030e-01],
       [-1.17055762e+00],
       [-9.24178421e-01],
       [-1.69506216e+00],
       [-2.34360173e-01],
       [-2.26912260e+00],
       [ 2.13942930e-01],
       [-1.44987249e+00],
       [ 5.71198821e-01],
       [-1.02879298e+00],
       [-1.86602771e+00],
       [-8.70509207e-01],
       [-1.81274176e+00],
       [-1.58812714e+00],
       [-1.72568595e+00],
       [-5.17073609e-02],
       [-2.60036898e+00],
       [-1.00596654e+00],
       [-2.54719585e-01],
       [-1.85601044e+00],
       [-1.26574218e+00],
       [-7.60075390e-01],
       [-1.53708887e+00],
       [ 1.32986620e-01],
       [-1.1

# to do: evaluate test set from competition

In [58]:
df2 = pd.read_csv("/home/jjschued/test.csv", nrows=2400)

In [60]:
# apply Flesch-Kindcaid and Dale–Chall to dataframe
for index, row in df2.iterrows():
    r = Readability(row['excerpt'])
    fk = r.flesch_kincaid()
    df2.at[index, 'fkscore'] = fk.score
    df2.at[index, 'fkgrade_level'] = fk.grade_level
    dc = r.dale_chall()
    df2.at[index, 'dcscore'] = dc.score
    gf = r.gunning_fog()
    df2.at[index, 'gfscore'] = gf.score
    cl = r.coleman_liau()
    df2.at[index, 'clscore'] = cl.score
    ari = r.ari()
    df2.at[index, 'ariscore'] = ari.score

In [62]:
#this was necassary because one metric was not numeric and pytorch didn't like that
df2['fkgrade_level'] = df2['fkgrade_level'].values.astype(np.float32)

In [66]:
df2['target'] = 0

In [67]:
df2

Unnamed: 0,id,url_legal,license,excerpt,fkscore,fkgrade_level,dcscore,gfscore,clscore,ariscore,target
0,c0f722661,,,My hope lay in Jack's promise that he would ke...,7.487958,7.0,6.95876,9.741503,6.095425,6.910931,0
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...,7.811164,8.0,7.184449,10.304762,6.528254,8.084444,0
2,0df072751,,,It was a bright and cheerful scene that greete...,6.076884,6.0,7.829642,8.089855,6.077174,5.307047,0
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...,15.486984,15.0,9.999817,20.952381,13.755556,16.02381,0
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...,16.366479,16.0,12.02718,19.507116,13.064719,17.323783,0
5,12537fe78,,,"To explain transitivity, let us look first at ...",8.05,8.0,9.937033,9.866667,8.0216,7.407533,0
6,965e592c0,https://www.africanstorybook.org/#,CC BY 4.0,Milka and John are playing in the garden. Her ...,4.686182,5.0,6.032289,6.777805,7.482236,4.795238,0


In [69]:
torch_dataset2 = load_data(
    df2,
    text_cols,
    tokenizer,
    numerical_cols=numerical_cols,
    sep_text_token_str=tokenizer.sep_token,
    label_col = 'target'
)

In [71]:
predictions2 = trainer.predict(test_dataset=torch_dataset2).predictions

HBox(children=(FloatProgress(value=0.0, description='Prediction', max=1.0, style=ProgressStyle(description_wid…






In [72]:
predictions2

array([[-0.30773157],
       [-0.0828116 ],
       [-0.13975872],
       [-2.3608234 ],
       [-1.8328588 ],
       [-0.9699878 ],
       [ 0.45124972]], dtype=float32)

In [None]:
submission = pd.DataFrame()
submission['id'] = test_df['id']
submission['target'] = predictions2
submission.reset_index(inplace=True,drop=True)
submission.to_csv('submission.csv',index=False)