In [24]:
from transformers import AutoTokenizer, LongformerForSequenceClassification, pipeline
import pandas as pd
import torch

In [3]:
summary = "Here is an essay about economics. It is really interesting and I like to write about economics. Economics is the best in the whole world ever."
source = "Economics is the study of money. Money is fun but also scary. /n Economics is fun!"

In [4]:
tokenizer = AutoTokenizer.from_pretrained('allenai/longformer-base-4096')
wording_model = LongformerForSequenceClassification.from_pretrained('tiedaar/longformer-wording-global', num_labels=1)
content_model = LongformerForSequenceClassification.from_pretrained('tiedaar/longformer-content-global', num_labels=1)

In [5]:
def inference(summary, source, model):
    combined = summary + '</s>' + source
    context = tokenizer(combined)
    sep_index = context['input_ids'].index(2)
    context['global_attention_mask'] = [1]*(sep_index + 1) + [0]*(len(context['input_ids'])-(sep_index + 1))
    inputs = {}
    for key in context:
        inputs[key] = torch.tensor([context[key]])
    return float(model(**inputs)['logits'][0][0])

In [18]:
DATA = '../data/'
summaries_df = pd.read_csv(DATA + 'final_summaries_ai_aloe_fixed.csv').drop(columns = ['Unnamed: 0','Unnamed: 0.1'])


from sklearn.preprocessing import StandardScaler
import numpy as np
  
# copy the data
df_normalized = summaries_df.copy()
  
# apply normalization techniques
df_normalized['content_pca'] = StandardScaler().fit_transform(np.array(df_normalized['content_pca']).reshape(-1,1))
df_normalized['paraphrase_pca'] = StandardScaler().fit_transform(np.array(df_normalized['paraphrase_pca']).reshape(-1,1))  


source_texts = df_normalized['source_text_filename_clean'].value_counts().to_frame().reset_index()
texts_to_remove = list(source_texts.iloc[15:31]['source_text_filename_clean'])

test_df = df_normalized[df_normalized['source_text_filename_clean'].isin(texts_to_remove)]
train_df = df_normalized[df_normalized['source_text_filename_clean'].isin(texts_to_remove) == False]
print('test n:', len(test_df))
print('train n:', len(train_df))

test n: 703
train n: 3987


In [20]:
test_df['content_global_pred'] = test_df.apply(lambda row: inference(row['text'], row['source'], content_model), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['content_global_pred'] = test_df.apply(lambda row: inference(row['text'], row['source'], content_model), axis=1)


In [21]:
test_df['wording_global_pred'] = test_df.apply(lambda row: inference(row['text'], row['source'], wording_model), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['wording_global_pred'] = test_df.apply(lambda row: inference(row['text'], row['source'], wording_model), axis=1)


In [25]:
wording_pipe = pipeline('text-classification', model='tiedaar/summary-longformer-wording', function_to_apply="none", truncation=True)
content_pipe = pipeline('text-classification', model='tiedaar/summary-longformer-content', function_to_apply="none", truncation=True)
def getWordingScore(summary, source):
    text = summary + '</s>' + source
    return wording_pipe(text)[0]['score']

def getContentScore(summary, source):
    text = summary + '</s>' + source
    return content_pipe(text)[0]['score']


Some weights of the model checkpoint at tiedaar/summary-longformer-wording were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at tiedaar/summary-longformer-content were not used when initializing LongformerForSequenceClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another 

In [26]:
test_df['content_pred'] = test_df.apply(lambda row: getContentScore(row['text'], row['source']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['content_pred'] = test_df.apply(lambda row: getContentScore(row['text'], row['source']), axis=1)


In [27]:
test_df['wording_pred'] = test_df.apply(lambda row: getWordingScore(row['text'], row['source']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['wording_pred'] = test_df.apply(lambda row: getWordingScore(row['text'], row['source']), axis=1)


In [28]:
test_df[['content_pca', 'paraphrase_pca', 'content_pred', 'wording_pred', 'content_global_pred', 'wording_global_pred']].corr()

Unnamed: 0,content_pca,paraphrase_pca,content_pred,wording_pred,content_global_pred,wording_global_pred
content_pca,1.0,0.710421,0.881808,0.737226,0.907282,0.68189
paraphrase_pca,0.710421,1.0,0.670423,0.822417,0.696,0.836284
content_pred,0.881808,0.670423,1.0,0.793873,0.944221,0.723264
wording_pred,0.737226,0.822417,0.793873,1.0,0.785188,0.917834
content_global_pred,0.907282,0.696,0.944221,0.785188,1.0,0.741664
wording_global_pred,0.68189,0.836284,0.723264,0.917834,0.741664,1.0
