In [8]:
import spacy
import pandas as pd
from tqdm import tqdm
import string

import torch

# Register tqdm's pandas integration so that `progress_apply` (used by later cells) is available.
tqdm.pandas()
# spaCy pipeline used below to lemmatize summaries, source texts and keyphrases.
nlp = spacy.load('en_core_web_sm')

# Last expression of the cell: its value is displayed as the cell output.
# The KeyBART pipeline further down is constructed with device=0.
torch.cuda.is_available()

True

In [9]:
# Load the scored summaries, using the CSV's third column as the index.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv(
    '/home/jovyan/active-projects/summary-scoring/data/final_summaries_ai_aloe_fixed.csv',
    index_col=2,
)
# Skip the first two remaining columns (per the original author: garbage index
# columns left behind by repeatedly saving the frame to CSV with pandas).
df = df.iloc[:, 2:]
# Clear the index name (per the original author it was "Row.names", which
# appears to be the original index).
df = df.rename_axis(None)
df.head(1)

Unnamed: 0,filename,filename_clean,source_text,Main.Point,Details,Cohesion,Objective.Language,Wording.Para,Lang..Bey..ST,Summ..Length,content_pca,paraphrase_pca,text,source_text_clean,source_text_filename_clean,source
1,1091_CivilServices .txt,1091_CivilServices,CivilService.txt,0.5,0.5,0.5,0.5,0.5,0.5,0.5,1.37,0.785,hard work pays off / / \n,CivilService,11_CivilService,\nCivil service offers jobs to thousands of me...


In [10]:
# Lemmatize every summary: each text is run through spaCy (streamed via nlp.pipe,
# with a tqdm progress bar) and stored as a single space-joined string of lemmas.
df['text_lemmatized'] = [
    ' '.join(token.lemma_ for token in parsed)
    for parsed in nlp.pipe(tqdm(df['text']))
]

100%|██████████| 4690/4690 [00:39<00:00, 117.52it/s]


In [None]:
# One record per distinct source text, keyed by `source_text_filename_clean`:
#   {'text': <raw source text>, 'text_lemmatized': <space-joined spaCy lemmas>}
source_dicts = {}

for row in df.itertuples():
    source_key = row.source_text_filename_clean
    known = source_dicts.get(source_key)
    if known is None:
        # First row seen for this key: store the raw text and lemmatize it once.
        source_dicts[source_key] = {
            'text': row.source,
            'text_lemmatized': ' '.join(token.lemma_ for token in nlp(row.source)),
        }
    else:
        # Sanity check: `source_text_filename_clean` must uniquely identify the
        # source text, i.e. every row sharing a key carries the same `source`.
        assert known['text'] == row.source

In [28]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
)

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text2Text generation pipeline that returns keyphrases as a flat list of strings.

    The generated text is split on ``keyphrase_sep_token``; each fragment has
    ASCII punctuation removed and surrounding whitespace stripped, and fragments
    that end up empty are discarded.

    Args:
        model: Model identifier or path, passed to both
            ``AutoModelForSeq2SeqLM.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.
        keyphrase_sep_token: Separator between keyphrases in the generated text.
        *args, **kwargs: Forwarded unchanged to ``Text2TextGenerationPipeline``.
    """

    # Deletes every character in string.punctuation; built once instead of once
    # per keyphrase fragment.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        # NOTE(review): `truncation` / `max_length` are given to
        # `AutoTokenizer.from_pretrained` here; whether that actually truncates
        # inputs when the pipeline tokenizes text is not visible in this file --
        # confirm against the transformers documentation.
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=256,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs):
        """Turn the model output into a list of cleaned keyphrases.

        Only the first record returned by the parent ``postprocess`` is used.

        Returns:
            list[str]: Non-empty keyphrases with punctuation removed and
            surrounding whitespace stripped; ``[]`` when nothing was generated.
        """
        results = super().postprocess(model_outputs=model_outputs)
        if not results:
            # Previously `[...][0]` raised IndexError on an empty result list.
            return []

        generated_text = results[0].get('generated_text') or ''
        # Remove punctuation first and strip afterwards, so whitespace exposed by
        # the removed punctuation is trimmed as well.
        cleaned = (
            fragment.translate(self._PUNCTUATION_TABLE).strip()
            for fragment in generated_text.split(self.keyphrase_sep_token)
        )
        # Filter on the *cleaned* value: a whitespace-only fragment used to pass
        # the filter and come out as '', which is a substring of every summary
        # and therefore inflated the downstream keyphrase counts.
        return [keyphrase for keyphrase in cleaned if keyphrase]

In [29]:
# Length of the longest source text, measured in whitespace-separated words
# (str.split), not in tokenizer tokens.
print(
    'Maximum token count:',
    max(len(entry['text'].split()) for entry in source_dicts.values()),
)

Maximum token count: 684


In [30]:
# Build the KeyBART keyphrase pipeline on device 0.
pipe = KeyphraseGenerationPipeline(model='bloomberg/KeyBART', device=0)

# Generate keyphrases once per source text and store them next to the text.
for source in tqdm(source_dicts):
    entry = source_dicts[source]
    entry['keyterms_KeyBART'] = pipe(entry['text'])

100%|██████████| 101/101 [00:34<00:00,  2.93it/s]


In [31]:
def keyphrase_lower(row):
    """Count the source keyphrases that occur in the summary, ignoring case.

    Matching is a plain substring test of the lower-cased keyphrase against the
    lower-cased ``row.text``.

    Args:
        row: Object exposing ``source_text_filename_clean`` (key into the
            module-level ``source_dicts``) and ``text`` (the summary).

    Returns:
        int: Number of entries in the source's ``'keyterms_KeyBART'`` list found
        in the summary.
    """
    source_entry = source_dicts[row.source_text_filename_clean]
    summary_text = row.text
    return sum(
        1
        for phrase in source_entry['keyterms_KeyBART']
        if phrase.lower() in summary_text.lower()
    )

# Row-wise count of case-insensitive keyphrase matches in each summary.
df['keyterms_KeyBART_lower'] = df.apply(keyphrase_lower, axis=1)

In [34]:
def keyphrase_lemma(row, split_phrases=False):
    """Count lemmatized source keyphrases that occur in the lemmatized summary.

    The source's ``'keyterms_KeyBART'`` entries are lemmatized with the
    module-level ``nlp`` pipeline, then matched by lower-cased substring test
    against ``row.text_lemmatized``.

    Args:
        row: Object exposing ``source_text_filename_clean`` (key into the
            module-level ``source_dicts``) and ``text_lemmatized``.
        split_phrases: When False, each keyphrase is matched as one
            space-joined string of lemmas. When True, every token lemma of every
            keyphrase is matched (and counted) individually.

    Returns:
        int: Number of keyphrases (or individual lemmas) found in the summary.
    """
    source_phrases = source_dicts[row.source_text_filename_clean]['keyterms_KeyBART']

    needles = []
    for parsed in nlp.pipe(source_phrases):
        token_lemmas = [token.lemma_ for token in parsed]
        if split_phrases:
            # One search term per token.
            needles.extend(token_lemmas)
        else:
            # One search term per keyphrase.
            needles.append(' '.join(token_lemmas))

    haystack = row.text_lemmatized

    return sum(1 for needle in needles if needle.lower() in haystack.lower())

# Row-wise count of whole lemmatized keyphrases found in each lemmatized summary.
df['keyterms_KeyBART_lemma'] = df.progress_apply(keyphrase_lemma, axis=1)

100%|██████████| 4690/4690 [00:41<00:00, 112.68it/s]


In [35]:
# Same as above, but every token lemma of every keyphrase is matched on its own.
df['keyterms_KeyBART_lemma_split'] = df.progress_apply(
    lambda summary_row: keyphrase_lemma(summary_row, split_phrases=True),
    axis=1,
)

100%|██████████| 4690/4690 [00:41<00:00, 111.69it/s]


In [39]:
def overlapping_words(row):
    """Count the source-text lemma tokens that also appear in the summary.

    Both texts are split on whitespace. Every token of the source text is
    checked, so a lemma that occurs several times in the source is counted once
    per occurrence.

    Args:
        row: Object exposing ``source_text_filename_clean`` (key into the
            module-level ``source_dicts``) and ``text_lemmatized``.

    Returns:
        int: Number of source tokens whose lemma is present in the summary.
    """
    source_lemmas = source_dicts[row.source_text_filename_clean]['text_lemmatized'].split()

    # A set gives O(1) membership tests; the original scanned the summary token
    # list once per source token. The resulting count is unchanged.
    summary_vocab = set(row.text_lemmatized.split())

    return sum(1 for lemma in source_lemmas if lemma in summary_vocab)

# Row-wise count of source lemma tokens that also occur in the summary.
df['lemmas'] = df.progress_apply(overlapping_words, axis=1)

100%|██████████| 4690/4690 [00:01<00:00, 2753.76it/s]


In [40]:
# Pairwise correlations between the two score columns and the overlap features.
df.loc[:, [
    'content_pca',
    'paraphrase_pca',
    'keyterms_KeyBART_lower',
    'keyterms_KeyBART_lemma',
    'keyterms_KeyBART_lemma_split',
    'lemmas',
]].corr()

Unnamed: 0,content_pca,paraphrase_pca,keyterms_KeyBART_lower,keyterms_KeyBART_lemma,keyterms_KeyBART_lemma_split,lemmas
content_pca,1.0,0.6606,0.211796,0.223199,0.27735,0.27916
paraphrase_pca,0.6606,1.0,-0.005417,0.000403,0.10917,-0.018682
keyterms_KeyBART_lower,0.211796,-0.005417,1.0,0.978244,0.514628,0.339762
keyterms_KeyBART_lemma,0.223199,0.000403,0.978244,1.0,0.522354,0.324745
keyterms_KeyBART_lemma_split,0.27735,0.10917,0.514628,0.522354,1.0,0.29805
lemmas,0.27916,-0.018682,0.339762,0.324745,0.29805,1.0


In [30]:
# Distribution of content_pca for each keyphrase-match count.
# NOTE(review): no visible cell creates a 'keyterms_KeyBART' column (the cells
# above add 'keyterms_KeyBART_lower', 'keyterms_KeyBART_lemma' and
# 'keyterms_KeyBART_lemma_split'), and this cell's execution count is out of
# order. Unless the column is created elsewhere, this raises KeyError on a
# fresh Restart & Run All -- confirm which feature was intended.
df.groupby('keyterms_KeyBART')['content_pca'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
keyterms_KeyBART,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1241.0,7.17137,2.446901,0.76,5.48,7.5,9.21,10.96
1,1662.0,8.124747,1.908272,0.76,6.85,8.22,9.59,10.96
2,1149.0,8.502115,1.670218,0.76,7.4,8.66,9.86,10.96
3,470.0,8.360128,1.553779,2.74,7.1925,8.49,9.59,10.96
4,147.0,8.290204,1.597337,4.05,7.23,8.6,9.59,10.58
5,18.0,9.388889,1.45168,6.3,8.67,9.62,10.4925,10.96
6,2.0,9.64,0.0,9.64,9.64,9.64,9.64,9.64
7,1.0,10.96,,10.96,10.96,10.96,10.96,10.96


In [23]:
import statsmodels.api as sm

# Ordinary least squares: content_pca regressed on the split-lemma keyphrase
# count, with an intercept column added by add_constant.
mod = sm.OLS(
    endog=df['content_pca'],
    exog=sm.add_constant(df['keyterms_KeyBART_lemma_split']),
)

res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:            content_pca   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.077
Method:                 Least Squares   F-statistic:                     390.7
Date:                Sun, 26 Mar 2023   Prob (F-statistic):           1.38e-83
Time:                        19:25:43   Log-Likelihood:                -9809.4
No. Observations:                4690   AIC:                         1.962e+04
Df Residuals:                    4688   BIC:                         1.964e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

In [37]:
# Ordinary least squares: paraphrase_pca regressed on the keyphrase count, with
# an intercept column added by add_constant.
# NOTE(review): 'keyterms_KeyBART' is not created by any visible cell (only the
# '_lower', '_lemma' and '_lemma_split' variants are) -- confirm the intended
# column; this fails on a fresh kernel if the column does not exist.
mod = sm.OLS(
    endog=df['paraphrase_pca'],
    exog=sm.add_constant(df['keyterms_KeyBART']),
)

res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:         paraphrase_pca   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.1376
Date:                Sun, 26 Mar 2023   Prob (F-statistic):              0.711
Time:                        17:47:36   Log-Likelihood:                -7684.8
No. Observations:                4690   AIC:                         1.537e+04
Df Residuals:                    4688   BIC:                         1.539e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                3.5209      0.028  