In [1]:
# Mount Google Drive at /content/drive; the dataset CSVs and the fine-tuned
# checkpoint used by later cells are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split

# Read the training and validation splits from the mounted Drive folder
# into two DataFrames (`df` = train, `df_val` = validation).
df, df_val = map(
    pd.read_csv,
    (
        "/content/drive/MyDrive/abstractive-summarization-data/datasets/train.csv",
        "/content/drive/MyDrive/abstractive-summarization-data/datasets/val.csv",
    ),
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def _rejoin_sentences(raw_text):
    """Split ``raw_text`` into sentences with NLTK and join them back with single spaces."""
    return ' '.join(sent_tokenize(raw_text))


# Apply the split-and-rejoin pass to both text columns of both splits.
# The columns end up as plain strings again (not lists of sentences);
# presumably the point is to normalise the whitespace between sentences.
for frame in (df, df_val):
    for column in ('Abstract', 'RHS'):
        frame[column] = frame[column].apply(_rejoin_sentences)


In [4]:
def _to_t5_frame(raw_frame):
    """Return a two-column copy of ``raw_frame`` in the layout the training/eval code expects.

    Steps, in order: drop ``FileName``, rename ``Abstract`` -> ``source_text`` and
    ``RHS`` -> ``target_text``, keep only those two columns, lowercase both, and
    prepend the ``"summarize: "`` task prefix to every ``source_text``.
    """
    return (
        raw_frame
        .drop(columns='FileName')
        .rename(columns={"Abstract": "source_text", "RHS": "target_text"})
        [['source_text', 'target_text']]
        .assign(
            source_text=lambda frame: "summarize: " + frame['source_text'].str.lower(),
            target_text=lambda frame: frame['target_text'].str.lower(),
        )
    )


df = _to_t5_frame(df)
df_val = _to_t5_frame(df_val)

# Last expression of the cell: display the prepared validation frame.
df_val

Unnamed: 0,source_text,target_text
0,summarize: human face can be seen as a soft t...,we model the deformation of the human face due...
1,summarize: in this paper we use a numerical p...,bifurcation and postbifurcation of inflated hy...
2,summarize: modularisation product platforms p...,existing methods in modular product family dev...
3,summarize: in order to investigate the micros...,a drx model of fgh96 of ifw process is establi...
4,summarize: an efficient approach is proposed ...,propose a pragmatic approach for simulating co...
...,...,...
95,summarize: this paper proposes a strategy for...,efficient strategy for gpu computing of fgfea ...
96,summarize: a family of spatial beam finite el...,we analyse fixed pole approach in geometricall...
97,summarize: a new adaptive multiscale method i...,a new adaptive multiscale method amm is develo...
98,summarize: a nonlocal extension of the damage...,a new nonlocal damage plasticity model has bee...


In [5]:
%%capture
!pip install --upgrade simplet5

In [6]:
# import
from simplet5 import SimpleT5

# Create the SimpleT5 wrapper; this single `model` object is reused by every
# later cell (prediction helper, validation loop).
model = SimpleT5()

# Load the base "t5-small" checkpoint (simplet5 supports t5, mt5 and byT5 model types).
# NOTE(review): a later cell calls model.load_model(...) with a fine-tuned
# checkpoint — presumably that replaces what is loaded here; confirm whether
# this step is still needed while the training cell stays commented out.
model.from_pretrained("t5","t5-small")


INFO:pytorch_lightning.utilities.seed:Global seed set to 42


In [7]:
# model.train(train_df=df, # pandas dataframe with 2 columns: source_text & target_text
#             eval_df=df_val, # pandas dataframe with 2 columns: source_text & target_text
#             source_max_token_len = 512, 
#             target_max_token_len = 128,
#             batch_size = 32,
#             max_epochs = 10,
#             use_gpu = True,
#             outputdir = "/kaggle/working/",
#             early_stopping_patience_epochs = 0,
#             precision = 32
# )

In [8]:
# Load the checkpoint stored under ".../t5-small-model-finetuned" on Drive and
# request GPU inference. The training cell above is commented out, so this is
# the model state the prediction cells below run against.
model.load_model("t5","/content/drive/MyDrive/abstractive-summarization-data/t5-small-model-finetuned", use_gpu=True)

In [9]:
%%capture
!pip install pytextrank
!python -m spacy download en_core_web_sm
!pip install --upgrade scipy networkx

In [10]:
import spacy
import pytextrank

# Shared spaCy + TextRank pipeline, built lazily on first use. Loading the
# spaCy model is the expensive part and does not depend on the input text,
# so it is done once per kernel instead of once per call.
_TEXTRANK_PIPELINE = None


def _get_textrank_pipeline():
    """Return the cached "en_core_web_sm" pipeline with the "textrank" component appended."""
    global _TEXTRANK_PIPELINE
    if _TEXTRANK_PIPELINE is None:
        nlp = spacy.load("en_core_web_sm")
        nlp.add_pipe("textrank", last=True)
        # Publish only after the pipe was added, so a failed add_pipe is retried next call.
        _TEXTRANK_PIPELINE = nlp
    return _TEXTRANK_PIPELINE


def extract_important_sentences(text, limit_phrases=15, limit_sentences=5):
    """Return an extractive summary of ``text`` produced by pytextrank.

    Args:
        text: The document to summarise.
        limit_phrases: Forwarded to ``doc._.textrank.summary`` as ``limit_phrases``.
        limit_sentences: Forwarded to ``doc._.textrank.summary`` as ``limit_sentences``.

    Returns:
        A single string in which every selected sentence is followed by one
        space (so a non-empty result ends with a trailing space); an empty
        string when no sentence is selected.
    """
    doc = _get_textrank_pipeline()(text)
    selected = doc._.textrank.summary(
        limit_phrases=limit_phrases, limit_sentences=limit_sentences
    )
    return "".join(sent.text + " " for sent in selected)

In [None]:
!pip install textblob

In [37]:
def create_summaries(text):
    """Print two T5 summaries of ``text`` together with word counts.

    The first summary comes from feeding the lowercased abstract straight to
    ``model.predict``; the second from first reducing the abstract with
    ``extract_important_sentences(text, 20, 6)`` and summarising that. In both
    cases the input is prefixed with ``"summarize: "`` and only the first
    prediction is used. Everything is printed; nothing is returned.
    """
    task_prefix = "summarize: "

    def word_count(passage):
        # Whitespace-separated token count, already converted for concatenation.
        return str(len(passage.split()))

    print("ACTUAL ABSTRACT - " + text)
    print("\nLength of Abstract = " + word_count(text))

    # Route 1: abstractive summary of the full abstract.
    direct_summary = model.predict(task_prefix + text.lower())[0]
    print("\nDIRECT SUMMARIZATION USING T5 - " + direct_summary)
    print("\nLength of Summary = " + word_count(direct_summary))

    # Route 2: extractive pre-selection, then abstractive summary of the selection.
    key_sentences = extract_important_sentences(text, 20, 6)
    two_stage_summary = model.predict(task_prefix + key_sentences.lower())[0]
    print("\nSUMMARIZATION AFTER EXTRACTIVE USING T5 - " + two_stage_summary)
    print("\nLength of Summary = " + word_count(two_stage_summary))

In [39]:
# Sample abstract used as a manual smoke test for create_summaries in the next cell.
text = """Having great edge over traditional semiconductor-based devices, heterostructures are widely used as a basic building block in the advanced semiconductor devices owing to their essential and attractive structural, interfacial, and electronic properties. In general, heterostructures have a comparatively better electrical and optical performance with respect to the layers of individual materials. The study of semiconductor heterojunctions is a step towards utilization of more and more semiconducting materials for the fabrication of heterojunction devices. With the motive to understand basics of such devices, this paper discusses about the main concept of heterojunctions, heterostructures and their classifications in detail. Moving forward, we finally finish off this term paper with the applications of Heterojunctions, by discussing about Lasers and Solar Cells and the references"""

In [40]:
# Print the direct and the extract-then-summarise T5 outputs for the sample abstract in `text`.
create_summaries(text)

ACTUAL ABSTRACT - Having great edge over traditional semiconductor-based devices, heterostructures are widely used as a basic building block in the advanced semiconductor devices owing to their essential and attractive structural, interfacial, and electronic properties. In general, heterostructures have a comparatively better electrical and optical performance with respect to the layers of individual materials. The study of semiconductor heterojunctions is a step towards utilization of more and more semiconducting materials for the fabrication of heterojunction devices. With the motive to understand basics of such devices, this paper discusses about the main concept of heterojunctions, heterostructures and their classifications in detail. Moving forward, we finally finish off this term paper with the applications of Heterojunctions, by discussing about Lasers and Solar Cells and the references

Length of Abstract = 121

DIRECT SUMMARIZATION USING T5 - the main concept of heterojunction

In [15]:
# import shutil
# shutil.make_archive('yeet', 'zip', '/kaggle/working/simplet5-epoch-9-train-loss-2.4841-val-loss-2.5633')

In [16]:
# Run the model over every validation source text and keep the first prediction
# for each. Predictions are collected in a list and the frame is built once,
# instead of growing a DataFrame one `.loc` row at a time.
# NOTE(review): the captured run logged a tokenizer warning for a 571-token
# input (> 512); over-length inputs are passed to model.predict unchanged here —
# confirm whether they should be truncated first.
predicted_texts = [
    model.predict(source_text)[0] for source_text in df_val["source_text"]
]

# Same columns and same row labels as before: one row per validation example,
# reference summary next to the model output.
pred_df = pd.DataFrame(
    {
        "target_text": df_val["target_text"].tolist(),
        "predicted_text": predicted_texts,
    },
    index=df_val.index,
)

pred_df

Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,target_text,predicted_text
0,we model the deformation of the human face due...,a new approach to evaluate muscle contribution...
1,bifurcation and postbifurcation of inflated hy...,we propose a numerical procedure to analyse bi...
2,existing methods in modular product family dev...,we propose a novel method for modularisation o...
3,a drx model of fgh96 of ifw process is establi...,the dynamic recrystallization kinetic model of...
4,propose a pragmatic approach for simulating co...,an efficient approach is proposed to predict t...
...,...,...
95,efficient strategy for gpu computing of fgfea ...,we propose a strategy for the efficient implem...
96,we analyse fixed pole approach in geometricall...,a family of spatial beam finite elements based...
97,a new adaptive multiscale method amm is develo...,a new adaptive multiscale method is proposed t...
98,a new nonlocal damage plasticity model has bee...,a nonlocal extension of the damage plasticity ...


In [17]:
!pip install py-rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py-rouge
  Downloading py_rouge-1.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: py-rouge
Successfully installed py-rouge-1.1


In [18]:
import rouge

def prepare_results(p, r, f, metric_name=None):
    """Format one metric's precision/recall/F1 as a tab-separated report line.

    Args:
        p: Precision as a fraction; printed multiplied by 100.
        r: Recall as a fraction; printed multiplied by 100.
        f: F1 as a fraction; printed multiplied by 100.
        metric_name: Label printed at the start of the line (e.g. the key of
            the scores dict). When omitted, the module-level ``metric``
            variable is used, which keeps three-argument calls working.

    Returns:
        The formatted line, starting with a tab.
    """
    if metric_name is None:
        # Legacy path: rely on the loop variable leaked by the evaluation loop.
        metric_name = metric
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric_name, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


# The compared columns are the same for every aggregation mode, so select them once.
all_hypothesis = pred_df['predicted_text']
all_references = pred_df['target_text']

# Score predictions against references twice: averaged and best-match aggregation.
# NOTE(review): with a single reference per prediction both modes printed
# identical numbers in the captured run — presumably they only differ when
# several references exist per hypothesis.
for aggregator in ['Avg', 'Best']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                           max_n=2,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    scores = evaluator.get_scores(all_hypothesis, all_references)

    # Report metrics in alphabetical order of their name.
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best: # value is a type of list as we evaluate each summary vs each reference
            # Only reached if an aggregator other than 'Avg'/'Best' is added to the list above.
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                    print('\t' + prepare_results(results_per_ref['p'][reference_id], results_per_ref['r'][reference_id], results_per_ref['f'][reference_id], metric))
            print()
        else:
            print(prepare_results(results['p'], results['r'], results['f'], metric))
    print()

Evaluation with Avg
	rouge-1:	P: 42.16	R: 49.52	F1: 44.57
	rouge-2:	P: 18.54	R: 21.66	F1: 19.53
	rouge-l:	P: 36.00	R: 41.06	F1: 37.77
	rouge-w:	P: 22.85	R: 12.14	F1: 15.46

Evaluation with Best
	rouge-1:	P: 42.16	R: 49.52	F1: 44.57
	rouge-2:	P: 18.54	R: 21.66	F1: 19.53
	rouge-l:	P: 36.00	R: 41.06	F1: 37.77
	rouge-w:	P: 22.85	R: 12.14	F1: 15.46



In [19]:
# Second manual check of create_summaries, on an abstract passed inline as a string literal.
create_summaries("""The study objective is to contemplate the effectiveness of COVID-19 on the air pollution of Indian territory from January 2020 to April 2020. We have executed data from European Space Agency (ESA) and CPCB online portal for air quality data dissemination. The Sentinel e 5 P satellite images elucidate that the Air quality of Indian territory has been improved significantly during COVID-19. Mumbai and Delhi are one of the most populated cities. These two cities have observed a substantial decrease in Nitrogen Dioxide (40e50%) compared to the same period last year. It suggests that the emergence of COVID-19 has been proved to a necessary evil as being advantageous for mitigating air pollution on Indian territory during the lock-down. The study found a significant decline in Nitrogen Dioxide in reputed states of India, i.e., Delhi and Mumbai. Moreover, a faded track of Nitrogen Dioxide can be seen at the Maritime route in the Indian Ocean. An upsurge in the environmental quality of India will also be beneficial for its neighbor countries, i.e., China, Pakistan, Iran, and Afghanistan.""")

ACTUAL ABSTRACT - The study objective is to contemplate the effectiveness of COVID-19 on the air pollution of Indian territory from January 2020 to April 2020. We have executed data from European Space Agency (ESA) and CPCB online portal for air quality data dissemination. The Sentinel e 5 P satellite images elucidate that the Air quality of Indian territory has been improved significantly during COVID-19. Mumbai and Delhi are one of the most populated cities. These two cities have observed a substantial decrease in Nitrogen Dioxide (40e50%) compared to the same period last year. It suggests that the emergence of COVID-19 has been proved to a necessary evil as being advantageous for mitigating air pollution on Indian territory during the lock-down. The study found a significant decline in Nitrogen Dioxide in reputed states of India, i.e., Delhi and Mumbai. Moreover, a faded track of Nitrogen Dioxide can be seen at the Maritime route in the Indian Ocean. An upsurge in the environmental