In [1]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import pandas as pd

In [9]:
# Load the filings to be summarised; the cells below read the 'Content' column.
train_csv = 'train_set.csv'
df = pd.read_csv(train_csv)
# Optional while iterating: restrict to a small slice of rows, e.g.
# df = df.iloc[40:60, :]

## Sumy (LexRank extractive summarization)

In [11]:
import nltk

# Fetch the Punkt tokenizer data only when it is not already installed, so
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lisbo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [12]:
def sumy_summarize(text, sentence_count=5, language="english"):
    """Return an extractive LexRank summary of ``text``.

    Parameters
    ----------
    text : str
        Document to summarise. Anything that is not a non-blank string
        (for example the float NaN pandas uses for a missing cell) yields
        an empty summary instead of being handed to the parser.
    sentence_count : int, default 5
        Number of sentences requested from the summarizer.
    language : str, default "english"
        Language name passed to sumy's ``Tokenizer``.

    Returns
    -------
    str
        The selected sentences joined with single spaces, or ``""`` when
        there is nothing to summarise.
    """
    # Guard first: missing values arrive from ``Series.apply`` as non-strings.
    if not isinstance(text, str) or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Summarise every filing with LexRank and store the result next to its source text.
sumy_summaries = df['Content'].apply(sumy_summarize)
df['summary_sumy'] = sumy_summaries


In [13]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

Unnamed: 0,Ticker,Accession Number,Date,Content,summary_sumy
0,LANC,0000057515-22-000012,20220817,"Item8.01 Other EventsOn August 17, 2022, Lanca...","Item8.01 Other EventsOn August 17, 2022, Lanca..."
1,LANC,0000057515-22-000024,20221110,Item5.07 Submission of Matters to a Vote of Se...,Item5.07 Submission of Matters to a Vote of Se...
2,LANC,0000057515-20-000023,20200827,Item2.02 Results of Operations and Financial C...,Item2.02 Results of Operations and Financial C...
3,LANC,0000057515-21-000020,20211103,Item2.02 Results of Operations and Financial C...,Item2.02 Results of Operations and Financial C...
4,LANC,0000057515-20-000014,20200505,Item2.02 Results of Operations and Financial C...,Item2.02 Results of Operations and Financial C...


## BERT (extractive summarization)

In [14]:
from summarizer import Summarizer

# Build the extractive summarizer once and reuse it for every row.
# The cell output shows this step fetching and loading the
# `bert-large-uncased` checkpoint (about 1.34 GB on first download).
bert_model = Summarizer()

def bert_summarize(text, num_sentences=5):
    """Return an extractive summary of ``text`` using the shared ``bert_model``.

    Parameters
    ----------
    text : str
        Document to summarise. Anything that is not a non-blank string
        (for example the float NaN pandas uses for a missing cell) yields
        an empty summary instead of being handed to the model.
    num_sentences : int, default 5
        Number of sentences requested from the summarizer.

    Returns
    -------
    str
        Whatever ``bert_model`` returns for the text, or ``""`` when there
        is nothing to summarise.
    """
    # Same guard as the other summarizers so all three columns treat gaps alike.
    if not isinstance(text, str) or not text.strip():
        return ""
    return bert_model(text, num_sentences=num_sentences)

# Summarise every filing with the BERT extractive model, one row at a time.
bert_summaries = df['Content'].apply(bert_summarize)
df['summary_bert'] = bert_summaries

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]



## BART (abstractive summarization)

In [10]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Checkpoint identifier shared by the tokenizer and the model so the two stay in sync.
bart_model_name = 'facebook/bart-large-cnn'
# Both objects are created once here and reused by bart_summarize for every row.
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

def bart_summarize(text, max_input_tokens=1024, num_beams=4, min_length=30, max_length=250):
    """Return a summary of ``text`` generated by the shared BART model.

    Parameters
    ----------
    text : str
        Document to summarise. Anything that is not a non-blank string
        (for example the float NaN pandas uses for a missing cell) yields
        an empty summary rather than being tokenized and sent to the model.
    max_input_tokens : int, default 1024
        Tokenizer ``max_length``; longer inputs are truncated to this size.
    num_beams : int, default 4
        Beam width passed to ``generate``.
    min_length, max_length : int, default 30 and 250
        Length bounds passed to ``generate``.

    Returns
    -------
    str
        The first generated sequence decoded with special tokens removed,
        or ``""`` when there is nothing to summarise.
    """
    # Without this guard a blank document would still be sent to generate().
    if not isinstance(text, str) or not text.strip():
        return ""

    inputs = bart_tokenizer([text], max_length=max_input_tokens, return_tensors='pt', truncation=True)
    summary_ids = bart_model.generate(
        inputs.input_ids,
        # Pass the mask explicitly instead of leaving generate() to infer it.
        attention_mask=inputs.attention_mask,
        num_beams=num_beams,
        min_length=min_length,
        max_length=max_length,
        early_stopping=True,
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate a BART summary for every filing, one row at a time.
bart_summaries = df['Content'].apply(bart_summarize)
df['summary_bart'] = bart_summaries


In [11]:
# Show the first five rows to check the new summary column.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Ticker,Accession Number,Date,Content,Market Ajusted Return,summary_bart
0,0,AAP,0001158449-19-000025,2019-01-29,Item2.04. Triggering Events That Accelerate or...,-0.010733,"On January 29, 2019, Advance Auto Parts, Inc. ..."
1,1,AMBC,0000874501-20-000012,2020-01-23,Item3.01 Notice of Delisting or Failure to Sat...,0.01378,"Ambac Financial Group, Inc. (the “Company”), a..."
2,2,AMBC,0000874501-22-000149,2022-10-13,Item1.01 Entry into a Material Definitive Agre...,-0.042704,Ambac Assurance Corporation (“AAC”) entered in...
3,3,APPS,0001628280-22-014669,2022-05-17,Item2.02. Results of Operations and Financial ...,-0.39873,"Digital Turbine, Inc. issued a press release r..."
4,4,ATEN,0001580808-18-000021,2018-04-06,Item3.01.Notice of Delisting or Failure to Sat...,-0.003451,"A10 Networks, Inc. received a notice from the ..."
5,5,ATEN,0001628280-18-008978,2018-07-02,Item4.02 Non-Reliance on Previously Issued Fin...,0.076433,A10 Networks announced today that the Audit Co...
6,6,AX,0001299709-18-000113,2018-09-12,Item3.01. Notice of Delisting or Failure to Sa...,-0.037189,"BofI Holding, Inc. (the “Company”) provided wr..."
7,7,AX,0001299709-21-000024,2021-02-24,Item2.04 Triggering Events that Accelerate or ...,0.0486,"Axos Financial, Inc. (the “Company”) caused no..."
8,8,AZZ,0000008947-18-000030,2018-03-29,Item2.02 Results of Operations and Financial C...,0.038359,AZZ Inc. issued a press release announcing its...
9,9,CLDT,0001476045-20-000043,2020-03-26,Item5.01. Departure of Directors or Certain Of...,-0.195312,"Effective March 27, 2020, Jeffrey H. Fisher, C..."


In [12]:
# index=False keeps the row index out of the file; written with the index, it
# comes back as an extra 'Unnamed: 0' column the next time the CSV is read.
df.to_csv("summary.csv", index=False)