In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 42.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 36.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from transformers import pipeline

In [2]:
# Fetch the 20 Newsgroups corpus: every subset, shuffled with a fixed seed,
# with headers, footers and quoted replies removed from each post.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# One row per post: the raw text and its numeric newsgroup label.
df = pd.DataFrame(list(dataset.data), columns=['text'])
df["label"] = list(dataset.target)

# Numeric label -> newsgroup name. Built from dataset.target_names itself
# instead of a hard-coded range(20), so it always covers exactly the names
# the dataset provides.
label_mapping = dict(enumerate(dataset.target_names))

Extractive summarization using spaCy: sentences are scored by the normalized frequency of the words they contain

In [4]:
_SPACY_MODEL_CACHE = {}


def _load_spacy_model(name='en_core_web_sm'):
    """Return the spaCy pipeline called `name`, loading it only on first use."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def summarize1(text, per):
    """Build an extractive summary of `text` from its highest-scoring sentences.

    Every word that is not a stop word, punctuation or whitespace is counted
    case-insensitively and its count is divided by the largest count. A
    sentence's score is the sum of the normalized counts of the words it
    contains.

    Args:
        text: The text to summarize.
        per: Fraction of the sentences to keep; the number of sentences kept
            is ``int(number_of_sentences * per)``, which may be 0 for short
            texts or small fractions.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not in document order). An empty string is returned
        when the text contains no scorable words or no sentence is selected.
    """
    doc = _load_spacy_model()(text)

    # Count words under their lower-cased form so that the sentence-scoring
    # lookup below (which is also lower-cased) finds capitalised words too.
    word_frequencies = {}
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        word = token.text.lower()
        if word in STOP_WORDS or word in punctuation:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Nothing to score (empty text, or only stop words / punctuation).
    if not word_frequencies:
        return ''

    max_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / max_frequency for word, count in word_frequencies.items()
    }

    sentences = list(doc.sents)
    sentence_scores = {}
    for sent in sentences:
        for token in sent:
            weight = word_frequencies.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    select_length = int(len(sentences) * per)
    best_sentences = nlargest(select_length, sentence_scores, key=sentence_scores.get)

    # Join with a space: a sentence's text carries no trailing whitespace, so
    # joining on '' would glue consecutive sentences together.
    return ' '.join(sent.text for sent in best_sentences)

Abstractive summarization using Facebook's BART-large-CNN model

In [5]:
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name="facebook/bart-large-cnn"):
  """Return the summarization pipeline for `model_name`, building it only once."""
  if model_name not in _SUMMARIZER_CACHE:
    _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
  return _SUMMARIZER_CACHE[model_name]


def summarize2(text, per):
  """Summarize `text` with the facebook/bart-large-cnn summarization pipeline.

  Args:
    text: The text to summarize.
    per: Unused. Kept so existing callers that pass a fraction keep working;
      the summary length is controlled by the fixed max_length / min_length
      arguments passed to the pipeline instead.

  Returns:
    Whatever the pipeline returns for `text`. Callers in this notebook read
    the summary as ``result[0]['summary_text']``.
  """
  # The pipeline is cached across calls instead of being rebuilt every time.
  # The previous spaCy parse was dropped: it only fed a value that was never used.
  summarizer = _load_summarizer()
  return summarizer(text, max_length=130, min_length=30)

In [None]:
# Cluster summarization: write one summary file per newsgroup label.
#
# The posts of a label are packed into batches of roughly MAX_BATCH_CHARS
# characters. Each batch is summarized with summarize2 and the batch summaries
# of a label are joined into that label's "<label>_summary.txt" file.
MAX_BATCH_CHARS = 2700  # character budget for the text sent in one summarizer call
FALLBACK_CHARS = 1500   # prefix length retried when summarizing a full batch fails


def _summarize_batch(batch):
  """Return the summary text for one batch, retrying on a shorter prefix on failure."""
  try:
    result = summarize2(batch, 0.01)
  except Exception:
    # Best-effort retry on a shorter prefix of the batch. Catching Exception
    # (not a bare except) keeps Ctrl-C / kernel interrupts working.
    result = summarize2(batch[:FALLBACK_CHARS], 0.01)
  return result[0]['summary_text']


# Visit each label once; iterating over the raw label column would reprocess
# a cluster once per post that belongs to it.
clusters = sorted(df['label'].unique())
for cluster in clusters:
  # Start from scratch for every label so a file only holds its own summaries.
  batch_summaries = []
  batch = ""
  for doc in df.loc[df['label'] == cluster, 'text']:
    if not doc.strip():
      continue  # posts left empty after header/footer/quote removal add nothing
    # Flush the current batch before it would reach the budget. The `batch`
    # check avoids sending an empty string to the model when the very first
    # post is already over budget.
    if batch and len(batch) + len(doc) >= MAX_BATCH_CHARS:
      batch_summaries.append(_summarize_batch(batch))
      batch = ""
    batch += doc + ". "
  # Summarize whatever is left over after the last post of the label.
  if batch:
    batch_summaries.append(_summarize_batch(batch))

  final_summary = " ".join(batch_summaries)
  print(final_summary)

  filename = label_mapping[cluster].replace(".", "_") + "_summary.txt"
  filepath = "./drive/MyDrive/" + filename
  with open(filepath, "w", encoding="utf-8") as f:
    f.write(final_summary)