# Introduction

In [1]:
# Warnings
import warnings
warnings.filterwarnings('ignore')

# BEGIN: fix Python or Notebook SSL CERTIFICATE_VERIFY_FAILED
import os, ssl
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context
# END: fix Python or Notebook SSL CERTIFICATE_VERIFY_FAILED

## Installing pre-requsite libraries
* https://pypi.org/project/bert-extractive-summarizer/

In [2]:
!pip -q install sumy transformers sentencepiece

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --upgrade pip' command.[0m


### Import libraries

In [3]:
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer

In [4]:
content = [
    "Text_Summarize_Text/Technology_and_Engineering.txt", # 0 Introduction
    "Text_Summarize_Text/Enough_With_the_Trolley_Problem.txt",  # 1 Introduction to Ethics
    "Text_Summarize_Text/Big_Data_Ethics_and_Religion_New_Questions_from_a_New_Science.txt",
    "Text_Summarize_Text/Why_Christians_Should_Study_Computer_Science.txt",
    "Text_Summarize_Text/How_Algorithms_Can_Learn_to_Discredit.txt", # 2 Disinformation
    "Text_Summarize_Text/A_Framework_for_Understanding_Sources of_Harm_throughout_the_Machine_Learning_LifeCycle.txt", # 3 Bias/Fairness
    "Text_Summarize_Text/Discrimination_in_the_Age_of_Algorithms.txt", #
    "Text_Summarize_Text/On_the_Legal_Compatibility_of_Fairness_Definitions.txt", # 4
    "Text_Summarize_Text/Your_apps_know_where_you_were_last_night.txt",  # 5 Privacy
    "Text_Summarize_Text/Trading_privacy_for_survival_is_another_tax_on_the_poor.txt",
    "Text_Summarize_Text/Caught_in_the_Spotlight.txt",
    "Text_Summarize_Text/The_fundamental_problem_with_Silicon_Valleys_favorite_growth_strategy.txt", # 6 Technological Colonialism
    "Text_Summarize_Text/Computing_in_the_Image_of_God.txt", # 7 Wrapping up
    "Text_Summarize_Text/Huawei_Technicians_Helped_African_Governments_Spy_on_Political_Opponents.txt" # 8
]

output_sentences_count = 10

with open(content[6], "r", encoding="utf-8") as f: # open(r'C:\Users\...site_1.html', "r") as f:
    article = f.read()  
    
# article

In [5]:
my_parser = PlaintextParser.from_string(article, Tokenizer('english'))

# Creating a summary of 3 sentences.
lex_rank_summarizer = LexRankSummarizer()
lexrank_summary = lex_rank_summarizer(my_parser.document, sentences_count = output_sentences_count)

# Printing the summary
for sentence in lexrank_summary:
  print(sentence)

## LSA (Latent semantic analysis)

In [6]:
from sumy.summarizers.lsa import LsaSummarizer

# creating the summarizer
lsa_summarizer = LsaSummarizer()
lsa_summary = lsa_summarizer(my_parser.document, sentences_count = output_sentences_count)

# Printing the summary
for sentence in lsa_summary:
    print(sentence)

## Luhn Summarization algorithm’s approach is based on TF-IDF (Term Frequency-Inverse Document Frequency). 

In [7]:
from sumy.summarizers.luhn import LuhnSummarizer

#  Creating the summarizer
luhn_summarizer = LuhnSummarizer()
luhn_summary = luhn_summarizer(my_parser.document, sentences_count = output_sentences_count)

# Printing the summary
for sentence in luhn_summary:
  print(sentence)

## extractive method is the KL-Sum algorithm

In [8]:
from sumy.summarizers.kl import KLSummarizer
kl_summarizer = KLSummarizer()
kl_summary = kl_summarizer(my_parser.document, sentences_count = output_sentences_count)

# Printing the summary
for sentence in kl_summary:
    print(sentence)

## Summarization with T5 Transformers

In [9]:
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

my_model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

input_ids = tokenizer.encode(article, return_tensors='pt', max_length=750, truncation=False)
summary_ids = my_model.generate(input_ids)

t5_summary = tokenizer.decode(summary_ids[0])
print(t5_summary)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<pad> </s>


# GPT-2 Transformers

In [10]:
# Importing model and tokenizer
from transformers import GPT2Tokenizer,GPT2LMHeadModel

# Instantiating the model and tokenizer with gpt-2
tokenizer=GPT2Tokenizer.from_pretrained('gpt2')
model=GPT2LMHeadModel.from_pretrained('gpt2')

# Encoding text to get input ids & pass them to model.generate()
inputs=tokenizer.batch_encode_plus([article], return_tensors='pt', max_length=750, truncation=False)
summary_ids=model.generate(inputs['input_ids'], early_stopping=True)

GPT_summary=tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(GPT_summary)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


IndexError: list index out of range

# BERT Summarization

In [None]:
!pip install bert-extractive-summarizer

In [None]:
from summarizer import Summarizer
model = Summarizer()
result = model(article, min_length=output_sentences_count)
summary = "".join(result)
print(summary)

## Huggingface sentence transformers

https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2

In [None]:
pip -q install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
embeddings = model.encode(article)
# embeddings.unique_consecutive

In [None]:
# embeddings

# XLM Transformers

In [None]:
# Importing model and tokenizer
from transformers import XLMWithLMHeadModel, XLMTokenizer

# Instantiating the model and tokenizer 
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')

# Encoding text to get input ids & pass them to model.generate()
inputs = tokenizer.batch_encode_plus([article], return_tensors='pt')
summary_ids = model.generate(inputs, early_stopping=False)

# Decode and print the summary
XLM_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(XLM_summary)