In [1]:
# Mount Google Drive at /content/drive so the CSV files referenced later in
# this notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
    
import torch
from torch.utils import data

import math
from tqdm import tqdm
import time

import pandas as pd

In [3]:
# Select the compute device: CUDA when PyTorch reports it available, else CPU.
# device = torch.device("cpu")  # uncomment to force CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the chosen device

device(type='cpu')

In [4]:
# Reserved vocabulary indices for the special tokens; they match the initial
# entries of Vocab.word2index / Vocab.index2word.
# NOTE(review): presumably start-of-sequence, end-of-sequence, padding and
# unknown-word markers -- confirm against the model code.
SOS_token = 0
EOS_token = 1
PAD_token = 2
UNK_TOKEN = 3
# Default values for prepare_data's context_max_length / title_max_length.
# NOTE(review): the unit (words vs. characters) is not established here -- confirm.
CONTENT_MAX_LENGTH = 100
TITLE_MAX_LENGTH = 8

class Vocab:
    """Bidirectional word/index mapping with per-word occurrence counts.

    Indices 0-3 are reserved for the special tokens ``SOS``, ``EOS``, ``PAD``
    and ``UNK``. Ordinary words receive consecutive indices starting at 4, in
    the order they are first seen.

    Attributes:
        name: Free-form label for this vocabulary.
        word2index: Maps a word (or special token) to its integer index.
        index2word: Inverse of ``word2index``.
        n_words: Next free index, i.e. the number of entries including the
            special tokens.
        word2count: Number of times each word was passed to ``add_word``.
    """

    def __init__(self, name):
        """Create a vocabulary that contains only the special tokens."""
        self.name = name
        self.word2index = {"SOS": 0, "EOS": 1, "PAD": 2, 'UNK': 3}
        self.index2word = {0: "SOS", 1: "EOS", 2: "PAD", 3: 'UNK'}
        # Derive the next free index from the reserved-token table. It used to
        # be hard-coded to 3, which handed UNK's index to the first real word
        # and overwrote UNK in index2word.
        self.n_words = len(self.word2index)
        self.word2count = {}

    def add_sentence(self, sentence):
        """Lower-case ``sentence``, split it on whitespace and add every word."""
        for word in sentence.lower().split():
            self.add_word(word)

    def add_word(self, word):
        """Register ``word`` if it is new and increment its occurrence count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # Reserved tokens are present in word2index but start without a count
        # entry, so use .get() rather than indexing to avoid a KeyError.
        self.word2count[word] = self.word2count.get(word, 0) + 1

    def to_json(self, file_path):
        """Write the vocabulary to ``file_path`` as a UTF-8 JSON document.

        Only ``word2index`` is stored for the mapping; ``index2word`` is its
        inverse and JSON object keys cannot be integers.
        """
        import json

        payload = {
            "name": self.name,
            "n_words": self.n_words,
            "word2index": self.word2index,
            "word2count": self.word2count,
        }
        with open(file_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False)

    def read_from_json(self, file_path):
        """Replace this vocabulary's state with the one stored by ``to_json``.

        Raises:
            OSError: If ``file_path`` cannot be opened.
            KeyError: If the document lacks one of the expected fields.
        """
        import json

        with open(file_path, encoding="utf-8") as handle:
            payload = json.load(handle)

        self.name = payload["name"]
        self.word2index = {
            word: int(index) for word, index in payload["word2index"].items()
        }
        # Rebuild the inverse mapping so its keys are ints again.
        self.index2word = {index: word for word, index in self.word2index.items()}
        self.word2count = {
            word: int(count) for word, count in payload["word2count"].items()
        }
        self.n_words = int(payload["n_words"])

In [5]:
# Load the train and test splits from the mounted Drive folder and display
# their (rows, columns) shapes as a quick sanity check.
train_df = pd.read_csv('/content/drive/MyDrive/MSU NLP/Final Project/train_split.csv')
test_df = pd.read_csv('/content/drive/MyDrive/MSU NLP/Final Project/test_split.csv')
train_df.shape, test_df.shape

((1977, 4), (349, 4))

In [6]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,file_path,class,title,content
0,/media/kuldeep/Work/college_stuff/courses/cse_...,entertainment,Elton plays Paris charity concert,Sir Elton John has performed at a special conc...
1,/media/kuldeep/Work/college_stuff/courses/cse_...,politics,Defiant hunts put ban to the test,Thousands of hunt supporters have been out on ...
2,/media/kuldeep/Work/college_stuff/courses/cse_...,sport,Injury doubts beset Wales squad,Wales have a clutch of injury worries before W...
3,/media/kuldeep/Work/college_stuff/courses/cse_...,business,Bombardier chief to leave company,Shares in train and plane-making giant Bombard...
4,/media/kuldeep/Work/college_stuff/courses/cse_...,entertainment,EastEnders 'is set for US remake',Plans to create a US soap based on the BBC's E...


In [7]:
def normalize_string(s):
    """Return a lower-cased, punctuation-separated, letters-only form of ``s``.

    The input is lower-cased and stripped of surrounding whitespace. A space is
    then inserted before each ``.``, ``!`` or ``?``. Finally every run of
    characters other than ASCII letters and those three marks is collapsed to
    a single space.
    """
    lowered = s.lower().strip()
    punctuation_spaced = re.sub(r"([.!?])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", punctuation_spaced)


def prepare_data(
    file_path, context_max_length=CONTENT_MAX_LENGTH,
    title_max_length=TITLE_MAX_LENGTH
):
    """Read a CSV of articles and return normalised ``[title, content]`` pairs.

    Args:
        file_path: Path of a CSV file with ``title`` and ``content`` columns.
        context_max_length: Accepted but not used by this function.
        title_max_length: Accepted but not used by this function.

    Returns:
        A list with one ``[title, content]`` entry per CSV row, each string
        passed through ``normalize_string``.

    NOTE(review): neither length limit is applied here; no truncation or
    filtering happens in this function.
    """
    frame = pd.read_csv(file_path)
    raw_pairs = [
        [record['title'], record['content']] for _, record in frame.iterrows()
    ]
    # Report the row count before normalising, as a progress indicator.
    print("{} titles and content read.".format(len(raw_pairs)))
    return [
        [normalize_string(title), normalize_string(content)]
        for title, content in raw_pairs
    ]

def populate_vocab(vocab, pairs):
    """Feed the content half of every ``(title, content)`` pair to ``vocab``.

    Only the content text is added; titles are unpacked but not passed to the
    vocabulary. Returns ``None``.
    """
    for pair in pairs:
        _title, content = pair
        vocab.add_sentence(content)

In [8]:
# Build normalised [title, content] pairs from the training CSV (the same file
# loaded into train_df earlier, read again here) and print one random pair as
# a spot check.
train_pairs = prepare_data('/content/drive/MyDrive/MSU NLP/Final Project/train_split.csv')
print(random.choice(train_pairs))

1977 titles and content read.
['eu software patent law delayed', 'controversial new eu rules for the patenting of computer based inventions have been put on hold due to a last minute intervention from poland . poland a large and therefore crucial eu member has requested more time to consider the issue especially as it relates to the patenting of software . critics say the law would favour large companies over small innovative ones . they say it could have massive ramifications for developments such as open source software . polish ministers want to see the phrasing of the text of the directive on the patentability of computer implemented inventions changed so that it excludes the patenting of software . the planned law has ignited angry debate about whether the eu should allow the patenting of computer programs and internet business methods as currently happens in the us . so for instance us based amazon .com holds a patent on one click shopping . critics claim the law which the eu say

\<Extractive Summarization\>

Gensim

In [102]:
# th1: sentence count requested from each sumy summarizer; it also scales the
#      word_count passed to gensim's summarize.
# th2: number of top-scoring sentences kept by short_summary.
th1, th2 = 5, 3

In [103]:
# Inspect the first normalised [title, content] pair.
train_pairs[0]

['elton plays paris charity concert',
 'sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami . the british singer played to a strong audience on sunday at the french capital s bastille opera house . the concert was also part of an attempt to bring a broader range of events to the famous venue . money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka . sir elton played hits from his vast back catalogue to a sell out crowd which included former french president valery giscard d estaing and his wife anne aymone . the veteran pop star played piano accompaniment throughout the concert which lasted for three hours without an interval . he told the crowd throughout the years i ve done a lot of drugs and alcohol . it s true that i was a nightmare impossible . for the last years i ve been normal . now my drug is called david a reference to david furnish his par

In [104]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
import en_core_web_sm

# Summarise the first training article with gensim, asking for th1*50 words.
# NOTE(review): gensim's documentation says `ratio` is ignored when
# `word_count` is also supplied -- confirm for the installed version.
stitle, sbody = train_pairs[0]

summ = summarize(sbody, ratio=0.2, word_count=th1*50)
print(summ)

sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami .
the british singer played to a strong audience on sunday at the french capital s bastille opera house .
the concert was also part of an attempt to bring a broader range of events to the famous venue .
money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka .
sir elton played hits from his vast back catalogue to a sell out crowd which included former french president valery giscard d estaing and his wife anne aymone .
the veteran pop star played piano accompaniment throughout the concert which lasted for three hours without an interval .
he told the crowd throughout the years i ve done a lot of drugs and alcohol .
it s true that i was a nightmare impossible .
for the last years i ve been normal .
now my drug is called david a reference to david furnish his partner .
the crowd who greeted each song w

LexRank

In [105]:
# Install sumy, which provides the LexRank, Luhn, LSA and TextRank summarizers
# imported in the following cells.
!pip install sumy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [99]:
# Install NLTK and download its 'punkt' data package.
# NOTE(review): presumably needed by sumy's Tokenizer('english') -- confirm.
!pip install nltk
import nltk
nltk.download('punkt')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [106]:
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.parsers.plaintext import PlaintextParser as pp
from sumy.nlp.tokenizers import Tokenizer

# Summarise the first training article with LexRank, requesting th1 sentences.
stitle, sbody = train_pairs[0]
# sbody is rebound here from the raw text to the parser object built from it.
sbody = pp.from_string(sbody, Tokenizer('english'))
lrs = LexRankSummarizer()
summ = lrs(sbody.document, th1)
for sentence in summ:
    print(sentence)

sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami .
the concert was also part of an attempt to bring a broader range of events to the famous venue .
money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka .
the veteran pop star played piano accompaniment throughout the concert which lasted for three hours without an interval .
the crowd who greeted each song with a standing ovation also included french singer charles aznavour and british ambassador sir john holmes .


Luhn

In [107]:
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser as pp
from sumy.nlp.tokenizers import Tokenizer

# Summarise the first training article with the Luhn summarizer, requesting
# th1 sentences.
stitle, sbody = train_pairs[0]
# sbody is rebound here from the raw text to the parser object built from it.
sbody = pp.from_string(sbody, Tokenizer('english'))
luhn = LuhnSummarizer()
summ = luhn(sbody.document, th1)
# Loop body uses four-space indentation, matching the sibling summarizer cells.
for sentence in summ:
    print(sentence)

sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami .
the concert was also part of an attempt to bring a broader range of events to the famous venue .
he told the crowd throughout the years i ve done a lot of drugs and alcohol .
the crowd who greeted each song with a standing ovation also included french singer charles aznavour and british ambassador sir john holmes .
sir elton has also teamed up with phil collins to record a version of eric clapton s hit tears in heaven to raise money for the relief fund .


Latent Semantic Analysis (LSA)

In [108]:
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser as pp
from sumy.nlp.tokenizers import Tokenizer

# Summarise the first training article with the LSA summarizer, requesting
# th1 sentences.
stitle, sbody = train_pairs[0]
# sbody is rebound here from the raw text to the parser object built from it.
sbody = pp.from_string(sbody, Tokenizer('english'))
lsa = LsaSummarizer()
summ = lsa(sbody.document, th1)
for sentence in summ:
    print(sentence)

money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka .
sir elton played hits from his vast back catalogue to a sell out crowd which included former french president valery giscard d estaing and his wife anne aymone .
the crowd who greeted each song with a standing ovation also included french singer charles aznavour and british ambassador sir john holmes .
sir elton has also teamed up with phil collins to record a version of eric clapton s hit tears in heaven to raise money for the relief fund .
a release date has yet to be set for the recording which was organised by sharon osbourne .


TextRank

In [109]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser as pp
from sumy.nlp.tokenizers import Tokenizer

# Summarise the first training article with TextRank, requesting th1 sentences.
stitle, sbody = train_pairs[0]
# sbody is rebound here from the raw text to the parser object built from it.
sbody = pp.from_string(sbody, Tokenizer('english'))
textrank = TextRankSummarizer()
summ = textrank(sbody.document, th1)
for sentence in summ:
    print(sentence)

sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami .
the british singer played to a strong audience on sunday at the french capital s bastille opera house .
the concert was also part of an attempt to bring a broader range of events to the famous venue .
money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka .
sir elton has also teamed up with phil collins to record a version of eric clapton s hit tears in heaven to raise money for the relief fund .


All together

In [133]:
from gensim.summarization.summarizer import summarize
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser as pp
from sumy.nlp.tokenizers import Tokenizer


def short_summary(stitle, sbody, th1, th2, list_output=False):
    """Combine five extractive summarizers into one short summary.

    The article is summarised by gensim's ``summarize`` and by sumy's LexRank,
    Luhn, LSA and TextRank summarizers. Within each summary, the sentence at
    position ``i`` gets weight ``len(summary) - i``. A sentence's score is the
    product of its weights over all summaries that contain it. The ``th2``
    highest-scoring sentences are returned.

    Args:
        stitle: Article title. Not used by this function.
        sbody: Article text to summarise.
        th1: Sentence count requested from each sumy summarizer; gensim is
            asked for ``th1 * 25`` words.
        th2: Number of top-scoring sentences to return.
        list_output: If true, return a list of sentences; otherwise return
            them joined by single spaces.

    Returns:
        A list of at most ``th2`` sentences, or one space-joined string.
    """
    # NOTE(review): gensim's documentation says `ratio` is ignored when
    # `word_count` is also supplied -- confirm for the installed version.
    summGS = summarize(sbody, ratio=0.2, word_count=th1*25)
    # gensim joins the selected sentences with newlines. Splitting on '.' (the
    # previous approach) left a leading "\n" on every sentence after the first,
    # so those never matched the same sentence from the sumy summarizers. It
    # also cut sentences at inner dots such as "amazon .com".
    gensim_sentences = [
        line.strip() for line in summGS.splitlines() if line.strip()
    ]

    document = pp.from_string(sbody, Tokenizer('english')).document
    summaries = [gensim_sentences]
    # Same order as before: LexRank, Luhn, LSA, TextRank.
    for summarizer_cls in (
        LexRankSummarizer, LuhnSummarizer, LsaSummarizer, TextRankSummarizer
    ):
        picked = summarizer_cls()(document, th1)
        summaries.append([str(sentence) for sentence in picked])

    scores = dict()
    for summ in summaries:
        for i, sent in enumerate(summ):
            # First sighting sets the score to the weight; later sightings
            # multiply the running score by the new weight.
            scores[sent] = scores.get(sent, 1) * (len(summ) - i)

    # Stable sort: ties keep first-seen order.
    top_sentences = sorted(scores, key=scores.get, reverse=True)[:th2]

    if list_output:
        return top_sentences
    return ' '.join(top_sentences)

# Try short_summary on the first training article and display the resulting
# list of at most th2 sentences.
stitle, sbody = train_pairs[0]
short_summary(stitle, sbody, th1, th2, list_output=True)

['sir elton john has performed at a special concert in paris to raise money for the victims of the asian tsunami .',
 'the concert was also part of an attempt to bring a broader range of events to the famous venue .',
 'money raised will go to the fondation pour l enfance foundation for childhood which aims to rebuild a children s shelter in sri lanka .']

\</Extractive Summarization\>