In [63]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Text Summarization Using Naive Average

In [4]:
df = pd.read_csv("bbc_text_cls.csv")

In [5]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
doc = df[df.labels=="tech"]["text"].sample(random_state=1234)

In [7]:
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [8]:
print(wrap(doc.iloc[0]))

China 'to overtake US net use'

The Chinese net-using population looks
set to exceed that of the US in less than three years, says a report.
China's net users number 100m but this represents less than 8% of the
country's 1.3 billion people.  Market analysts Panlogic predicts that
net users in China will exceed the 137 million US users of the net by
2008. The report says that the country's culture will mean that
Chinese people will use the net for very different ends than in many
other nations.

Already net use in China has a very different
character than in many Western nations, said William Makower, chief
executive of Panlogic.  In many Western nations desktop computers that
can access the net are hard to escape at work.  By contrast in China
workplace machines are relatively rare.  This, combined with the
relatively high cost of PCs in China and the time it takes to get
phone lines installed, helps to explains the huge number of net cafes
in China.  Only 36% of Chinese homes have tel

In [9]:
sentences = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [11]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")

In [14]:
X = vectorizer.fit_transform(sentences)
X

<25x175 sparse matrix of type '<class 'numpy.float64'>'
	with 274 stored elements in Compressed Sparse Row format>

In [21]:
scores = np.zeros(len(sentences))
for i in range(len(sentences)):
    row = X[i, :]
    scores[i] = row[row != 0].mean()

In [22]:
scores

array([0.07692308, 0.1       , 0.08333333, 0.07692308, 0.06666667,
       0.1       , 0.16666667, 0.05555556, 0.16666667, 0.06666667,
       0.25      , 0.06666667, 0.09090909, 0.1       , 0.14285714,
       0.07692308, 0.1       , 0.05882353, 0.06666667, 0.14285714,
       0.2       , 0.11111111, 0.05555556, 0.16666667, 0.11111111])

In [23]:
sort_idx = np.argsort(-scores)

In [25]:
print("Generated Summary: ")
for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sentences[i])))

Generated Summary: 
0.25: "Its fundamentally different usage to what we have here," he
said.
0.20: "Generally it's more difficult for the government to be able to
control it."
0.17: By contrast in China workplace machines are relatively rare.
0.17: Only 36% of Chinese homes have telephones according to reports.
0.17: Familiarity with the net also has a certain social cachet.


In [26]:
doc.iloc[0].split("\n", 1)[0]

"China 'to overtake US net use'"

In [27]:
def summarize(text):
    sents = nltk.sent_tokenize(text)
    X = vectorizer.fit_transform(sents)
    scores = np.zeros(len(sents))
    for i in range(len(sents)):
        row = X[i, :]
        scores[i] = row[row != 0].mean()
    sort_idx = np.argsort(-scores)
    print("Generated Summary: ")
    for i in sort_idx[:5]:
        print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [30]:
doc = df[df.labels == "entertainment"]["text"].sample(random_state=1234)
print(doc.iloc[0].split("\n", 1)[0])
summarize(doc.iloc[0].split("\n", 1)[1])

U2 to play at Grammy awards show
Generated Summary: 
0.33: It will be held at the Staples Center.
0.10: This year the Grammys have been dominated by rap star Kanye
West, who is in contention for 10 awards.
0.10: US comedian Ellen Degeneres and singer Christine Milian will
present awards at the event.
0.09: U2 are nominated twice for their recent single Vertigo,
including a nomination for best rock song.
0.08: Last week Grammy producers announced the show will be hosted by
rap star and Chicago actress Queen Latifah.


# Text Summarization Using TextRank

In [31]:
df = pd.read_csv("bbc_text_cls.csv")

In [32]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [33]:
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [34]:
print(wrap(doc.iloc[0]))

U2 to play at Grammy awards show

Irish rock band U2 are to play live
at the Grammy Awards presentation in the US next month, organisers
have said.

Other acts to play include soul singer Alicia Keys,
country singer Tim McGraw and punk band Green Day at the event on 13
February in Los Angeles.  U2 are nominated twice for their recent
single Vertigo, including a nomination for best rock song.  This year
the Grammys have been dominated by rap star Kanye West, who is in
contention for 10 awards.  US comedian Ellen Degeneres and singer
Christine Milian will present awards at the event.  Last week Grammy
producers announced the show will be hosted by rap star and Chicago
actress Queen Latifah.  It will be held at the Staples Center.  U2 had
number one success in the album charts on both sides of the Atlantic
in November when their latest studio album, How to Dismantle an Atomic
Bomb, topped the US and UK charts.  The band, who are also dominated
for best international album at this year's B

In [35]:
sentences = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

In [36]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")

In [46]:
X = vectorizer.fit_transform(sentences)
X

<9x93 sparse matrix of type '<class 'numpy.float64'>'
	with 113 stored elements in Compressed Sparse Row format>

In [66]:
S = cosine_similarity(X)

In [68]:
S.shape

(9, 9)

In [69]:
S /= S.sum(axis=1, keepdims=True)

In [72]:
S[0].sum()

1.0000000000000002

In [73]:
U = np.ones_like(S) / len(S)

In [74]:
U

array([[0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],
       [0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111],


In [75]:
factor = 0.15
A = U * factor + S * (1 - factor)

In [76]:
eigvals, eigvecs = np.linalg.eig(A.T)

In [77]:
eigvals

array([1.        , 0.42269926, 0.48822834, 0.52764223, 0.85      ,
       0.63054941, 0.67533336, 0.75015993, 0.72869529])

In [78]:
eigvecs[:, 0]

array([0.36815542, 0.31882339, 0.31228841, 0.34754234, 0.33736123,
       0.31007514, 0.33276299, 0.31397953, 0.35387845])

In [79]:
eigvecs[:, 0].dot(A)

array([0.36815542, 0.31882339, 0.31228841, 0.34754234, 0.33736123,
       0.31007514, 0.33276299, 0.31397953, 0.35387845])

In [85]:
norm_eig = eigvecs[:, 0] / eigvecs[:, 0].sum() 
norm_eig

array([0.12292881, 0.10645662, 0.10427455, 0.11604601, 0.11264648,
       0.10353553, 0.11111111, 0.10483923, 0.11816166])

In [81]:
limiting_dist = np.ones(len(S)) / len(S)
threshold = 1e-8
delta = float("inf")
iters = 0
while delta > threshold:
    iters += 1
    p = limiting_dist.dot(A)
    delta = np.abs(limiting_dist - p).sum()
    limiting_dist = p
print(f"Number of iterations: {iters}")

Number of iterations: 39


In [82]:
limiting_dist

array([0.12292881, 0.10645661, 0.10427456, 0.11604601, 0.11264648,
       0.10353553, 0.11111111, 0.10483923, 0.11816166])

In [83]:
limiting_dist.sum()

1.0000000000000036

In [86]:
np.abs(norm_eig - limiting_dist).sum()

2.568320563167692e-08

In [87]:
scores = limiting_dist

In [88]:
sort_idx = np.argsort(-scores)

In [89]:
print("Generated Summary: ")
for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sentences[i])))

Generated Summary: 
0.12: 
Irish rock band U2 are to play live at the Grammy Awards
presentation in the US next month, organisers have said.
0.12: The band, who are also dominated for best international album at
this year's Brit Awards, are to undertake a major world tour this
year, their first for four years.
0.12: This year the Grammys have been dominated by rap star Kanye
West, who is in contention for 10 awards.
0.11: US comedian Ellen Degeneres and singer Christine Milian will
present awards at the event.
0.11: It will be held at the Staples Center.


In [90]:
doc.iloc[0].split("\n", 1)[0]

'U2 to play at Grammy awards show'

In [106]:
def summarize(text, factor=0.15):
    sentences = nltk.sent_tokenize(text)
    vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")
    X = vectorizer.fit_transform(sentences)
    S = cosine_similarity(X)
    S /= S.sum(axis=1, keepdims=True)
    S = np.ones_like(S) / len(S) * factor + (1 - factor) * S
    eigvals, eigvecs = np.linalg.eig(S)
    idx = 0
    for i in range(len(eigvals)):
        if eigvals[i] == 1:
            idx = i
            break
    norm_eig = eigvecs[:, idx] / eigvecs[:, idx].sum()
    scores = norm_eig
    sort_idx = np.argsort(norm_eig)
    print("Generated Summary: ")
    for j in sort_idx[:5]:
        print(wrap("%.2f: %s" % (scores[j], sentences[j])))

In [110]:
doc = df[df.labels == "tech"]["text"].sample(random_state=765)
print(doc.iloc[0].split("\n", 1)[0])
summarize(doc.iloc[0].split("\n", 1)[1])

'Evil twin' fear for wireless net
Generated Summary: 
0.04: In the vast majority of cases, base stations straight out of the
box from the manufacturers are automatically set up with the least
secure mode possible, said Dr Nobles.
0.04: Cybercriminals who try to glean personal information using the
scam, jam connections to a legitimate base station by sending a
stronger signal near to the wireless client.
0.04: "Cybercriminals don't have to be that clever to carry out such
an attack," said Dr Phil Nobles, a wireless net and cybercrime expert
at Cranfield.
0.04: Dr Nobles is due to speak about wireless cybercrime at the
Science Museum's Dana Centre in London on Thursday.
0.04: A wireless network that is not protected can provide a backdoor
into a company's computer system.


# Text Summarization Using Libraries

In [111]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
Collecting docopt<0.7,>=0.6.1
  Using cached docopt-0.6.2-py2.py3-none-any.whl
Collecting pycountry>=18.2.23
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting breadability>=0.1.20
  Downloading breadability-0.1.20.tar.gz (32 kB)
Building wheels for collected packages: breadability, pycountry
  Building wheel for breadability (setup.py): started
  Building wheel for breadability (setup.py): finished with status 'done'
  Created wheel for breadability: filename=breadability-0.1.20-py2.py3-none-any.whl size=21712 sha256=c0fada659e66b7cc37595f70e82a9f16eaa27350ec651e282db11d19ff8063fb
  Stored in director

In [112]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

In [113]:
summarizer = TextRankSummarizer()
parser = PlaintextParser.from_string(doc.iloc[0].split("\n", 1)[1], Tokenizer("english"))
summary = summarizer(parser.document, sentences_count=5)

In [114]:
summary

(<Sentence: "Users need to be wary of using their wi-fi enabled laptops or other portable devices in order to conduct financial transactions or anything that is of a sensitive or personal nature," said Professor Brian Collins, head of information systems at Cranfield University.>,
 <Sentence: BT Openzone, which operates a vast proportion of public hotspots in the UK, told the BBC News website that it made every effort to make its wi-fi secure.>,
 <Sentence: He said BT Openzone, as well as others, have sophisticated encryption from the start of the login process to the service at a hotspot.>,
 <Sentence: In the vast majority of cases, base stations straight out of the box from the manufacturers are automatically set up with the least secure mode possible, said Dr Nobles.>,
 <Sentence: Cybercriminals who try to glean personal information using the scam, jam connections to a legitimate base station by sending a stronger signal near to the wireless client.>)

In [115]:
for s in summary:
    print(wrap(str(s)))

"Users need to be wary of using their wi-fi enabled laptops or other
portable devices in order to conduct financial transactions or
anything that is of a sensitive or personal nature," said Professor
Brian Collins, head of information systems at Cranfield University.
BT Openzone, which operates a vast proportion of public hotspots in
the UK, told the BBC News website that it made every effort to make
its wi-fi secure.
He said BT Openzone, as well as others, have sophisticated encryption
from the start of the login process to the service at a hotspot.
In the vast majority of cases, base stations straight out of the box
from the manufacturers are automatically set up with the least secure
mode possible, said Dr Nobles.
Cybercriminals who try to glean personal information using the scam,
jam connections to a legitimate base station by sending a stronger
signal near to the wireless client.


In [116]:
summarizer = LsaSummarizer()
summary = summarizer(parser.document, sentences_count=5)
for s in summary:
    print(wrap(str(s)))

The latest threat, nicknamed evil twins, pose as real hotspots but are
actually unauthorised base stations, say Cranfield University experts.
"Users can also protect themselves by ensuring that their wi-fi device
has its security measures activated," he added.
"Because wireless networks are based on radio signals they can be
easily detected by unauthorised users tuning into the same frequency."
Some companies have been reluctant to use them in large numbers
because of fears about security.
Dr Nobles is due to speak about wireless cybercrime at the Science
Museum's Dana Centre in London on Thursday.


In [122]:
!pip install gensim

