In [1]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Fetch the NLTK resources this notebook relies on (a no-op when they are
# already cached locally). "punkt" backs nltk.sent_tokenize and "stopwords"
# backs stopwords.words("english").
# NOTE(review): "averaged_perceptron_tagger" is not used by any cell visible
# here; presumably it is needed further down. Confirm or drop.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\developer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Text Summarization Using Naive Average

In [4]:
# Load the BBC news dataset. Each row holds the article in `text`
# (headline, blank line, body) and its category in `labels`.
# NOTE(review): the relative path assumes the CSV sits in the working directory.
df = pd.read_csv("bbc_text_cls.csv")

In [5]:
# Preview the first rows to confirm the column layout.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [6]:
# Draw one reproducible "tech" article (sample() returns a single row by default).
tech_mask = df["labels"] == "tech"
doc = df.loc[tech_mask, "text"].sample(random_state=1234)

In [7]:
def wrap(x):
    """Re-flow ``x`` to textwrap's default line width.

    Whitespace characters already in ``x`` (including newlines) are kept as-is
    rather than being replaced by spaces, and sentence endings are normalised
    so that they are followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(x)

In [8]:
# Show the sampled article (headline and body) wrapped for readability.
print(wrap(doc.iloc[0]))

China 'to overtake US net use'

The Chinese net-using population looks
set to exceed that of the US in less than three years, says a report.
China's net users number 100m but this represents less than 8% of the
country's 1.3 billion people.  Market analysts Panlogic predicts that
net users in China will exceed the 137 million US users of the net by
2008. The report says that the country's culture will mean that
Chinese people will use the net for very different ends than in many
other nations.

Already net use in China has a very different
character than in many Western nations, said William Makower, chief
executive of Panlogic.  In many Western nations desktop computers that
can access the net are hard to escape at work.  By contrast in China
workplace machines are relatively rare.  This, combined with the
relatively high cost of PCs in China and the time it takes to get
phone lines installed, helps to explains the huge number of net cafes
in China.  Only 36% of Chinese homes have tel

In [9]:
# Drop the headline (everything up to the first newline) and split the
# remaining article body into sentences.
article_body = doc.iloc[0].split("\n", 1)[1]
sentences = nltk.sent_tokenize(article_body)

In [11]:
# TF-IDF featurizer that ignores English stop words.
# NOTE(review): with norm="l1" every sentence vector sums to 1, so the mean of
# its non-zero weights is exactly 1 / (number of distinct vocabulary terms in
# the sentence). A score built from that mean therefore rewards short
# sentences rather than informative ones. Consider norm=None or norm="l2",
# and confirm the intended behaviour before changing it.
vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"), norm="l1")

In [14]:
# Fit the vocabulary on this article's sentences and build the TF-IDF matrix
# (one row per sentence, one column per vocabulary term), then display it.
X = vectorizer.fit_transform(sentences)
X

<25x175 sparse matrix of type '<class 'numpy.float64'>'
	with 274 stored elements in Compressed Sparse Row format>

In [21]:
# Score every sentence as the mean of the non-zero TF-IDF weights in its row.
sentence_rows = (X[idx, :] for idx in range(len(sentences)))
scores = np.array([vec[vec != 0].mean() for vec in sentence_rows], dtype=float)

In [22]:
# Inspect the per-sentence scores (one entry per sentence, in document order).
scores

array([0.07692308, 0.1       , 0.08333333, 0.07692308, 0.06666667,
       0.1       , 0.16666667, 0.05555556, 0.16666667, 0.06666667,
       0.25      , 0.06666667, 0.09090909, 0.1       , 0.14285714,
       0.07692308, 0.1       , 0.05882353, 0.06666667, 0.14285714,
       0.2       , 0.11111111, 0.05555556, 0.16666667, 0.11111111])

In [23]:
# Sentence indices ordered from highest to lowest score (argsort is ascending,
# so sort the negated scores).
sort_idx = np.argsort(np.negative(scores))

In [25]:
# Print the five best-scoring sentences, each prefixed with its score.
print("Generated Summary: ")
for top_pos in sort_idx[:5]:
    summary_line = "%.2f: %s" % (scores[top_pos], sentences[top_pos])
    print(wrap(summary_line))

Generated Summary: 
0.25: "Its fundamentally different usage to what we have here," he
said.
0.20: "Generally it's more difficult for the government to be able to
control it."
0.17: By contrast in China workplace machines are relatively rare.
0.17: Only 36% of Chinese homes have telephones according to reports.
0.17: Familiarity with the net also has a certain social cachet.


In [26]:
# The headline is everything before the first newline of the article.
doc.iloc[0].partition("\n")[0]

"China 'to overtake US net use'"

In [27]:
def summarize(text, n_sentences=5):
    """Print the highest-scoring sentences of ``text`` as an extractive summary.

    The text is split into sentences with ``nltk.sent_tokenize``, the
    module-level ``vectorizer`` is re-fitted on those sentences, and each
    sentence is scored by the mean of its non-zero TF-IDF weights. The
    best-scoring sentences are then printed (wrapped), each prefixed with its
    score.

    Parameters
    ----------
    text : str
        Article body to summarize.
    n_sentences : int or None, optional
        Maximum number of sentences to print. Defaults to 5, the value that
        used to be hard-coded; ``None`` prints every sentence.

    Returns
    -------
    None
        The summary is printed rather than returned.

    Notes
    -----
    Calling this re-fits the shared ``vectorizer``, so afterwards its
    vocabulary reflects the most recently summarized text.
    """
    sents = nltk.sent_tokenize(text)
    if not sents:
        # Nothing to rank; fitting the vectorizer on an empty corpus would
        # raise, so emit just the header.
        print("Generated Summary: ")
        return

    X = vectorizer.fit_transform(sents)
    scores = np.zeros(len(sents))
    for i in range(len(sents)):
        row = X[i, :]
        weights = row[row != 0]
        # A sentence with no vocabulary terms (e.g. only stop words) has no
        # weights. Leave its score at 0 rather than taking the mean of an
        # empty selection, which yields NaN and a runtime warning.
        if weights.size:
            scores[i] = weights.mean()

    # argsort is ascending, so negate the scores to rank best-first.
    sort_idx = np.argsort(-scores)
    print("Generated Summary: ")
    for i in sort_idx[:n_sentences]:
        print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [30]:
# Repeat the experiment on one reproducible "entertainment" article:
# print its headline, then summarize the body that follows the first newline.
entertainment_mask = df["labels"] == "entertainment"
doc = df.loc[entertainment_mask, "text"].sample(random_state=1234)
article_parts = doc.iloc[0].split("\n", 1)
print(article_parts[0])
summarize(article_parts[1])

U2 to play at Grammy awards show
Generated Summary: 
0.33: It will be held at the Staples Center.
0.10: This year the Grammys have been dominated by rap star Kanye
West, who is in contention for 10 awards.
0.10: US comedian Ellen Degeneres and singer Christine Milian will
present awards at the event.
0.09: U2 are nominated twice for their recent single Vertigo,
including a nomination for best rock song.
0.08: Last week Grammy producers announced the show will be hosted by
rap star and Chicago actress Queen Latifah.


# Text Summarization Using TextRank