In [1]:
import numpy as np   
import pandas as pd  
import nltk          
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import re           
from tqdm import tqdm,trange 
import os    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Corpus location and the news category to load.
type_of_article = 'business'
root_path = 'BBC News Summary'
num_of_article = len(os.listdir(f"{root_path}/News Articles/{type_of_article}"))

# Collect one record per article and build the DataFrame once at the end.
# Assigning df.loc[i] inside the loop enlarges the frame on every iteration,
# which gets slower as the frame grows.
records = []
for i in tqdm(range(num_of_article)):
    # Files are addressed as 001.txt, 002.txt, ... in both folders.
    # NOTE(review): no encoding is passed to open(), so the platform default is
    # used; the "Â£" visible in the outputs suggests a decoding mismatch -
    # confirm the corpus encoding before pinning one here.
    file_name = f'{(i+1):03d}.txt'
    with open(f'{root_path}/News Articles/{type_of_article}/{file_name}', 'r') as f:
        article = f.read().splitlines()
    with open(f'{root_path}/Summaries/{type_of_article}/{file_name}', 'r') as f:
        summary = f.read().splitlines()
    # Only the first summary line is kept below; report files that have more.
    if len(summary) >= 2:
        print(i)
    # Line 0 is the title; the body starts at line 2 (line 1 is skipped).
    records.append({'title': article[0], 'article': article[2:], 'summary': summary[0]})

df = pd.DataFrame(records, columns=['title','article','summary'])

100%|██████████| 510/510 [00:02<00:00, 175.69it/s]


In [3]:
df.head()

Unnamed: 0,title,article,summary
0,Ad sales boost Time Warner profit,[Quarterly profits at US media giant TimeWarne...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech,[The dollar has hit its highest level against ...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim,[The owners of embattled Russian oil giant Yuk...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits,[British Airways has blamed high fuel prices f...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq,[Shares in UK drinks and food firm Allied Dome...,Pernod has reduced the debt it took on to fund...


In [4]:
def print_article(article):
    """Print every element of ``article`` on its own line.

    Args:
        article: iterable of printable items (here: the paragraph strings
            of one article).
    """
    for paragraph in article:
        print(paragraph)

print_article( df['article'][0])

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-spe

In [5]:
def sentence_tokenize(sent):
    """Split a string into sentences at full stops.

    A '.' ends a sentence unless the next character is a digit (so decimal
    numbers such as "$1.13bn" stay intact). No whitespace is required after
    the '.', which lets text like "...$10.9bn.For the full-year..." be split.
    Spaces at the start of a sentence are dropped; everything else is kept
    verbatim.

    Args:
        sent: the text to split.

    Returns:
        list of sentence strings in order of appearance. Empty fragments
        (e.g. produced by trailing spaces or whitespace-only input) are not
        emitted.
    """
    lst = []
    tmp_lst = []
    last = len(sent) - 1
    for i, ch in enumerate(sent):
        # Skip spaces until the sentence has its first real character.
        if tmp_lst or ch != ' ':
            tmp_lst.append(ch)
        # `i == last` is checked first so sent[i + 1] is never out of range.
        ends_sentence = i == last or (ch == '.' and not sent[i + 1].isdigit())
        if ends_sentence:
            # Only flush non-empty buffers; otherwise trailing spaces would
            # add a spurious '' sentence.
            if tmp_lst:
                lst.append(''.join(tmp_lst))
            tmp_lst = []
    return lst

In [6]:
def split_sent(article_list):
    """Flatten a list of text chunks into one list of sentences.

    Each element of ``article_list`` is passed through ``sentence_tokenize``
    and the resulting sentences are concatenated in order.
    """
    return [
        sentence
        for paragraph in article_list
        for sentence in sentence_tokenize(paragraph)
    ]

In [7]:
sentences = split_sent(df['article'][0])
sentences

['Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.',
 'TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.',
 'Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 'Time Warner said on Friday that it now owns 8% of search-engine Google.',
 'But its own internet business, AOL, had has mixed fortunes.',
 'It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.',
 "However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.",
 "It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's

In [8]:
print(len(sentences))

20


In [9]:
lower_sentences = [s.lower() for s in sentences]

In [10]:
print(sentences[0])
print(lower_sentences[0])

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.
quarterly profits at us media giant timewarner jumped 76% to $1.13bn (â£600m) for the three months to december, from $639m year-earlier.


In [11]:
nltk.download('stopwords')# one time execution
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def remove_stopwords(sen):
    """Drop stop words from a token sequence and re-join the rest.

    Args:
        sen: iterable of tokens (the caller passes ``sentence.split()``).

    Returns:
        The tokens not present in the module-level ``stop_words``, joined
        with single spaces.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [13]:
# remove stopwords from the sentences
clean_stopword_sentences = [remove_stopwords(r.split()) for r in lower_sentences]

In [14]:
print(lower_sentences[4])
print(clean_stopword_sentences[4])

time warner said on friday that it now owns 8% of search-engine google.
time warner said friday owns 8% search-engine google.


In [15]:
def clean_sentence(sentences):
    """Replace every non-letter character with a space.

    Args:
        sentences: sequence of strings.

    Returns:
        pandas Series of the same length in which each character outside
        a-z / A-Z (punctuation, digits, special characters) has been
        replaced by a single space. Runs of spaces are not collapsed.
    """
    as_series = pd.Series(sentences)
    return as_series.str.replace("[^a-zA-Z]", " ", regex=True)
clean_sentences = clean_sentence(clean_stopword_sentences)

In [16]:
print(clean_stopword_sentences[4])
print(clean_sentences[4])

time warner said friday owns 8% search-engine google.
time warner said friday owns    search engine google 


In [17]:
# Extract word vectors
def obtain_word_embeddings(path):
    """Load word vectors from a whitespace-separated text file.

    Each line is expected to hold a word followed by its coefficients.

    Args:
        path: path to the embeddings file (read as UTF-8).

    Returns:
        dict mapping each word to a float32 numpy array of its coefficients.
    """
    word_embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # First field is the word, the remainder are its coefficients.
            word_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return word_embeddings

In [18]:
dim = 100 # 50d, 100d, 200d, 300d
word_embeddings = obtain_word_embeddings('glove.6B.100d.txt')

In [19]:
print(len(word_embeddings['power']))
word_embeddings['power']

100


array([-4.0623e-02,  1.2911e-01,  9.2652e-01, -7.2253e-02,  4.3828e-01,
       -3.7762e-01, -2.7500e-01, -9.7944e-02, -1.7680e-01,  3.8279e-01,
        9.2663e-03,  9.0631e-03, -3.1502e-01,  6.2815e-02, -2.2111e-01,
       -9.9742e-01, -1.8360e-01,  3.9113e-01, -9.2952e-02, -1.2779e-01,
        4.8426e-01, -4.9320e-01,  5.0948e-01, -4.0813e-01,  6.3657e-01,
       -3.5722e-01, -2.9193e-01,  3.8334e-01,  5.3071e-01,  3.5986e-01,
        9.4441e-01,  5.1081e-01, -2.8931e-01, -1.8275e-01, -6.4469e-01,
        1.7839e-03,  2.9478e-01, -1.6024e-01, -3.6157e-01, -3.5547e-01,
       -1.7029e-01, -3.6866e-01,  2.1928e-01, -8.1945e-01, -9.6375e-02,
       -9.4109e-02,  3.1669e-01, -5.9285e-01,  5.9422e-01, -3.3568e-01,
       -5.5049e-01,  5.8094e-02, -2.0299e-02,  1.5526e+00,  1.0057e+00,
       -2.2807e+00,  9.0735e-02,  2.5548e-01,  1.9764e+00,  1.9240e-01,
        2.1717e-01, -5.1021e-01, -5.1359e-01,  5.1908e-01,  1.0555e+00,
        5.8991e-01,  3.1111e-01, -6.3756e-01,  8.6152e-02,  3.56

In [20]:
def get_sentence_vector(clean_sentences,word_embeddings,dim):
    """Build one vector per sentence by averaging its word vectors.

    Args:
        clean_sentences: iterable of sentence strings (split on whitespace).
        word_embeddings: dict mapping a word to its numpy vector.
        dim: vector length, used for the zero vectors.

    Returns:
        list with one numpy array per sentence. Words missing from
        ``word_embeddings`` contribute a zero vector but still count in the
        divisor; a sentence without any token yields a zero vector.
    """
    sentence_vectors = []
    for sentence in clean_sentences:
        tokens = sentence.split()
        if tokens:
            word_vectors = [word_embeddings.get(tok, np.zeros((dim,))) for tok in tokens]
            vec = sum(word_vectors) / len(tokens)
        else:
            vec = np.zeros((dim,))
        sentence_vectors.append(vec)
    return sentence_vectors

In [21]:
sentence_vectors = get_sentence_vector(clean_sentences,word_embeddings,dim)

In [22]:

sentence_vectors[0]

array([ 0.25752053,  0.23528284,  0.1550738 , -0.11292683,  0.04785102,
       -0.27084264,  0.18262467,  0.14807741, -0.33969784, -0.02814914,
        0.6900326 ,  0.012676  ,  0.12063119, -0.11588935, -0.02513509,
       -0.08353068, -0.23286067, -0.06393155, -0.11162333,  0.04924533,
        0.4963151 , -0.00559121,  0.06584881,  0.7407327 ,  0.17311713,
       -0.20972148,  0.08057515, -0.05163107,  0.05253493, -0.20233728,
        0.22026818,  0.23793788, -0.19043288, -0.1615126 , -0.02269659,
        0.25451896, -0.06301408,  0.20892058, -0.11234333,  0.11002687,
       -0.09511994, -0.294326  ,  0.02288165, -0.02382299,  0.09188754,
       -0.01889439, -0.03790221, -0.33707657,  0.05119812, -0.6637375 ,
        0.00528113, -0.22597614,  0.45746925,  0.7254707 , -0.2601727 ,
       -1.7819446 , -0.28429502, -0.07216566,  1.5086141 ,  0.34097487,
       -0.05817715,  0.18997292, -0.36550254,  0.0503162 ,  0.11474153,
        0.2679809 ,  0.06190809,  0.27976716,  0.5992585 ,  0.09

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
def get_sim_mat(sentences,sentence_vectors,dim):
    """Compute the pairwise cosine-similarity matrix of the sentence vectors.

    Args:
        sentences: the sentences; only their count is used.
        sentence_vectors: one vector of length ``dim`` per sentence.
        dim: length of each sentence vector.

    Returns:
        numpy array of shape (len(sentences), len(sentences)) whose entry
        [i][j] is the cosine similarity of vectors i and j, with the diagonal
        set to 0 so a sentence is not linked to itself.
    """
    n = len(sentences)
    if n == 0:
        # Nothing to compare; cosine_similarity is not called on an empty input.
        return np.zeros([0, 0])
    # Stack the vectors once and let a single cosine_similarity call produce
    # the whole matrix instead of calling it separately for every (i, j) pair.
    stacked = np.asarray(sentence_vectors[:n], dtype=float).reshape(n, dim)
    sim_mat = cosine_similarity(stacked)
    np.fill_diagonal(sim_mat, 0.0)
    return sim_mat

In [25]:
sim_mat = get_sim_mat(sentences,sentence_vectors,dim)

In [26]:
sim_mat[0]

array([0.        , 0.84675545, 0.85546279, 0.83095247, 0.80431628,
       0.71415198, 0.87522542, 0.86956829, 0.75488216, 0.82849383,
       0.88327944, 0.86984181, 0.87482041, 0.75345731, 0.82059687,
       0.7541002 , 0.82266867, 0.83089364, 0.81281799, 0.79938984])

In [27]:
import networkx as nx
def rank_sent(sim_mat,sentences):
    """Rank sentences by PageRank over the similarity graph.

    Args:
        sim_mat: square similarity matrix used as the graph's adjacency matrix.
        sentences: the sentences; only their count is used.

    Returns:
        list of ``(score, sentence_index)`` tuples sorted in descending order.
    """
    graph = nx.from_numpy_array(sim_mat)
    pagerank_scores = nx.pagerank(graph)
    scored = [(pagerank_scores[idx], idx) for idx in range(len(sentences))]
    scored.sort(reverse=True)
    return scored

In [28]:
ranked_sentences = rank_sent(sim_mat,sentences)


In [29]:
ranked_sentences

[(0.05382744545717655, 7),
 (0.05271410001923109, 1),
 (0.0523572127021195, 10),
 (0.05162670617803707, 18),
 (0.05156791404260303, 0),
 (0.05155716801626093, 3),
 (0.05049959070946769, 14),
 (0.050149716633304454, 19),
 (0.049789680769217574, 17),
 (0.049698028447220315, 6),
 (0.049412523226725055, 11),
 (0.049371272703831265, 8),
 (0.0490546957496752, 4),
 (0.048995438646976676, 12),
 (0.048982802625338245, 15),
 (0.048771953453086506, 9),
 (0.048663849762547694, 16),
 (0.04821598540000167, 13),
 (0.04788184075849474, 5),
 (0.0468620746986848, 2)]

In [30]:
def extract_summary(ranked_sentences, sentence_number):
    """Return the sentence indices of the top-ranked sentences.

    Args:
        ranked_sentences: sequence of ``(score, sentence_index)`` pairs,
            best first.
        sentence_number: how many sentences the summary should contain.

    Returns:
        list of the sentence indices of the first ``sentence_number`` entries.
        If fewer entries are available, all of them are returned instead of
        raising IndexError; a non-positive ``sentence_number`` gives ``[]``.
    """
    # max(..., 0) keeps a negative count from being read as a "drop from the
    # end" slice.
    top = ranked_sentences[:max(sentence_number, 0)]
    return [entry[1] for entry in top]

In [31]:

result_lst = extract_summary(ranked_sentences,8)
result_lst

[7, 1, 10, 18, 0, 3, 14, 19]

In [32]:
data = df['summary'][0]
data = sentence_tokenize(data)
gold = split_sent(data)

In [33]:
# Positions of the article sentences that also occur verbatim in the gold summary.
gold_lst = [idx for idx, sentence in enumerate(sentences) if sentence in gold]

In [34]:
result_lst


[7, 1, 10, 18, 0, 3, 14, 19]

In [35]:
gold_lst

[0, 2, 3, 6, 7, 10, 12, 14]

In [36]:
# Number of selected sentence indices that are also part of the gold summary.
correct = sum(1 for picked in result_lst if picked in gold_lst)

In [37]:
# Share of the selected sentences that are in the gold summary. Dividing by
# the actual selection size keeps this right if the count passed to
# extract_summary changes (it was hard-coded as 8).
correct/len(result_lst)


0.625

In [38]:
def test(i,word_embeddings,dim):
    """Score the extractive summary of article ``i`` against its reference.

    The article ``df['article'][i]`` is split into sentences, lower-cased,
    stripped of stop words and non-letters, embedded, ranked with
    ``rank_sent``, and the top sentences are compared with the sentences of
    ``df['summary'][i]``.

    Args:
        i: row label of the article in the module-level ``df``.
        word_embeddings: dict mapping a word to its vector.
        dim: length of the word vectors.

    Returns:
        Number of selected sentences that appear verbatim in the reference
        summary, divided by the number of reference sentences. Returns 0.0
        when the reference summary has no sentences.
    """
    # Reference sentences: tokenise the summary, then re-split each piece as
    # the walkthrough cells do.
    gold = split_sent(sentence_tokenize(df['summary'][i]))
    sentence_num = len(gold)
    if sentence_num == 0:
        # Nothing to match against; avoids dividing by zero below.
        return 0.0

    sentences = split_sent(df['article'][i])
    lower_sentences = [s.lower() for s in sentences]
    clean_stopword_sentences = [remove_stopwords(r.split()) for r in lower_sentences]
    clean_sentences = clean_sentence(clean_stopword_sentences)
    sentence_vectors = get_sentence_vector(clean_sentences,word_embeddings,dim)
    sim_mat = get_sim_mat(sentences,sentence_vectors,dim)
    ranked_sentences = rank_sent(sim_mat,sentences)

    # Select as many sentences as the reference has, but never more than exist.
    result_lst = extract_summary(ranked_sentences, min(sentence_num, len(ranked_sentences)))

    # Set membership keeps the comparison linear; separate loop names avoid
    # shadowing the article label ``i``.
    gold_set = set(gold)
    gold_idx = {idx for idx, sent in enumerate(sentences) if sent in gold_set}
    correct = sum(1 for picked in result_lst if picked in gold_idx)
    return correct/sentence_num

In [39]:
# Evaluate every loaded article with the 300-dimensional GloVe vectors.
# Note: this rebinds the notebook-level `dim` and `word_embeddings` that the
# earlier walkthrough cells set for the 100-dimensional vectors.
result = []
dim = 300
word_embeddings = obtain_word_embeddings('glove.6B.300d.txt')

# Size the loop from the DataFrame instead of a hard-coded article count, and
# create the progress bar only after the embeddings have been loaded.
t = trange(len(df))
for i in t:
    accuracy = test(i,word_embeddings,dim)
    result.append(accuracy)
    # Show the score of the most recent article next to the progress bar.
    t.set_postfix(Acc='%g' % accuracy)

100%|██████████| 510/510 [02:03<00:00,  4.11it/s, Acc=0.666667] 


In [40]:
sum(result)/len(result)


0.5763393490754177

In [41]:
data

['TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.',
 'For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.',
 "However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.",
 'Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 'For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.',
 'It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.',
 "Time Warner's fourth quarter profits were slightly better than analysts' expectations."]

In [42]:
from rouge import Rouge
# Build the system summary from the notebook-level `sentences` and
# `result_lst` (test() only uses local variables, so these are still the
# values from the single-article walkthrough cells above).
# Sentences are concatenated with no separator between them.
hypothesis = ''.join([sentences[i] for i in result_lst])
# Reference text: the tokenised summary sentences in `data`, concatenated the
# same way.
reference = ''.join([ i for i in data])

rouge = Rouge()

# Score the hypothesis against the reference; the next cell reads
# scores[0]['rouge-2']['f'] from the result.
scores = rouge.get_scores(hypothesis, reference)

In [43]:
scores[0]['rouge-2']['f']


0.5769230719436228

In [44]:
" ".join([sentences[i] for i in sorted(result_lst)])

"Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. Time Warner's fourth quarter profits were slightly better than analysts' expectations. For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins. It intends to adjust the way it accounts for a deal with German music publisher Bertelsmann's purchase of a stake in AOL Europe, which it had reported as advertising revenue. It will now book the sale of its stake in AOL Europe as a loss on the value of

In [45]:
df['summary'][0]

"TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.For 2005, TimeWarner is projecting operating earnings growth of around 5%, and also expects higher revenue and wider profit margins.It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters.Time Warner's fourth quarter profits were slightly better than analysts' expectations."

In [46]:
len(df)

510