In [1]:
# This is a single-domain-multiple-documents summarization task

In [37]:
import numpy as np
import pandas as pd
import networkx as nx

import nltk
import re

from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt') # one time execution
stop_words = stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NunYurBusiness\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NunYurBusiness\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
df = pd.read_csv("tennis_articles_v4.csv")

In [10]:
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [11]:
sentences = []
# for s in df['article_text']:
#     sentences.append(sent_tokenize(s))

sentences = [sent_tokenize(s) for s in df['article_text']]
    
sentences = [y for x in sentences for y in x] # flatten list

In [12]:
sentences[:5]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl."]

In [15]:
glove_path = "D://Datasets//glove//twitter_embeddings//glove.twitter.27B.200d.txt"

f = open(glove_path, encoding='utf-8')
word_embeddings = {}

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [16]:
len(word_embeddings)

1193514

In [17]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).replace("[^a-zA-Z]", " ")

# make words lower case
clean_sentences = [s.lower() for s in clean_sentences]

In [22]:
# function to remove stopwords
def remove_stopwords(sentence):
    sen_new = " ".join([i for i in sentence if i not in stop_words])
    return sen_new

In [23]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [24]:
# extract word vectors
word_embeddings = {}

f = open(glove_path, encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [29]:
# now we generate the sentence vectors.
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((200,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((200,))
    sentence_vectors.append(v)

In [30]:
# now we prep. the similarity matrix - where we will use the cosine similarity
sim_mat = np.zeros([len(sentences), len(sentences)])

In [35]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,200), sentence_vectors[j].reshape(1,200))[0,0]

In [38]:
# apply textrank / pagerank
# first we must convert the matrix into a graph, where the nodes in the graph represent the similarity scores between sentences
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [40]:
# now we perform summary extraction - where we extract the top N sentences based on their rankings
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# extract top 10 sentences
for i in range(10):
    print(ranked_sentences[i][1] + "\n")

"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.

He used his first break point to close out the first set before going up 3-0 in the second and wrapping up the win on his first match point.

Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.

"Not always, but I really feel like in the mid-2000 years there was a huge shift of the attitudes of the top players and being more fri