In [1]:
import numpy as np
import pandas as pd
import nltk
# libraries and programs for NLP for English written in Python
nltk.download('punkt')
# 'punkt' is the tokenizer resource: it divides a text into a list of sentences
import re
# regular expressions: a pattern specifies the set of strings that match it

In [3]:
# load the tennis articles; the CSV is decoded as latin-1
df = pd.read_csv(
    '../input/tennisdata/tennis_articles.csv',
    encoding='latin-1',
)

In [4]:
# call head() so the first rows are displayed; a bare `df.head` only shows
# the repr of the bound method instead of any data
df.head()

In [5]:
# peek at a few raw articles to see what the text looks like
df.loc[0, 'article_text']

In [6]:
# second article (row label 1)
df.loc[1, 'article_text']

In [7]:
# fourth article (row label 3)
df.loc[3, 'article_text']

In [9]:
# break every article into sentences and collect them in one flat list

from nltk.tokenize import sent_tokenize
sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]

In [10]:
# show the first five entries of the flattened sentence list
sentences[:5]

In [11]:
# Download GloVe Word Embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [12]:
# Load the 100-dimensional GloVe vectors into a dict: word -> float32 array.
# Each line of the file is a word followed by its coefficients.
word_embeddings = {}
# `with` guarantees the file is closed even if a line fails to parse
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [13]:
# number of words loaded into the embedding dictionary
len(word_embeddings)

In [14]:
# remove punctuations, numbers and special characters
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the sentences unchanged
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [15]:
# fetch NLTK's 'stopwords' resource, read through nltk.corpus.stopwords in the next cell
nltk.download('stopwords')

In [16]:
from nltk.corpus import stopwords
# English stop-word list; remove_stopwords() checks each token against it
stop_words = stopwords.words('english')

In [17]:
# function to remove stopwords
def remove_stopwords(sen):
    """Return the tokens of `sen` that are not in the global `stop_words`, joined by single spaces.

    `sen` is an iterable of tokens (the caller passes `sentence.split()`).
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
# strip the stopwords out of every cleaned sentence (tokenised on whitespace)
clean_sentences = [remove_stopwords(sentence.split()) for sentence in clean_sentences]

In [18]:
# Extract word vectors
# NOTE(review): this repeats the GloVe load already performed in an earlier
# cell; one of the two loads can be dropped.
word_embeddings = {}
# `with` guarantees the file is closed even if a line fails to parse
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [19]:
# One 100-dimensional vector per cleaned sentence.
sentence_vectors = []
for sentence in clean_sentences:
    tokens = sentence.split()
    if tokens:
        # Average of the word vectors; words missing from the embeddings count
        # as zero vectors. The +0.001 in the denominator is kept so the values
        # match the previous computation.
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in tokens])/(len(tokens)+0.001)
    else:
        # Empty OR whitespace-only sentence. Testing the token list rather than
        # the string length matters: for "   " the old check passed, sum([])
        # returned the int 0 and the later .reshape(1,100) failed on a scalar.
        v = np.zeros((100,))
    sentence_vectors.append(v)

In [20]:
# square matrix of pairwise sentence similarities, initialised to zero
sim_mat = np.zeros((len(sentences), len(sentences)))

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Fill the similarity matrix with one vectorised pairwise call instead of one
# scikit-learn call per (i, j) pair.
if sentence_vectors:
    # assign in place so sim_mat keeps the shape and dtype it was created with
    sim_mat[:, :] = cosine_similarity(np.vstack(sentence_vectors))
    # a sentence is not compared with itself: the diagonal stays zero, as before
    np.fill_diagonal(sim_mat, 0.0)

In [23]:
# Applying PageRank Algorithm
import networkx as nx

# build a graph from the pairwise sentence-similarity matrix
nx_graph = nx.from_numpy_array(sim_mat)
# PageRank result; looked up by sentence position (scores[i]) when ranking below
scores = nx.pagerank(nx_graph)

In [25]:
# pair every sentence with its score, then order the pairs from highest to lowest
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [26]:
# Extract top 10 sentences as the summary
# Slicing copes with fewer than 10 ranked sentences; indexing 0..9 directly
# raised IndexError in that case.
for ranked in ranked_sentences[:10]:
    # each entry is a (score, sentence) pair; print only the sentence
    print(ranked[1])