## TextRank Algorithm

In [111]:
import numpy as np
import pandas as pd
import nltk
##nltk.download('punkt')
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import contractions
from nltk.corpus import stopwords
# NLTK's English stop words, minus the negations 'no' and 'not' so that they
# survive cleaning. Filtering (instead of list.remove) cannot raise ValueError
# if a word is missing from the installed corpus, and drops every occurrence.
# TODO(reverify): confirm that keeping 'no'/'not' is what the summariser needs.
stop_words = [word for word in stopwords.words('english') if word not in ('no', 'not')]
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [5]:
# Load the tennis articles from a CSV in the working directory and preview the first rows.
# The preview shows the columns article_id, article_text and source.
document = pd.read_csv('tennis_articles_v4.csv')
document.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [6]:
# Display the full text of the first article to see what the raw input looks like.
document['article_text'].iloc[0]

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

In [13]:
# Flatten all articles into a single list of sentences, keeping article order
# and the sentence order within each article.
sentences = [
    sentence
    for news in document['article_text']
    for sentence in sent_tokenize(news)
]

sentences[:2]

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much."]

In [17]:
## Using 100D Glove Vectors

# Maps each token to its embedding: a float32 array of length glove_dimension.
word_embeddings = {}
glove_dimension = 100
# Raw string: the Windows backslashes must not be parsed as escape sequences
# (the non-raw form relied on '\P' and '\g' being unrecognised escapes).
filename = r"D:\Project data\glove_vectors\glove.6B.100d.txt"

with open(filename, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no token to read, so skip it instead of
            # failing on values[0].
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A coefficient could not be parsed as a number. Only this error is
            # handled so that KeyboardInterrupt and real bugs still surface.
            print(word,'  :Embedding DataType Exception')
            continue
        if coefs.shape[0] == glove_dimension:
            word_embeddings[word] = coefs
        else:
            # Wrong number of coefficients for the configured dimension.
            print(word,'  :Embedding Length Exception')

print('Total number of words in the Glove Embedding:', len(word_embeddings))

Total number of words in the Glove Embedding: 400000


In [80]:
def clean_text(sentences):
    """Normalise one sentence before embedding lookup.

    The text is passed through ``contractions.fix``, lower-cased, every
    character outside ``a-z`` is replaced by a space, and tokens present in
    the module-level ``stop_words`` are dropped.

    Parameters
    ----------
    sentences : str
        A single sentence (the parameter name is plural, but the value is
        processed as one string).

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when
        no token survives.
    """
    expanded = contractions.fix(sentences)
    letters_only = re.sub(r'[^a-z]', ' ', expanded.lower())
    kept_tokens = [token for token in letters_only.split() if token not in stop_words]
    # Joining whitespace-split tokens cannot leave leading or trailing blanks,
    # so no further stripping is required.
    return ' '.join(kept_tokens)

# Cleaned counterpart of `sentences`, index-aligned with it.
clean_sentences = list(map(clean_text, sentences))

clean_sentences[:2]

['maria sharapova basically no friends tennis players wta tour',
 'russian player no problems openly speaking recent interview said not really hide feelings much']

In [103]:
## Mean of the word vectors for each sentence
# Each cleaned sentence becomes one vector of length glove_dimension.
# The size comes from glove_dimension (not a literal 100) so this cell stays
# consistent with the embedding loader if the dimension is changed.
sentence_vector = []
for sentence in clean_sentences:
    words = sentence.split()
    if words:
        # Words missing from the embedding table contribute a zero vector but
        # still count towards the divisor.
        word_vectors = [word_embeddings.get(word, np.zeros((glove_dimension,))) for word in words]
        # True mean: `words` is non-empty here, so no epsilon is needed to
        # guard the division.
        vector = np.sum(word_vectors, axis=0) / len(words)
    else:
        # Empty (or whitespace-only) sentence: represent it as the zero vector.
        vector = np.zeros((glove_dimension,))
    sentence_vector.append(vector)





In [109]:
## Similarity Matrix

# sim_matrix[i][j] is the cosine similarity between sentence vectors i and j.
# A single vectorised call on the stacked vectors replaces one
# cosine_similarity call per (i, j) pair and removes the hard-coded
# reshape(1, 100), so the cell works for any embedding dimension.
if len(sentence_vector) > 0:
    sim_matrix = cosine_similarity(np.asarray(sentence_vector, dtype=np.float64))
else:
    # No sentences: keep the empty square matrix the loop version produced.
    sim_matrix = np.zeros([0, 0])
# NOTE(review): the diagonal (self-similarity) is kept, exactly as before.
# Presumably this yields self-loops in the graph built from this matrix -
# confirm that is intended, or zero the diagonal.

In [113]:
## Applying the PageRank algorithm

# Every sentence is a node of the graph; the similarity matrix supplies the
# edge weights that PageRank uses when scoring the nodes.
nx_graph = nx.from_numpy_array(sim_matrix)
# `scores` is indexed by sentence position (node id).
scores = nx.pagerank(nx_graph, weight='weight')

In [121]:
# Pair every original (uncleaned) sentence with its PageRank score.
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]

# Tuples compare element by element, so this orders by score (highest first)
# and falls back to the sentence text when two scores are equal.
ranked_sentences.sort(reverse=True)

In [131]:
# Print the five highest-ranked sentences as the extractive summary.
# Slicing, unlike indexing over range(5), cannot raise IndexError when fewer
# than five sentences were ranked.
for _score, top_sentence in ranked_sentences[:5]:
    print(top_sentence)
    print('\n')

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.


Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.


Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.


"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.


Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in