In [36]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
df = pd.read_csv("tennis.csv")
df.head(10)

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...
5,6,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...
6,7,"Tennis giveth, and tennis taketh away. The end...",http://www.tennis.com/pro-game/2018/10/tennisc...
7,8,Federer won the Swiss Indoors last week by bea...,https://www.express.co.uk/sport/tennis/1038186...


In [15]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['article_text']:
  sentences.append(sent_tokenize(s))

# sentences = [y for x in sentences for y in x]

In [17]:
sentences = [y for x in sentences for y in x]
for sentence in df['article_text']:
   sentences.append(sent_tokenize(sentence))

sentences

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players.",
 "I have not a lot of friends away from the courts.'",
 'When she said she is not really close to a lot of players, is that something strategic that she is doing?',
 "Is it different on the men's tour than the women's tour?",
 "'No, not at all.

In [18]:
#download and unzip the GloVe word vectorizer data
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-01-06 22:12:17--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-01-06 22:12:18--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-01-06 22:12:18--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [19]:
#Load the GloVe word embeddings into a dictionary
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8') #we chose the 100d format where every word  has 100 dimensional vector(like 100 floating numbers representng it)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [None]:
word_embeddings

In [25]:

nltk.download('stopwords')

clean_sentences = pd.Series(sentences).dropna().str.replace("[^a-zA-Z]", " ", regex=True)
clean_sentences = [s.lower() for s in clean_sentences if isinstance(s, str) and s.strip()]


stop_words = stopwords.words('english')

def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Compute TF-IDF scores

sentence_vectors = []

for sentence in clean_sentences:
    words = sentence.split()
    weighted_vectors = []
    tfidf_sum = 0

    for word in words:
        if word in word_embeddings and word in tfidf_feature_names:
            # Get the word's TF-IDF score
            tfidf_score = tfidf_vectorizer.vocabulary_.get(word, 0)
            # Multiply the word embedding by its TF-IDF score
            weighted_vectors.append(word_embeddings[word] * tfidf_score)
            tfidf_sum += tfidf_score

    # Compute the average of the weighted vectors
    if weighted_vectors:
        sentence_vector = np.sum(weighted_vectors, axis=0) / (tfidf_sum + 0.001)  # Normalize by TF-IDF sum
    else:
        sentence_vector = np.zeros((100,))  # Zero vector for empty sentences

    sentence_vectors.append(sentence_vector)



In [37]:


# Compute cosine similarity matrix
sim_mat = np.zeros([len(clean_sentences), len(clean_sentences)])
for i in range(len(clean_sentences)):
    for j in range(len(clean_sentences)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(
                sentence_vectors[i].reshape(1, 100),
                sentence_vectors[j].reshape(1, 100)
            )[0, 0]






In [38]:
# Build graph and apply PageRank
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [39]:
# Ensure sentences and graph nodes are aligned
ranked_sentences = sorted(((scores[i], sentences[i]) for i in scores.keys()), reverse=True)

# Print summary
for i in range(3):  # Adjust the number of sentences as needed
    print(f"Summary Sentence {i+1}: {ranked_sentences[i][1]}")

Summary Sentence 1: Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
Summary Sentence 2: Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
Summary Sentence 3: When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.


In [41]:
# Sort sentences by PageRank scores
ranked_sentences = sorted(((scores[i], sentences[i]) for i in scores.keys()), reverse=True)

# Print original article and its summary
for i in range(min(5, len(ranked_sentences))):  # Ensure we don't exceed the number of ranked sentences
    print("ARTICLE:")
    print(df['article_text'][i])  # The original article text from the DataFrame
    print('\n')
    print("SUMMARY:")
    print(ranked_sentences[i][1])  # The top-ranked sentences as the summary
    print('\n')


ARTICLE:
Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in t