In [None]:
# summary of what did I do.
# I used text rank algorithim which is a derivative of PageRank Algorithim used primarily for ranking web pages in online search results.
# In text rank algo we do similar thing i.e 
   #In place of web pages, we use sentences
   #Similarity between any two sentences is used as an equivalent to the web page transition probability
   #The similarity scores are stored in a square matrix, similar to the matrix M used for PageRank

#TextRank is an extractive and unsupervised text summarization technique.

#PROCEDURE
      #The first step would be to concatenate all the text contained in the articles
      #Then split the text into individual sentences
      #In the next step, i found the vector representation (word embeddings) for each and every sentence using gensim library
      #Similarities between sentence vectors are then calculated and stored in a matrix using cosine similarity with scikit learn library
      #The similarity matrix is then converted into a graph, with sentences as vertices and similarity scores as edges, for sentence rank calculation
      #Finally, a certain number of top-ranked sentences form the final summary.

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

In [None]:
df = pd.read_excel(r'C:\Users\ISHA JAIN\Downloads\TASK.xlsx', sep='delimiter')

In [None]:
df.head()

In [5]:
df['introduction'][0]

'Acnesol Gel is an antibiotic that fights bacteria. It is used to treat acne, which appears as spots or pimples on your face, chest or back. This medicine works by attacking the bacteria that cause these pimples.Acnesol Gel is only meant for external use and should be used as advised by your doctor. You should normally wash and dry the affected area before applying a thin layer of the medicine. It should not be applied to broken or damaged skin. Avoid any contact with your eyes, nose, or mouth. Rinse it off with water if you accidentally get it in these areas. It may take several weeks for your symptoms to improve, but you should keep using this medicine regularly. Do not stop using it as soon as your acne starts to get better. Ask your doctor when you should stop treatment.Common side effects like minor itching, burning, or redness of the skin and oily skin may be seen in some people. These are usually temporary and resolve on their own. Consult your doctor if they bother you or do no

In [6]:
from nltk.tokenize import sent_tokenize
sentences = []
for s in df['introduction']:
  sentences.append(sent_tokenize(s))

sentences = [y for x in sentences for y in x] # flatten list


In [7]:
sentences[:14]

['Acnesol Gel is an antibiotic that fights bacteria.',
 'It is used to treat acne, which appears as spots or pimples on your face, chest or back.',
 'This medicine works by attacking the bacteria that cause these pimples.Acnesol Gel is only meant for external use and should be used as advised by your doctor.',
 'You should normally wash and dry the affected area before applying a thin layer of the medicine.',
 'It should not be applied to broken or damaged skin.',
 'Avoid any contact with your eyes, nose, or mouth.',
 'Rinse it off with water if you accidentally get it in these areas.',
 'It may take several weeks for your symptoms to improve, but you should keep using this medicine regularly.',
 'Do not stop using it as soon as your acne starts to get better.',
 'Ask your doctor when you should stop treatment.Common side effects like minor itching, burning, or redness of the skin and oily skin may be seen in some people.',
 'These are usually temporary and resolve on their own.',
 'Co

In [8]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [9]:
#Get rid of the stopwords (commonly used words of a language – is, am, the, of, in, etc.) present in the sentences. 
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\ISHA
[nltk_data]     JAIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
#import the stopwords.

from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [11]:
# function to remove stopwords

def remove_stopwords(sent):
    sent_new = " ".join([i for i in sent if i not in stop_words])
    return sent_new

# remove stopwords from the sentences

clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [20]:
#Then I used clean_sentences to create vectors for sentences in the data with the help of the GloVe word vectors.

# Extract word vectors
word_embeddings = {}
f = open(r'C:\\Users\\ISHA JAIN\\Downloads\\glove.6B\\glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()



In [21]:
#Now, I create vectors for my sentences. 
#First I fetched vectors (each of size 100 elements) for the constituent words in a sentence
#And then took mean/average of those vectors to arrive at a consolidated vector for the sentence.

sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [22]:
#The next step is to find similarities between the sentences, hence I used the cosine similarity approach for this.
#first I defined a zero matrix of dimensions (n * n). 
#Then I initialized this matrix with cosine similarity scores of the sentences. Here, n is the number of sentences.

# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [None]:
#Applying PageRank Algorithm
#Converting the similarity matrix sim_mat into a graph.
#The nodes of this graph will represent the sentences and the edges will represent the similarity scores between the sentences. 
#On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.

import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
#Finally, I extracted the top N sentences based on their rankings for summary generation.

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [None]:
# Extract the summary by giving the row no. of the description you want to be summarized.
n=int(input('enter the row number of the description from the data frame you wish to summarize'))
for i in range(n):
  print(ranked_sentences[i][0])