In [2]:
import numpy as np
import pandas as pd
import re
import nltk
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer 
from nltk.tokenize import word_tokenize

# WordNetLemmatizer needs the WordNet corpus: run nltk.download('wordnet') once before use
from nltk.stem.wordnet import WordNetLemmatizer
#To track function execution
from tqdm import tqdm
from bs4 import BeautifulSoup


from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

import math

In [3]:
# Load the raw dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('assignment_data.csv')

In [4]:
df.head()

Unnamed: 0,SetID,Adverse Reactions,Summary
0,a834d1cf-72fc-93bf-e053-2995a90a6191,The following adverse events were observed and...,
1,a835b697-2beb-1ba8-e053-2995a90a470c,The following serious adverse reactions are de...,
2,a837f13e-fafc-0535-e053-2995a90a5070,ADVERSE REACTIONS Clinical Trials Experience I...,
3,a838204b-9564-9aa6-e053-2a95a90af02f,ADVERSE REACTIONS Clinical Trials Experience I...,
4,f265e6dd-f47e-4511-9468-282184bcd1b1,The most common adverse reactions leading to d...,


In [5]:
len(df)

990

In [6]:
# Keep only rows whose 'Adverse Reactions' text is present and is neither the
# "ERROR1" marker nor a stray copy of the column header.
reaction_text = df['Adverse Reactions']
is_usable = (
    reaction_text.notnull()
    & (reaction_text != "ERROR1")
    & (reaction_text != "Adverse Reactions")
)
df = df[is_usable]

In [7]:
len(df)

989

In [8]:
# Copy the raw 'Adverse Reactions' text into a plain list and attach it as the
# 'sentences' column that the cleaning step reads.
sentences = list(df['Adverse Reactions'])

df['sentences'] = sentences

In [9]:
df.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,sentences
0,a834d1cf-72fc-93bf-e053-2995a90a6191,The following adverse events were observed and...,,The following adverse events were observed and...
1,a835b697-2beb-1ba8-e053-2995a90a470c,The following serious adverse reactions are de...,,The following serious adverse reactions are de...
2,a837f13e-fafc-0535-e053-2995a90a5070,ADVERSE REACTIONS Clinical Trials Experience I...,,ADVERSE REACTIONS Clinical Trials Experience I...
3,a838204b-9564-9aa6-e053-2a95a90af02f,ADVERSE REACTIONS Clinical Trials Experience I...,,ADVERSE REACTIONS Clinical Trials Experience I...
4,f265e6dd-f47e-4511-9468-282184bcd1b1,The most common adverse reactions leading to d...,,The most common adverse reactions leading to d...


### Text Cleaning

In [10]:
# Text Cleaning
corpus = []
def clean_content(df):
    cleaned_content = []

    for sent in tqdm(df["sentences"]):
        
        #remove html content
        review_content = BeautifulSoup(sent).get_text()
            
        #remove non-alphabetic characters
        review_content = re.sub("[^a-zA-Z]"," ", review_content)
    
        #tokenize the sentences
        words = word_tokenize(review_content.lower())
    
        #lemmatize each word to its lemma
        lem = WordNetLemmatizer()
        lemma_words = [lem.lemmatize(word) for word in words] 
        lemma_words = " ".join(lemma_words)
        cleaned_content.append(lemma_words)
        
        corpus.append(lemma_words)
        
    return(cleaned_content)


In [11]:
# Clean every row of df["sentences"]; clean_content also appends each cleaned
# string to the module-level `corpus` list as a side effect.
cleaned_sentences = clean_content(df)

100%|███████████████████████████████████████████████████████████████████████████████| 989/989 [00:02<00:00, 347.44it/s]


In [12]:
# Store the cleaned text alongside the raw text and preview the first rows.
df['cleaned_sentences'] = cleaned_sentences
df.head()

Unnamed: 0,SetID,Adverse Reactions,Summary,sentences,cleaned_sentences
0,a834d1cf-72fc-93bf-e053-2995a90a6191,The following adverse events were observed and...,,The following adverse events were observed and...,the following adverse event were observed and ...
1,a835b697-2beb-1ba8-e053-2995a90a470c,The following serious adverse reactions are de...,,The following serious adverse reactions are de...,the following serious adverse reaction are des...
2,a837f13e-fafc-0535-e053-2995a90a5070,ADVERSE REACTIONS Clinical Trials Experience I...,,ADVERSE REACTIONS Clinical Trials Experience I...,adverse reaction clinical trial experience in ...
3,a838204b-9564-9aa6-e053-2a95a90af02f,ADVERSE REACTIONS Clinical Trials Experience I...,,ADVERSE REACTIONS Clinical Trials Experience I...,adverse reaction clinical trial experience in ...
4,f265e6dd-f47e-4511-9468-282184bcd1b1,The most common adverse reactions leading to d...,,The most common adverse reactions leading to d...,the most common adverse reaction leading to di...


### Summarising the text

In [17]:
def _word_positions(vocabulary):
    """Map each vocabulary word to the index of its first occurrence."""
    positions = {}
    for index, word in enumerate(vocabulary):
        positions.setdefault(word, index)
    return positions


def _cooccurrence_weights(word_to_index, vocab_len, processed_text, window_size):
    """Build the float32 co-occurrence matrix that weights the word graph.

    Every sliding window over ``processed_text`` contributes ``1 / distance``
    for each ordered pair of distinct vocabulary words it contains, where the
    distance is measured between the first occurrences of the two words inside
    that window. A given pair of absolute positions is counted only once even
    though overlapping windows see it several times.
    """
    weighted_edge = np.zeros((vocab_len, vocab_len), dtype=np.float32)
    covered_cooccurrences = set()

    for window_start in range(len(processed_text) - window_size + 1):
        window = processed_text[window_start:window_start + window_size]

        # Absolute position of the first occurrence of each vocabulary word.
        first_position = {}
        for offset, word in enumerate(window):
            if word in word_to_index and word not in first_position:
                first_position[word] = window_start + offset

        for word_i, position_i in first_position.items():
            for word_j, position_j in first_position.items():
                pair = (position_i, position_j)
                if word_i == word_j or pair in covered_cooccurrences:
                    continue
                row, col = word_to_index[word_i], word_to_index[word_j]
                weighted_edge[row, col] += 1 / abs(position_i - position_j)
                covered_cooccurrences.add(pair)

    return weighted_edge


def _rank_words(weighted_edge, damping, max_iterations, threshold):
    """Score every word with an in-place (Gauss-Seidel style) PageRank iteration."""
    vocab_len = weighted_edge.shape[0]
    score = np.ones(vocab_len, dtype=np.float32)
    if vocab_len == 0:
        return score

    # Total edge weight attached to each word.
    inout = weighted_edge.sum(axis=1)

    # A word with no edges has inout == 0, but its column of weights is all
    # zero as well (the matrix is symmetric), so any non-zero divisor works.
    safe_inout = inout.copy()
    safe_inout[safe_inout == 0] = 1
    transition = weighted_edge / safe_inout  # element [i, j] = weight[i, j] / inout[j]

    for _ in range(max_iterations):
        prev_score = score.copy()
        for i in range(vocab_len):
            # Uses the scores already updated in this sweep, like the original loop.
            score[i] = (1 - damping) + damping * (transition[i] @ score)
        # Stop as soon as a full sweep no longer changes the scores noticeably.
        if np.sum(np.fabs(prev_score - score)) <= threshold:
            break

    return score


def _candidate_phrases(text, stopwords_plus, word_to_index):
    """Split ``text`` at stop words into unique candidate phrases (lists of words)."""
    phrases = []
    current = []
    for word in text:
        if word in stopwords_plus:
            if current:
                phrases.append(" ".join(current).split())
            current = []
        else:
            current.append(str(word))
    # Flush the trailing phrase: text that does not end on a stop word would
    # otherwise lose its final candidate.
    if current:
        phrases.append(" ".join(current).split())

    # De-duplicate while keeping first-seen order.
    unique_phrases = []
    seen = set()
    for phrase in phrases:
        key = tuple(phrase)
        if key not in seen:
            seen.add(key)
            unique_phrases.append(phrase)

    # Drop a single-word phrase when the same vocabulary word already appears
    # inside a longer phrase.
    words_in_longer_phrases = {
        word for phrase in unique_phrases if len(phrase) > 1 for word in phrase
    }
    return [
        phrase for phrase in unique_phrases
        if not (
            len(phrase) == 1
            and phrase[0] in word_to_index
            and phrase[0] in words_in_longer_phrases
        )
    ]


def extract_score_summary(vocabulary, stopwords_plus, text, processed_text,
                          window_size=3, top_n=5, damping=0.85,
                          max_iterations=50, threshold=0.0001):
    """Rank the key phrases of a document and return the best ones.

    Words are scored by PageRank over a co-occurrence graph built from
    ``processed_text``; candidate phrases are the runs of non-stop-words in
    ``text``; a phrase's score is the sum of its words' scores.

    Args:
        vocabulary: list of the distinct words of ``processed_text``.
        stopwords_plus: container of stop words (membership is tested with ``in``).
        text: list of all tokens of the document, in order.
        processed_text: ``text`` with the stop words removed, in order.
        window_size: number of consecutive words treated as co-occurring.
        top_n: maximum number of phrases to return; ``None`` returns all of them.
        damping: PageRank damping factor.
        max_iterations: upper bound on PageRank sweeps.
        threshold: a sweep that changes the scores by no more than this total
            amount ends the iteration early.

    Returns:
        str: the selected phrases, best first, separated by ``",\\n"``;
        an empty string when there are no phrases.

    Raises:
        ValueError: if ``window_size`` is smaller than 2, or if a phrase
            contains a word that is missing from ``vocabulary``.
    """
    if window_size < 2:
        raise ValueError("window_size must be at least 2")

    word_to_index = _word_positions(vocabulary)
    weighted_edge = _cooccurrence_weights(
        word_to_index, len(vocabulary), processed_text, window_size
    )
    score = _rank_words(weighted_edge, damping, max_iterations, threshold)
    unique_phrases = _candidate_phrases(text, stopwords_plus, word_to_index)

    # Score each phrase as the sum of its word scores.
    phrase_scores = []
    keywords = []
    for phrase in unique_phrases:
        phrase_score = 0
        for word in phrase:
            if word not in word_to_index:
                raise ValueError("%r is not in vocabulary" % (word,))
            phrase_score += score[word_to_index[word]]
        phrase_scores.append(phrase_score)
        keywords.append(" ".join(phrase))

    # Highest score first; the relative order of exactly tied phrases is unspecified.
    sorted_index = np.flip(np.argsort(phrase_scores), 0)

    return ",\n".join(keywords[index] for index in sorted_index[:top_n])

In [18]:
# Stop-word lists already read from disk, keyed by file path.
_STOPWORD_FILE_CACHE = {}


def _load_stopword_file(path):
    """Return the stop words listed one per line in ``path``, reading the file only once."""
    if path not in _STOPWORD_FILE_CACHE:
        # The context manager closes the handle even if reading fails.
        with open(path, "r") as stopword_file:
            _STOPWORD_FILE_CACHE[path] = [line.strip() for line in stopword_file]
    return _STOPWORD_FILE_CACHE[path]


def summarize_reaction(sentence, stopword_path="stopwords.txt"):
    """Summarise one cleaned document as its top-ranked key phrases.

    The text is split on single spaces and POS-tagged with ``nltk.pos_tag``.
    Every word whose tag is not a noun, adjective, gerund or foreign-word tag
    is treated as a stop word, together with the words listed in
    ``stopword_path``. The remaining words are handed to
    ``extract_score_summary``.

    Args:
        sentence: the cleaned text to summarise, as a space-separated string.
        stopword_path: text file with one additional stop word per line.
            Its contents are cached after the first read.

    Returns:
        str: whatever ``extract_score_summary`` returns for this text.
    """
    # Preparing the text for summarizing
    tokens = [token for token in sentence.split(" ") if token != ""]

    # Part-of-speech tagging to find document-specific stop words
    wanted_pos = ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW')
    pos_stopwords = [word for word, tag in nltk.pos_tag(tokens) if tag not in wanted_pos]

    # A word tagged with an unwanted POS anywhere becomes a stop word everywhere.
    stopwords_plus = set(pos_stopwords)
    stopwords_plus.update(_load_stopword_file(stopword_path))

    # Processing the text by removing the stop words
    processed_text = [token for token in tokens if token not in stopwords_plus]

    # Vocabulary of distinct words in first-occurrence order. Unlike
    # list(set(...)), this order is the same on every run, so ties between
    # equally scored phrases are broken reproducibly.
    vocabulary = list(dict.fromkeys(processed_text))

    return extract_score_summary(vocabulary, stopwords_plus, tokens, processed_text)

In [19]:
# Holds one keyword summary (as returned by summarize_reaction) per processed document.
summary = []

In [16]:
# Summarise only the first few documents as a quick check of the pipeline.
# The list is rebuilt from scratch here, so re-running this cell does not
# append duplicate entries to a list created in another cell.
PREVIEW_ROWS = 5
summary = [
    summarize_reaction(sentence)
    for sentence in df['cleaned_sentences'].head(PREVIEW_ROWS)
]

NameError: name 'processed_text' is not defined