In [16]:
import nltk
import math

In [17]:
def create_frequency_matrix(sentences) :
    frequency_matrix = {}
    stopWords = set(nltk.corpus.stopwords.words("english"))
    ps = nltk.PorterStemmer()
    
    for sent in sentences :
        freq_table = {}
        words = nltk.word_tokenize(sent)
        for word in words :
            word = ps.stem(word.lower())
            if word in stopWords :
                continue
            if word in freq_table :
                freq_table[word] += 1
            else :
                freq_table[word] = 1
        frequency_matrix[sent[:15]] = freq_table
    
    return frequency_matrix
        

In [18]:
def create_tf_matrix(freq_matrix) :
    tf_matrix = {}
    
    for sent, f_table in freq_matrix.items() :
        tf_table = {}
        
        count_words_in_sentences = len(f_table)
        for word, count in f_table.items() :
            tf_table[word] = count / count_words_in_sentences
            
        tf_matrix[sent] = tf_table
    
    return tf_matrix

In [19]:
def create_documents_per_words (freq_matrix) :
    word_per_doc_table = {}
    
    for sent, f_table in freq_matrix.items() :
        for word, count in f_table.items() :
            if word in word_per_doc_table :
                word_per_doc_table[word] += 1
            else :
                word_per_doc_table[word] = 1
                
    return word_per_doc_table

In [20]:
def create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [21]:
def create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(), f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [22]:
def score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [23]:
def find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [24]:

def generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [25]:
import os

full_text = ""
dir_path = "./unlabelled_documents/"

i=0
for file in os.listdir(dir_path) :
    i += 1
    if i == 141 :
        break
    f = open(dir_path+file,encoding="UTF-8", errors="replace")
    text = f.read()
    # dataset_file.write(text)
    # text = text.decode("utf-8",'ignore').encode("utf-8")
    full_text += text
    f.close()
  

text = full_text
# print(text)

In [26]:
sentences = nltk.sent_tokenize(text)
total_documents = len(sentences)

freq_matrix = create_frequency_matrix(sentences)

tf_matrix = create_tf_matrix(freq_matrix)

count_doc_per_words = create_documents_per_words(freq_matrix)

idf_matrix = create_idf_matrix(freq_matrix,  count_doc_per_words, total_documents)

tf_idf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
# print(tf_idf_matrix)

sentence_scores = score_sentences(tf_idf_matrix)
# print(sentence_scores)

threshold = find_average_score(sentence_scores)

summary = generate_summary(sentences, sentence_scores, 1.3 * threshold)

print(summary)

 The WHO has said there is still a chance of containing the virus if its chain of transmission is broken. The sudden increases of cases in Italy, the Islamic Republic of Iran and the Republic of Korea are deeply concerning. Editors Scott L. Greer, Elizabeth J. The vaccine is produced by the Beijing-based pharmaceutical company Sinovac. It also allows countries to expedite
their own regulatory approval to import and administer COVID-19 vaccines. Emerging data on effectiveness indicates 
that licenced COVID-19vaccines are contributing to controlling the spread of the disease. The endemic stage is when a population learns to live with a virus. But how do the different types of COVID-19 vaccines work? Each COVID-19 vaccine causes the immune system to
create antibodies to fight COVID-19. COVID-19 vaccines use a harmless version of a spikelike structure on the surface of the
COVID-19 virus called an S protein. Mid-term review of the UN Decade of Action on Nutrition


It is almost five years 

In [27]:
# question answering system

import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

article_text = text.lower()

article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)

article_sentences = nltk.sent_tokenize(article_text)
article_words = nltk.word_tokenize(article_text)



In [28]:
wnlemmatizer = nltk.stem.WordNetLemmatizer()

def perform_lemmatization(tokens):
    return [wnlemmatizer.lemmatize(token) for token in tokens]

punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

In [29]:
def generate_response(user_input):
    response = ''
    article_sentences.append(user_input)

    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text)
    all_word_vectors = word_vectorizer.fit_transform(article_sentences)
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)
    similar_sentence_number = similar_vector_values.argsort()[0][-2]

    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0:
        response = response + "I am sorry, no answer found"
        return response
    else:
        response = response + article_sentences[similar_sentence_number]
        return response

In [30]:
continue_dialogue = True
print("Enter your question: ")
while (continue_dialogue) :
    user_input = input().lower()
    print(user_input)
    if (user_input == 'exit') :
        continue_dialogue = False
    else :
        print("Answer is: ")
        print(generate_response(user_input))

Enter your question: 
how fifa helped in coronavirus
Answer is: 
�be it through campaigns or funding, fifa has stood up to the coronavirus, and i am delighted that world football is supporting who to kick out the coronavirus.
what was the vairant of coronavirus in south africa
Answer is: 
the b.1.1.529 variant was first reported to who from south africa on 24 november 2021. the epidemiological situation in south africa has been characterized by three distinct peaks in reported cases, the latest of which was predominantly the delta variant.
how much money was funded by un
Answer is: 
�covid-19 has taken so much from us.

Answer is: 
I am sorry, no answer found
exit
