In [1]:
# Import all the required libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import operator

# Text preprocessing libraries
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

# libraries for keyword extraction with tf-idf
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [43]:
# Load the objects pickled by the previous notebooks in this series.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that you produced yourself; confirm these come from a trusted run.
scoped_categorised_literature = pd.read_pickle("./1_scoped_cat_lit.pkl")
extracted_literature_data = pd.read_pickle("./2_extracted_literature_data.pkl")
# Show which columns the scoped/categorised table provides
print(scoped_categorised_literature.columns)

Index(['Date', 'question_idx', 'pdf_json_files', 'pmc_json_files', 'Study',
       'Study Link', 'Journal', 'Study Type', 'Factors', 'Influential',
       'Excerpt', 'Measure of Evidence', 'Added on'],
      dtype='object')


In [44]:
# Preview the 'Excerpt' column (the free-text field of the literature table)
print(scoped_categorised_literature['Excerpt'])

1     Comparing these four scenarios, we shall deduc...
2     Our study reveals that the strict control meas...
3     We then compare the transmission rates in diff...
4     Figure 10 shows that the number of the exposed...
5     Lockdown showed highest reduction (28%) in num...
                            ...                        
46    Generally, the curves tended to be not associa...
53    We find the high temperature and relative humi...
54    We find the high temperature and relative humi...
57    The regression model, demonstrates that both a...
58    The regression model, demonstrates that both a...
Name: Excerpt, Length: 400, dtype: object


In [49]:
# Data pre-processing function
def preprocess(inputText):
    """Normalise one piece of text into a list of stemmed, lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every ``<digits>%`` with the word ``percent``;
      3. collapse every run of digits / non-word characters into one space;
      4. split on whitespace;
      5. drop NLTK English stop words;
      6. Porter-stem each remaining word, then WordNet-lemmatise the stem.

    Parameters
    ----------
    inputText : str
        Raw text. Any non-string value (for example ``None`` or the float
        ``NaN`` pandas uses for a missing cell) yields an empty list instead
        of raising ``AttributeError``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing cells arrive as None/NaN; treat them as "no tokens".
    if not isinstance(inputText, str):
        return []

    # define stopwords
    stop_words = set(stopwords.words("english"))
    # lower case the text
    outputText = inputText.lower()
    # Convert percentages into the word "percent". The surrounding spaces keep
    # the token separate when the percentage is glued to a word
    # (e.g. "28%reduction"); surplus whitespace is discarded by split() below.
    outputText = re.sub(r'\d+%', ' percent ', outputText)
    # Remove special characters and digits
    outputText = re.sub(r'(\d|\W)+', ' ', outputText)
    # Tokenisation + stop-word removal
    tokens = [word for word in outputText.split() if word not in stop_words]
    # Stemming followed by lemmatisation, applied word by word
    ps = PorterStemmer()
    lem = WordNetLemmatizer()
    return [lem.lemmatize(ps.stem(word)) for word in tokens]

In [61]:
# Build a word -> integer-id vocabulary with a count vectorizer
def build_dict(data, vocab_size = 5000):
    """Map the most frequent tokens in ``data`` to integer ids.

    Parameters
    ----------
    data : iterable of list of str
        Already-tokenised documents; each document is a sequence of tokens.
    vocab_size : int, default 5000
        Upper bound on the vocabulary. Only the ``vocab_size - 2`` most
        frequent tokens receive an id, because ids 0 and 1 are left free
        (reserved for "no word" and "infrequent word" labels).

    Returns
    -------
    dict
        ``{token: id}`` where the most frequent token gets id 2, the next 3,
        and so on. Tokens with equal counts keep the vectorizer's feature
        order.
    """
    # The documents are token lists already, so both the preprocessor and the
    # tokenizer are identity functions.
    vectorizer = CountVectorizer(max_features = vocab_size, preprocessor=lambda x:x, tokenizer=lambda x:x)
    features_train = vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        word_list = list(vectorizer.get_feature_names_out())
    else:
        word_list = vectorizer.get_feature_names()

    # Column sums taken on the sparse matrix directly -- no need to densify the
    # whole document-term matrix just to total each column.
    count_list = np.asarray(features_train.sum(axis=0)).ravel()

    # dict storing words occurring in all documents and how often they occur
    word_count = dict(zip(word_list, count_list))
    # Stable sort, most frequent first
    ranked = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    sorted_words = [word for word, _ in ranked]

    # -2 to save room for no words and infrequent labels
    return {word: idx + 2 for idx, word in enumerate(sorted_words[:vocab_size - 2])}

In [51]:
# Run preprocess on every value of the 'Excerpt' column; the result holds one
# processed entry per row.
train_text_list = scoped_categorised_literature['Excerpt'].apply(preprocess)

In [52]:
# Print every pre-processed entry, one per line, for a visual check
for row in train_text_list:
    print(row)

['compar', 'four', 'scenario', 'shall', 'deduc', 'order', 'effect', 'contain', 'outbreak', 'lockdown', 'shall', 'address', 'least', 'percent', 'popul', 'reduc', 'contact', 'rate', 'percent', 'usual', 'contact']
['studi', 'reveal', 'strict', 'control', 'measur', 'implement', 'india', 'substanti', 'mitig', 'dissemin', 'sar', 'cov', 'importantli', 'model', 'simul', 'predict', 'percent', 'reduct', 'outbreak', 'june', 'percent', 'reduct', 'outbreak', 'juli', 'india']
['compar', 'transmiss', 'rate', 'differ', 'time', 'window', 'first', 'sub', 'sampl', 'one', 'new', 'infect', 'lead', 'case', 'within', 'week', 'impli', 'fast', 'growth', 'number', 'case', 'howev', 'second', 'sub', 'sampl', 'effect', 'decreas', 'suggest', 'public', 'health', 'measur', 'impos', 'late', 'januari', 'effect', 'limit', 'spread', 'viru']
['figur', 'show', 'number', 'expo', 'individu', 'region', 'decreas', 'without', 'control', 'control', 'end', 'implement', 'propos', 'strategi', 'figur', 'demonstr', 'number', 'infect'

In [62]:
# Build the vocabulary from the pre-processed excerpts, then report its size
# followed by the full word -> id mapping (one per line).
word_dict = build_dict(train_text_list)
print(len(word_dict), word_dict, sep="\n")

1030
{'percent': 2, 'case': 3, 'temperatur': 4, 'number': 5, 'infect': 6, 'rate': 7, 'ir': 8, 'covid': 9, 'increas': 10, 'reduc': 11, 'c': 12, 'day': 13, 'reduct': 14, 'quarantin': 15, 'delay': 16, 'decreas': 17, 'peak': 18, 'ccr': 19, 'ci': 20, 'effect': 21, 'humid': 22, 'week': 23, 'cov': 24, 'sar': 25, 'isol': 26, 'distanc': 27, 'popul': 28, 'r': 29, 'wn': 30, 'detect': 31, 'measur': 32, 'scenario': 33, 'social': 34, 'time': 35, 'contact': 36, 'individu': 37, 'result': 38, 'transmiss': 39, 'growth': 40, 'reproduct': 41, 'antibodi': 42, 'control': 43, 'intervent': 44, 'specif': 45, 'daili': 46, 'death': 47, 'epidem': 48, 'averag': 49, 'incid': 50, 'model': 51, 'confirm': 52, 'figur': 53, 'high': 54, 'one': 55, 'school': 56, 'correl': 57, 'cumul': 58, 'posit': 59, 'associ': 60, 'ratio': 61, 'total': 62, 'closur': 63, 'peopl': 64, 'show': 65, 'new': 66, 'valu': 67, 'lockdown': 68, 'march': 69, 'per': 70, 'rel': 71, 'respect': 72, 'sc': 73, 'th': 74, 'end': 75, 'implement': 76, 'infecti