# <center>HW 4: Text preprocessing</center>

In [247]:
import nltk, re, json, string
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
import numpy as np  
import pandas as pd
from nltk.corpus import stopwords
import spacy
import string
# nltk.download('averaged_perceptron_tagger')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Q1: Regular Expression (2 points)

Suppose you have scraped the text shown below from an online source. You'd like to extract data using regular expression.

Define an **extract** function which:
- Takes a piece of text (in the format shown below) as an input
- Extracts data into a list of tuples using regular expression, e.g.  `[('BTC-USD','56,212.15','-58.16','-0.10%'), ('ETH-USD',  ...), ...]`
- Returns the list of tuples

In [248]:
text='''Symbol   Last Price  Change   % Change   Note
                  BTC-USD  56,212.15   -58.16   -0.10%   Bitcoin 
                  ETH-USD  1,787.79    -53.63   -2.91%   Ether
                  BNB-USD  1,101,290.51      +5.81    +2.04%   Binance
                  USDT-USD 1.0003      -0.0004  -0.04%   Tether
                  ADA-USD  1.1187      -0.0528  -4.51%   Cardano
                  
                  
                  new test row below to show code works for all new values in the same format
                  and disregards any other text
                  TESTADA-USD  jxnjx 1.1187   1.5%   -0.0528  x% 1%  -4.51%    Cardano
                  
                  incase of invalid format of actual data ignore entire row 
                  invalidADA-USD   1.d87  -0.5f -4.51% Cardano
      '''
text

'Symbol   Last Price  Change   % Change   Note\n                  BTC-USD  56,212.15   -58.16   -0.10%   Bitcoin \n                  ETH-USD  1,787.79    -53.63   -2.91%   Ether\n                  BNB-USD  1,101,290.51      +5.81    +2.04%   Binance\n                  USDT-USD 1.0003      -0.0004  -0.04%   Tether\n                  ADA-USD  1.1187      -0.0528  -4.51%   Cardano\n                  \n                  \n                  new test row below to show code works for all new values in the same format\n                  and disregards any other text\n                  TESTADA-USD  jxnjx 1.1187   1.5%   -0.0528  x% 1%  -4.51%    Cardano\n                  \n                  incase of invalid format of actual data ignore entire row \n                  invalidADA-USD   1.d87  -0.5f -4.51% Cardano\n      '

In [249]:
# Define the function

def extract(text):
    """Extract (symbol, last price, change, % change) tuples from quote rows.

    A row is recognised when, on a single line, it contains in this order:
      * a symbol made of upper-case letters joined by a hyphen (e.g. BTC-USD),
      * a last price: digits with optional thousands commas and a decimal part,
        delimited by whitespace on both sides,
      * a signed change (e.g. -58.16) followed by whitespace,
      * a signed percentage change (e.g. -0.10%).
    Any other text on the line is ignored; lines that do not contain all four
    fields in a valid format produce no tuple.

    Args:
        text: the scraped text, one quote per line.

    Returns:
        A list of 4-tuples of strings, one per matching row, in order of
        appearance.
    """
    # [^\S\r\n] is "whitespace except line breaks": it keeps every field of a
    # match on the same physical row, so a broken row cannot borrow values
    # from the line below it.
    row_pattern = re.compile(
        r'''
        ([A-Z]+-[A-Z]+)                          # symbol
        .*[^\S\r\n](\d[\d,]*\.\d+)[^\S\r\n]      # last price (digits/commas only)
        .*([-+]\d+\.\d+)[^\S\r\n]                # change, sign is '-' or '+'
        .*([-+]\d+\.\d+%)                        # % change, sign is '-' or '+'
        ''',
        re.VERBOSE,
    )

    return row_pattern.findall(text)

In [250]:
# Test the function
# Runs extract on the sample `text` defined in an earlier cell; the cell's
# last expression is displayed as its output.

extract(text)

[('BTC-USD', '56,212.15', '-58.16', '-0.10%'),
 ('ETH-USD', '1,787.79', '-53.63', '-2.91%'),
 ('BNB-USD', '1,101,290.51', '+5.81', '+2.04%'),
 ('USDT-USD', '1.0003', '-0.0004', '-0.04%'),
 ('ADA-USD', '1.1187', '-0.0528', '-4.51%'),
 ('TESTADA-USD', '1.1187', '-0.0528', '-4.51%')]

## Q2: Collocation (3 points)

Define a function `top_collocation(doc, K)` to find top-K collocations in specific patterns in a document as follows:
  - Takes a document (i.e. `doc`) and `K` as inputs
  - Find collocations as follows:
    - Tokenize the document and find POS tag of each token (hint: you can use NLTK word tokenizer or Spacy tokenizer).
    - Create bigrams from the tokens with POS tags.

    - Keep only bigrams matching the following patterns:
       - `Adj + Noun`: e.g. linear function
       - `Noun + Noun`: e.g. regression coefficient
    - Get frequency of each bigram (hint: you can use nltk.FreqDist)
    - Returns top K collocations by frequency

In [251]:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
# Define the function


def top_collocation(doc, K):
    """Return the K most frequent `Adj + Noun` / `Noun + Noun` bigrams in `doc`.

    Args:
        doc: the text to analyse.
        K: how many collocations to return.

    Returns:
        A list of ((first_word, second_word), count) pairs, as produced by
        nltk.FreqDist.most_common(K).
    """
    # Tokens may contain inner hyphens, apostrophes and commas; the pattern
    # needs a word character at both ends, so every token has >= 2 characters.
    words = nltk.regexp_tokenize(doc, r'\w[\w\',-]*\w')

    # Attach a POS tag to every token
    tagged_words = nltk.pos_tag(words)

    # Walk over adjacent (word, tag) pairs and keep those whose second tag
    # starts with 'NN' and whose first tag starts with 'JJ' or 'NN'.
    collocations = []
    for (first_word, first_tag), (second_word, second_tag) in nltk.bigrams(tagged_words):
        if not second_tag.startswith('NN'):
            continue
        if first_tag.startswith('JJ') or first_tag.startswith('NN'):
            collocations.append((first_word, second_word))

    # Count the surviving bigrams and report the K most common
    return nltk.FreqDist(collocations).most_common(K)

In [252]:
# Load the QA dataset. The context manager closes the file handle as soon as
# parsing finishes, and the encoding is explicit because JSON text is UTF-8
# while the platform default encoding may not be.
with open("qa.json", "r", encoding="utf-8") as qa_file:
    data = json.load(qa_file)
article = data["context"]

top_collocation(article, 10)

[(('public', 'health'), 14),
 (('community', 'spread'), 9),
 (('United', 'States'), 8),
 (('higher', 'risk'), 4),
 (('COVID-19', 'illness'), 4),
 (('elevated', 'risk'), 4),
 (('new', 'coronavirus'), 3),
 (('health', 'threat'), 3),
 (('serious', 'COVID-19'), 3),
 (('new', 'virus'), 3)]

## Q3: Question and Answering (QA) System (5 points)

Develop a QA system which allows you to search for answers in an article. For example, the file `qa.json` contains a research article. This article can answer a number of questions about COVID-19. You will design a solution to automatically search answers to these questions in this article.

`qa.json` is taken from https://github.com/deepset-ai/COVID-QA. This file contains a few questions, and answers to these questions have been located in the article. Let's define a QA system and check if your system can locate the right answers.

The following script helps you understand `qa.json`:

In [253]:
# Retrieve the article
# The context manager closes the file handle as soon as parsing finishes, and
# the encoding is explicit because JSON text is UTF-8 while the platform
# default encoding may not be.
with open("qa.json", "r", encoding="utf-8") as qa_file:
    data = json.load(qa_file)
article = data["context"]

# A long article. Just print the first 200 characters
print(article[0:200])

CDC Summary 21 MAR 2020,
https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/summary.html

This is a rapidly evolving situation and CDC will provide updated information and guidance as it becomes 


In [254]:
# All question/answer records that ship with the article
qas = data["qas"]

# Peek at the first record; each answer carries its text plus an
# 'answer_start' character offset (6117 for this one)
print(qas[0])

# Keep only the question strings, in their original order
qs = [qa_record["question"] for qa_record in qas]
qs

{'question': 'What age group has the highest rate of severe outcomes?', 'id': 236, 'answers': [{'text': 'people 85 years and older', 'answer_start': 6117}], 'is_impossible': False}


['What age group has the highest rate of severe outcomes?',
 'How is COVID-19 spread?',
 'How many states in the U.S. have reported cases of COVID-19?',
 'When did the White House launch the "15 Days to Slow the Spread" program?',
 'What should mildly-ill patients do?',
 'What type of virus is SARS-CoV-2?',
 'What viruses are similar to the COVID-19 coronavirus?',
 'What are the phases of a pandemic?',
 'At which phase does the peak of the pandemic occur?',
 'People with which medical conditions have a higher rate of severe illness?',
 'What kind of test can diagnose COVID-19?',
 'In what species did the COVID-19 virus likely originate?',
 'What risk factors should be considered in addition to clinical symptoms?']

Next, follow the instructions below step by step to develop the QA system.

### Q3.1. Tokenizer

Define a function `tokenize(doc)`  as follows:
   - Take a piece of text (i.e. variable `doc`) as an input
   - Split the input text into unigrams
   - Clean up tokens as follows:
       - Lemmatize all unigrams
       - Remove all stop words
       - Remove all punctuation
       - Convert all unigrams to the lower case 
       - remove empty unigrams
   - Return the list of unigrams after all the processing. (Hint: you can use spacy package for this task. To test if a token is stop word or punctuation, check https://spacy.io/api/token#attributes)

In [255]:
# Define the function

# Loaded spaCy pipelines, keyed by model name, so each model is read from
# disk only once per kernel session.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name='en_core_web_sm'):
    """Return the spaCy pipeline `name`, loading it only on first use."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def tokenize(doc):
    """Split `doc` into cleaned, lower-cased, lemmatized unigrams.

    Processing applied to every spaCy token:
      1. take its lemma,
      2. drop it when the lemma is a stop word or punctuation (checked on the
         vocabulary entry of the lemma),
      3. lower-case it and strip surrounding whitespace,
      4. drop it if nothing is left.

    Args:
        doc: the text to tokenize.

    Returns:
        A list of unigram strings in document order.
    """
    # Reuse the cached pipeline: loading the model on every call is by far the
    # most expensive step, and this function is called once per document.
    sp = _get_spacy_pipeline()

    unigrams = []
    for token in sp(doc):
        lemma = token.lemma_
        lexeme = sp.vocab[lemma]
        # Stop-word / punctuation tests are made on the lemma, not the surface form
        if lexeme.is_stop or lexeme.is_punct:
            continue
        cleaned = lemma.lower().strip()
        if cleaned != '':
            unigrams.append(cleaned)

    return unigrams

In [256]:
# Sample sentence used to exercise tokenize(); the backslashes continue a
# single string literal across source lines.
doc = 'Older people and people of all ages with severe chronic medical conditions — \
like heart disease, lung disease and diabetes, \
for example — seem to be at higher risk of developing serious COVID-19 illness.'

print(tokenize(doc))

['old', 'people', 'people', 'age', 'severe', 'chronic', 'medical', 'condition', 'like', 'heart', 'disease', 'lung', 'disease', 'diabete', 'example', 'high', 'risk', 'develop', 'covid-19', 'illness']


### Q3.2. Compute TF-IDF Matrix

Define a function `compute_tfidf(docs)` as follows: 

- Take `docs`, a list of documents (e.g. a list of questions) as an input
- Tokenize each document in `docs` using the `tokenize` function defined in Q3.1. 
- Calculate tf_idf weights as shown in lecture notes (Hint: you can reuse the last code segment in NLP Lecture Notes (II))
- Return a smoothed normalized `tf_idf` array. (The result may differ a little bit depending on the tokenize function and packages you use.)

In [257]:
# Define the function

def compute_tfidf(docs):
    """Compute a smoothed, L2-normalized TF-IDF matrix for `docs`.

    Each document is tokenized with `tokenize`; term frequency is the token
    count divided by the document's total token count, and the inverse
    document frequency is smoothed as log((N + 1) / (df + 1)) + 1.

    Args:
        docs: a list of documents (strings).

    Returns:
        A 2-D numpy array with exactly one row per input document (in input
        order) and one column per distinct token. A document with no
        surviving tokens yields an all-zero row. When no document yields any
        token the array has zero columns.
    """
    # Token counts per document, keyed by the document's position
    docs_tokens = {idx: nltk.FreqDist(tokenize(doc)) for idx, doc in enumerate(docs)}
    dtm = pd.DataFrame.from_dict(docs_tokens, orient="index")

    # Force one row per input document, in input order. Documents without any
    # token would otherwise not be guaranteed a row, which would shift every
    # following row relative to `docs`.
    dtm = dtm.reindex(range(len(docs))).fillna(0)

    tf = dtm.values.astype(float)
    if tf.size == 0:
        # Nothing to weight (no documents or no tokens at all)
        return tf

    # Term frequency; rows whose length is 0 stay all-zero instead of 0/0 = NaN
    doc_len = tf.sum(axis=1, keepdims=True)
    tf = np.divide(tf, doc_len, out=np.zeros_like(tf), where=doc_len > 0)

    # Document frequency: in how many documents each token occurs
    df = np.where(tf > 0, 1, 0)

    # Smoothed inverse document frequency
    smoothed_idf = np.log(np.divide(len(docs) + 1, np.sum(df, axis=0) + 1)) + 1

    # TF-IDF with every row scaled to unit L2 norm
    smoothed_tf_idf = normalize(tf * smoothed_idf)
    return smoothed_tf_idf

In [258]:
# Test the function using three questions
# NOTE: set_printoptions changes numpy's display precision globally, i.e. for
# every later cell as well.

np.set_printoptions(precision=2)
compute_tfidf(qs[0:3])

array([[0.41, 0.41, 0.41, 0.41, 0.41, 0.41, 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.61, 0.8 , 0.  , 0.  , 0.  ,
        0.  ],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.36, 0.  , 0.47, 0.47, 0.47,
        0.47]])

### Q3.3. Put Everything Together

Define a function `find_solutions(qs, article)` as follows: 

- Take two inputs:
    - `qs`: a list of questions (i.e. strings)
    - `article`: a document which may contain answers to the questions
- Segment the article into sentences (i.e. `sents`). You will locate the sentence which can answer a question.
- Concatenate the questions (`qs`) and sentences (`sents`) into a single list (i.e. `qs + sents`)
- Call the function `compute_tfidf` defined in Q3.2 with `qs + sents` to get a `TF-IDF` matrix. (Note, now `qs` and `sents` are converted to TF-IDF vectors in the same dimension. As a result, you can measure their similarities.) 
- Split the `TF-IDF` matrix into two sub matrices, one corresponding to `qs` and the other for `sents`. 
- Next, calculate the pairwise cosine similarity between the `qs` and `sents`. With $m$ questions and $n$ sentences, you should get a $m \times n$ matrix. (hint: you can use `sklearn.metrics.pairwise_distances` to calculate pairwise distances between two matrices)
- Finally, the answer to each question is the sentence which has the `maximum similarity` to it. 
- Print out each question and its matched answer. Check if your QA system is able to find the right answer.(Depending on the packages you use, your answer might be a bit different from mine.)

In [261]:
# Define the function
# from sklearn.metrics.pairwise import cosine_similarity

def find_solutions(qs, article):
    """Match every question with the most similar sentence(s) of `article`.

    The article is split into sentences with spaCy, questions and sentences
    are embedded together with `compute_tfidf`, and each question is answered
    by the sentence(s) with the highest cosine similarity to it.

    Args:
        qs: a list of question strings.
        article: the document that may contain the answers.

    Returns:
        A list with one ['Question: ...', 'Answers: ...'] pair per question.
        Sentences tied for the best similarity are joined with ','. When a
        question shares nothing with any sentence (best similarity is 0), or
        the article has no usable sentence, the answer text is empty.
    """
    if len(qs) == 0:
        return []

    # Segment the article into sentences with spaCy
    sp = spacy.load('en_core_web_sm')
    parsed_article = sp(article)

    # Replace new lines inside sentences (strip() only handles the two ends)
    all_sentences = [sent.text.replace('\n', ' ') for sent in parsed_article.sents]

    # Drop blank sentences and single-word sentences, which carry too little
    # information to be a meaningful answer
    sentences = [s for s in all_sentences if not re.search(r'^\s*\S+\s*$|^\s*$', s)]
    if not sentences:
        return [['Question: ' + question, 'Answers: '] for question in qs]

    # Questions and sentences share one vocabulary, hence one TF-IDF matrix;
    # the first len(qs) rows are the questions, the rest are the sentences
    tf_idf_mat = compute_tfidf(qs + sentences)
    question_vecs, sentence_vecs = np.split(tf_idf_mat, [len(qs)])

    # Cosine similarity = 1 - cosine distance, shape (len(qs), len(sentences))
    pwcs = 1 - pairwise_distances(question_vecs, sentence_vecs, metric='cosine')

    results = []
    for question, similarities in zip(qs, pwcs):
        best = np.amax(similarities)
        if best > 0:
            # Combine every sentence tied for the maximum similarity
            matches = [sentences[i] for i in np.flatnonzero(similarities == best)]
        else:
            # A best score of 0 means *every* sentence ties; returning all of
            # them would dump the whole article, so report no answer instead
            matches = []
        results.append(['Question: ' + question, 'Answers: ' + ','.join(matches)])

    return results

In [262]:
# Run the QA system on all questions; `qs` and `article` come from earlier cells.
find_solutions(qs, article)

[['Question: What age group has the highest rate of severe outcomes?',
  'Answers: A CDC Morbidity & Mortality Weekly Report that looked at severity of disease among COVID-19 cases in the United States by age group found that 80% of deaths were among adults 65 years and older with the highest percentage of severe outcomes occurring in people 85 years and older.'],
 ['Question: How is COVID-19 spread?',
  'Answers: More than half of the U.S. is seeing some level of community spread of COVID-19.'],
 ['Question: How many states in the U.S. have reported cases of COVID-19?',
  'Answers: All 50 states have reported cases of COVID-19 to CDC.'],
 ['Question: When did the White House launch the "15 Days to Slow the Spread" program?',
  'Answers: On March 16, the White House announced a program called “15 Days to Slow the Spread,”pdf iconexternal icon which is a nationwide effort to slow the spread of COVID-19 through the implementation of social distancing at all levels of society.'],
 ['Quest