In [None]:
# !pip install nltk

In [None]:
import nltk
# nltk.download()

In [None]:
import json
from string import punctuation
import os
import re
import datetime
import math

In [None]:
def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of the tag are looked up, so inflected
    variants such as 'NNS', 'VBD', 'JJR' or 'RBS' map to the same category
    as their base tag.

    Parameters
    ----------
    penntag : str
        Penn Treebank tag.

    Returns
    -------
    str
        One of 'n', 'a', 'v' or 'r'. Falls back to 'n' when the tag prefix
        is not in the mapping or the input cannot be sliced/hashed.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: unmapped tag prefix.
        # TypeError: input that cannot be sliced or hashed (e.g. None).
        # Anything else is unexpected and is allowed to propagate.
        return 'n'


def lemmatize_sent(text):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``; each token is lower-cased
    and lemmatized with the module-level ``wnl`` lemmatizer, using the
    WordNet POS obtained from ``penn2morphy``.

    Parameters
    ----------
    text : list of str
        Tokens to lemmatize, passed to ``nltk.pos_tag`` as one sequence.

    Returns
    -------
    list of str
        One lemma per input token, in the original order.
    """
    lemmas = []
    for token, penn_tag in nltk.pos_tag(text):
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas



In [None]:
from google.colab import drive

# Mount Google Drive into the Colab runtime.
drive.mount('/content/drive')

# Folder that holds the notebook's input files. The trailing slash is kept so
# that `checkpoints + name` style concatenation yields a valid path.
checkpoints = '/content/drive/MyDrive/colab_files/'

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, without the race between a separate existence check and the creation.
os.makedirs(checkpoints, exist_ok=True)

In [None]:
def _strip_pdf_formatting(text):
    """Remove 'Page N' markers and newlines, then collapse runs of whitespace."""
    text = re.sub(r"Page\s[0-9]{1,}", "", text)  # page-number markers
    # NOTE(review): newlines are deleted rather than replaced by a space, so two
    # words separated only by a line break are fused. Kept to preserve output.
    text = re.sub(r"\n", "", text)
    # split()/join both strips the ends and collapses internal whitespace runs.
    return " ".join(text.split())


def _trim_transcript(text):
    """Keep the span from the Chair's first line up to the signature page."""
    # [A-Za-z] instead of [A-z]: the latter also matches '[', '\', ']', '^', '_', '`'.
    chair = re.search(r"(CHAIR.*[A-Za-z]\:|Chair.*[A-Za-z]\:)", text)
    if chair is not None:
        text = text[chair.start():]
    # The colon-less upper-case form also covers 'RESPECTFULLY SUBMITTED:'.
    signature = re.search(r"(Respectfully\sSUBMITTED\:|RESPECTFULLY\sSUBMITTED)", text)
    if signature is not None:
        text = text[:signature.start()]
    return text


def nv_preprocess(nv_json_path, trim=None):
    """
    Load a transcript JSON file and clean the raw PDF-export text.

    Every value of the loaded dictionary that is a string, or a list of
    strings, is cleaned: 'Page N' markers and newlines are removed and
    whitespace is collapsed. With ``trim`` set, each transcript is first cut
    down to the span starting at the Chair's first line and ending just
    before the "RESPECTFULLY SUBMITTED" signature page. A transcript missing
    one of those markers is left untrimmed on that side instead of raising.

    Parameters
    ----------
    nv_json_path : STRING
        Local path of nv_json generated by nv_pdftotext.
    trim: TRUE/Default(NONE)
        When truthy, trims each transcript to the spoken section.

    Returns
    -------
    dict
        The loaded dictionary with cleaned text. Values that are neither a
        string nor a list are left unchanged and reported with
        "Incompatible File".

    """
    # Context manager closes the file even if json.load raises.
    with open(nv_json_path, 'rb') as json_handle:
        data = json.load(json_handle)

    def clean(text):
        if trim:
            text = _trim_transcript(text)
        return _strip_pdf_formatting(text)

    for key, value in data.items():
        if isinstance(value, str):
            data[key] = clean(value)
        elif isinstance(value, list):
            data[key] = [clean(item) for item in value]
        else:
            print("Incompatible File")

    return data

In [None]:
# Transcript export to analyse; it lives in the Drive-backed `checkpoints` folder.
file_name = "nv_hhs_m_2021.json"

# trim=True cuts each transcript down to the spoken section before cleaning.
data = nv_preprocess(os.path.join(checkpoints, file_name), trim=True)

### At this point all documents from the same month are stored together under a single key. `json_split_by_date` splits them and saves them into a dictionary in which each entry corresponds to one hearing, keyed by the hearing date.

In [None]:
def json_split_by_date(json_file, years=(2020, 2021)):
    """
    Re-key month-grouped transcripts by the date found inside each transcript.

    Parameters
    ----------
    json_file : dict
        Cleaned dictionary of transcripts. Each key is a month number (1-12,
        anything accepted by ``int``); each value is a transcript string or a
        list of transcript strings for that month.
    years : iterable of int, optional
        Years accepted in the date stamp. Defaults to (2020, 2021).

    Returns
    -------
    dict
        Maps ``datetime.date`` to transcript text. The date is the first
        "<Month> <day>, <year>" occurrence in the transcript, where <Month>
        is the month named by the key. Documents without such a date are
        reported and skipped. If two documents share a date, the later one
        replaces the earlier one.

    """
    month = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
             'August', 'September', 'October', 'November', 'December']
    year_pattern = '|'.join(str(year) for year in years)

    json_date = {}
    for key, docs in json_file.items():
        month_name = month[int(key) - 1]
        # The pattern depends only on the month, so build it once per key.
        rx = re.compile(r'{0}[ ]([1-9]|[12][0-9]|3[01])[,][ ]({1})'.format(month_name, year_pattern))

        # A single transcript may be stored as a bare string; iterating it
        # directly would walk over its characters.
        if isinstance(docs, str):
            docs = [docs]

        for doc in docs:
            match = rx.search(doc)
            if match is None:
                print("No '{0} <day>, <year>' date found; skipping document".format(month_name))
                continue
            date = datetime.datetime.strptime(match.group(), '%B %d, %Y').date()
            json_date[date] = doc
    return json_date

In [None]:
# Re-key the month-grouped transcripts so that each entry is one hearing.
# NOTE: this rebinds `data` from month keys to date keys, so the cell is not
# idempotent: running it a second time without re-running the nv_preprocess
# cell hands date keys to json_split_by_date, where int(key) fails.
data = json_split_by_date(data)
# Month-agnostic variant of the date pattern, kept for reference:
# match = re.search(r'(January|February|March|April|May|June|July|August|September|October|November|December)[ ]([1-9]|[12][0-9]|3[01])[,][ ](2020|2021)', doc)


### Preprocessing (adapted from Shujie's text_analysis code)

In [None]:
# Serialise every transcript with json.dumps before tokenising.
# NOTE(review): for a string, json.dumps wraps the text in double quotes and
# escapes special characters (a newline becomes a backslash followed by 'n'),
# which is what the "line breaks" step below looks for. Confirm this
# round-trip is intended rather than tokenising data[key] directly.
raw = {doc_key: json.dumps(doc) for doc_key, doc in data.items()}

# Break up the string into words and punctuation, lower-cased.
text = {
    doc_key: [word.lower() for word in nltk.word_tokenize(doc)]
    for doc_key, doc in raw.items()
}

# Stopwords are non-content words that primarily have only a grammatical function.
stopwords_en = set(nltk.corpus.stopwords.words('english'))
text_no_stopwords = {
    doc_key: [word for word in words if word not in stopwords_en]
    for doc_key, words in text.items()
}

# Remove the punctuation tokens. `punctuation` is a string, so this is a
# substring test: any token that is a substring of it is dropped.
text_no_stopwords_punc = {
    doc_key: [word for word in words if word not in punctuation]
    for doc_key, words in text_no_stopwords.items()
}

# Lemmatization (first pass). `wnl` is the lemmatizer read by lemmatize_sent.
wnl = nltk.stem.WordNetLemmatizer()
text_no_stopwords_punc = {
    doc_key: lemmatize_sent(words)
    for doc_key, words in text_no_stopwords_punc.items()
}

# Remove the line breaks: tokens starting with a literal backslash + 'n' lose
# that two-character prefix and are moved to the end of the list.
text_no_stopwords_punc_lb = {
    doc_key: [word for word in words if not word.startswith('\\n')] +
             [word[2:] for word in words if word.startswith('\\n')]
    for doc_key, words in text_no_stopwords_punc.items()
}

# Lemmatization (second pass): tokens whose prefix was just stripped went
# through the first pass with the prefix still attached, so lemmatize again.
text_no_stopwords_punc_lb_lemma = {
    doc_key: lemmatize_sent(words)
    for doc_key, words in text_no_stopwords_punc_lb.items()
}

# Drop modal verbs (Penn tag 'MD'). Each word is tagged on its own, as a
# one-word sequence; doing it once per distinct word instead of once per token
# gives the same result with far fewer tagger calls.
_vocabulary = {
    word
    for words in text_no_stopwords_punc_lb_lemma.values()
    for word in words
}
_modal_words = {word for word in _vocabulary if nltk.pos_tag([word])[0][1] == 'MD'}
text_no_stopwords_punc_lb_lemma_md = {
    doc_key: [word for word in words if word not in _modal_words]
    for doc_key, words in text_no_stopwords_punc_lb_lemma.items()
}

In [None]:
# Print the final token list of every document.
for tokens in text_no_stopwords_punc_lb_lemma_md.values():
    print(tokens)

### Term frequency (TF) 
##### Word frequency within each document, same as word counting

In [None]:
from nltk.probability import FreqDist

# One FreqDist of token counts per document, keyed like the token dictionary.
textdist = {
    doc_key: FreqDist(tokens)
    for doc_key, tokens in text_no_stopwords_punc_lb_lemma_md.items()
}

In [None]:
# Print every (term, count) pair, document by document.
for freq in textdist.values():
    for term, occurrences in freq.items():
        print(term, occurrences)

In [None]:
# Term frequency per document: occurrences of a term divided by the total
# number of terms in that document.
termdist = {}
for doc_key, counts in textdist.items():
    # Total tokens in the document (sum of counts), not the number of distinct terms.
    total_terms = sum(counts.values())
    # Work on a copy so the raw counts in `textdist` stay intact and this cell
    # can be re-run without dividing already-normalised values again.
    frequencies = counts.copy()
    for word, count in counts.items():
        frequencies[word] = count / total_terms
    termdist[doc_key] = frequencies

In [None]:
# Print every (term, term frequency) pair, document by document.
for frequencies in termdist.values():
    for term, tf_value in frequencies.items():
        print(term, tf_value)

### Inverse Document frequency
##### IDF = log(total number of documents / (1 + number of documents containing the word))

In [None]:
# Document frequency: in how many documents each term appears.
document_frequency = {}
for frequencies in termdist.values():
    for word in frequencies:
        document_frequency[word] = document_frequency.get(word, 0) + 1

# Inverse document frequency with +1 in the denominator.
doc_count = len(termdist)
idfdist = {
    word: math.log(doc_count / (n_docs + 1))
    for word, n_docs in document_frequency.items()
}

In [None]:
# Print every (term, IDF) pair.
for term in idfdist:
    print(term, idfdist[term])

### Term Frequency - Inverse Document Frequency
##### TF-IDF = TF*IDF

In [None]:
# TF-IDF per document: each term's value in `termdist` times its IDF.
tfidfdist = {}
for doc_key, frequencies in termdist.items():
    # Copy first: assigning into the same object would overwrite `termdist`
    # in place and multiply by the IDF again every time this cell is re-run.
    scores = frequencies.copy()
    for word, tf_value in frequencies.items():
        scores[word] = tf_value * idfdist[word]
    tfidfdist[doc_key] = scores

In [None]:
# Print every (term, TF-IDF) pair, document by document.
for scores in tfidfdist.values():
    for term, tfidf_value in scores.items():
        print(term, tfidf_value)

### Sort TF-IDF in each document
##### Words in each document are ordered from highest to lowest TF-IDF (TF-IDF = TF*IDF)

In [None]:
# Per document, the same (term, score) pairs ordered by descending score.
sort_dict = {
    doc_key: dict(sorted(scores.items(), key=lambda pair: pair[1], reverse=True))
    for doc_key, scores in tfidfdist.items()
}

In [None]:
# Print the ranked (term, score) pairs, document by document.
for ranked in sort_dict.values():
    for term, score in ranked.items():
        print(term, score)

In [None]:
# List the document keys that were ranked.
for doc_key in sort_dict:
    print(doc_key)