In [None]:
"""
Jupyter Notebook file which was born out of necessity because the review
text lacks punctuation, which is needed to extract accurate aspects. The
script does some manual restoration first, then sends all the review text
to a web service which feeds the unpunctuated text to a punctuator model,
which returns the text with punctuation restored with high accuracy. The
punctuator uses a bidirectional recurrent neural network model with an
attention mechanism for restoring the missing punctuation in unsegmented
text. The software is cited below.

@inproceedings{tilk2016,
  author    = {Ottokar Tilk and Tanel Alum{\"a}e},
  title     = {Bidirectional Recurrent Neural Network with Attention Mechanism for Punctuation Restoration},
  booktitle = {Interspeech 2016},
  year      = {2016}
}

"""

# Import Libraries
import os
import nltk
import pandas as pd
import pickle as pk
import time

# Whitespace tokenizer used by the manual apostrophe restoration below
wt = nltk.tokenize.WhitespaceTokenizer()

# Import processed sample data from '01 Fitting Dataset.ipynb'.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the
# file; only load pickles that were produced by this project.
with open("Stored Data/sample_reviews.pickle", "rb") as infile:
    data = pk.load(infile)

# Given unpunctuated text, restore missing punctuation
# and return punctuated text
def adjust_para(text):
    """Return ``text`` with apostrophes and punctuation restored.

    The input is converted to ``str`` and split on whitespace. Stand-alone
    contraction suffixes (``s``, ``t``, ``d``, ``ll``, ``m``, ``o``, ``re``,
    ``ve``, ``y``) are re-attached to the preceding word with an apostrophe,
    because the punctuator model doesn't work accurately on apostrophes.
    The result is then POSTed as the form field ``text`` to the punctuator
    web service.

    The request is made with ``urllib`` instead of shelling out to ``curl``,
    so the review text never passes through a shell (quotes, ``$`` or
    backticks cannot break the command) and is properly URL-encoded
    (``&``, ``%`` and ``+`` in a review no longer corrupt the form field).

    Parameters
    ----------
    text : object
        Unpunctuated review text; anything accepted by ``str()``.

    Returns
    -------
    str
        The last line of the service's response body, or an empty string
        if the response body is empty.

    Raises
    ------
    urllib.error.URLError
        If the service cannot be reached or answers with an HTTP error
        status (``HTTPError`` is a subclass). Previously such failures were
        silently returned as if they were the punctuated review.
    """
    # Imported locally so this function does not depend on edits to the
    # notebook's import cell.
    from urllib.parse import urlencode
    from urllib.request import urlopen

    contraction_suffixes = {"s", "t", "d", "ll", "m", "o", "re", "ve", "y"}

    # the model doesn't work accurately on apostrophe so
    # that is restored manually
    tokens = [
        "'" + token if token in contraction_suffixes else token
        for token in wt.tokenize(str(text))
    ]

    # Glue every apostrophe-led token back onto the word before it
    text = " ".join(tokens).replace(" '", "'")

    # Form-encode the payload; passing ``data`` makes urlopen issue a POST
    payload = urlencode({"text": text}).encode("utf-8")
    with urlopen("http://bark.phon.ioc.ee/punctuator", data=payload) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        body = response.read().decode(charset)

    # Keep the original contract of returning only the last output line
    lines = body.splitlines()
    return lines[-1] if lines else ""

# Punctuate every review and stage the result in a temporary "Mod" column
punctuated_reviews = data["Reviews"].apply(adjust_para)
data["Mod"] = punctuated_reviews

# Swap the columns: drop the unpunctuated "Reviews" column and give its
# name to the punctuated "Mod" column (index labels are passed through
# str, as before)
data = data.drop(columns=["Reviews"]).rename(
    index=str, columns={"Mod": "Reviews"}
)

# Update the pickle file so that it now contains punctuated review text.
# The context manager flushes and closes the file even if pickling raises.
with open("Stored Data/sample_reviews.pickle", "wb") as outfile:
    pk.dump(data, outfile)

# Also write all of the reviews to a text file, one review per line, as
# this file will then be passed to the Java code which extracts the aspects.
# The context manager closes the file even if a write fails, and iterating
# the column directly avoids building a Series per row with iterrows().
with open("Stored Data/raw_review_input_file.txt", "w+") as input_file:
    input_file.writelines(
        "{}\n".format(fixed_text) for fixed_text in data["Reviews"]
    )